Diffstat (limited to 'storage/innobase/trx')
-rw-r--r--   storage/innobase/trx/trx0i_s.cc     1490
-rw-r--r--   storage/innobase/trx/trx0purge.cc   1297
-rw-r--r--   storage/innobase/trx/trx0rec.cc     2559
-rw-r--r--   storage/innobase/trx/trx0roll.cc     984
-rw-r--r--   storage/innobase/trx/trx0rseg.cc     768
-rw-r--r--   storage/innobase/trx/trx0sys.cc      339
-rw-r--r--   storage/innobase/trx/trx0trx.cc     2300
-rw-r--r--   storage/innobase/trx/trx0undo.cc    1401
8 files changed, 11138 insertions, 0 deletions
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000..d043c3d8
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1490 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#include "trx0i_s.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "trx0sys.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "sql_class.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then the 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
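The growth rule above can be followed with a small standalone sketch (illustrative only, not part of this file; it ignores the TRX_I_S_MEM_LIMIT cap that bounds allocation in practice): chunk 0 holds TABLE_CACHE_INITIAL_ROWSNUM rows and every later chunk holds half of the rows allocated so far, so capacity grows by roughly a factor of 1.5 per chunk.

#include <cstdint>
#include <cstdio>

int main()
{
  uint64_t total = 0;
  for (int i = 0; i < 39; i++) {            /* MEM_CHUNKS_IN_TABLE_CACHE */
    uint64_t chunk = i ? total / 2 : 1024;  /* TABLE_CACHE_INITIAL_ROWSNUM */
    total += chunk;
    std::printf("chunk %2d: %12llu rows, %12llu total\n", i,
                (unsigned long long) chunk, (unsigned long long) total);
  }
  return 0;
}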
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table. Duplicates may appear
+if this is enabled; also, searching the hash is then a no-op because
+it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
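As a worked example (assuming TRX_I_S_MEM_LIMIT is 16MiB, i.e. 16777216 bytes): if 1MiB is already held by row chunks (cache->mem_allocd) and the string storage holds another 0.5MiB, then MAX_ALLOWED_FOR_STORAGE(cache) permits up to 15MiB of further string storage, while MAX_ALLOWED_FOR_ALLOC(cache) permits at most 16MiB - 1MiB - 0.5MiB = 14.5MiB for the next row chunk.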
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+};
+
+/** This represents one table's cache. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_t {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ Atomic_relaxed<ulonglong> last_read;
+ /*!< last time the cache was read;
+ measured in nanoseconds */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys.mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ bool is_truncated; /*!< this is true if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/** @return the heap number of a record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
+{
+ return lock_get_type(lock) == LOCK_REC
+ ? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
+ : uint16_t{0xFFFF};
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ ut_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+ (rows_num == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+ /* Memory is increased by the formula
+ new = old + old / 2; We are trying not to be
+ aggressive here (= using the common new = old * 2)
+ because the allocated memory will not be freed
+ until InnoDB exit (it is reused). So it is better
+ to once allocate the memory in more steps, but
+ have less unused/wasted memory than to use less
+ steps in allocation (which is done once in a
+ lifetime) but end up with lots of unused/wasted
+ memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ got_bytes = req_bytes;
+ chunk->base = ut_malloc_nokey(req_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu,"
+ " row size=%lu,"
+ " req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_mode);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (!row->lock_index) {
+ /* table lock */
+ ut_ad(!row->lock_data);
+ ut_ad(row->lock_page == page_id_t(0, 0));
+ ut_ad(!row->lock_rec);
+ } else {
+ /* record lock */
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* s;
+
+ ut_ad(lock_mutex_own());
+
+ row->trx_id = trx_get_id_for_print(trx);
+ row->trx_started = trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ if (trx->lock.wait_lock != NULL) {
+
+ ut_a(requested_lock_row != NULL);
+ row->trx_wait_started = trx->lock.wait_started;
+ } else {
+ ut_a(requested_lock_row == NULL);
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions e.g., purge and transactions
+ being recovered at startup there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+ if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query,
+ sizeof query)) {
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ row->trx_query_cs = thd_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ row->trx_operation_state = trx->op_info;
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
+
+	/* These are protected by both trx->mutex and lock_sys.mutex,
+ or just lock_sys.mutex. For reading, it suffices to hold
+ lock_sys.mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_isolation_level = trx->isolation_level;
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking();
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated. Returns the number of bytes that were written to "buf"
+(including the terminating NUL).
+@return number of bytes written, including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ switch (heap_no) {
+ case PAGE_HEAP_NO_INFIMUM:
+ case PAGE_HEAP_NO_SUPREMUM:
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage,
+ heap_no == PAGE_HEAP_NO_INFIMUM
+ ? "infimum pseudo-record"
+ : "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ return(*lock_data != NULL);
+ }
+
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = reinterpret_cast<const page_t*>(buf_block_get_frame(block));
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ n_fields, &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (heap != NULL) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_locks_row_t object.
+If memory cannot be allocated then false is returned.
+@return false if allocation fails */
+static bool fill_locks_row(
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ uint16_t heap_no,/*!< in: lock's record number
+				or 0xFFFF if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock->trx->id;
+ const auto lock_type = lock_get_type(lock);
+ ut_ad(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
+
+ const bool is_gap_lock = lock_type == LOCK_REC
+ && (lock->type_mode & LOCK_GAP);
+ switch (lock->type_mode & LOCK_MODE_MASK) {
+ case LOCK_S:
+ row->lock_mode = uint8_t(1 + is_gap_lock);
+ break;
+ case LOCK_X:
+ row->lock_mode = uint8_t(3 + is_gap_lock);
+ break;
+ case LOCK_IS:
+ row->lock_mode = uint8_t(5 + is_gap_lock);
+ break;
+ case LOCK_IX:
+ row->lock_mode = uint8_t(7 + is_gap_lock);
+ break;
+ case LOCK_AUTO_INC:
+ row->lock_mode = 9;
+ break;
+ default:
+ ut_ad("unknown lock mode" == 0);
+ row->lock_mode = 0;
+ }
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock).m_name,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return false;
+ }
+
+ if (lock_type == LOCK_REC) {
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return false;
+ }
+
+ row->lock_page = lock->un_member.rec_lock.page_id;
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return false;
+ }
+ } else {
+ row->lock_index = NULL;
+
+ row->lock_page = page_id_t(0, 0);
+ row->lock_rec = 0;
+
+ row->lock_data = NULL;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return true;
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+ ret = ut_fold_ulint_pair((ulint) lock->trx->id,
+ lock->un_member.rec_lock.page_id.
+ fold());
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_page == lock->un_member.rec_lock.page_id
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that has a specified id.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ uint16_t heap_no)/*!< in: lock's record number
+				or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(lock_mutex_own());
+
+ /* If transaction is waiting we add the wait lock and all locks
+ from another transactions that are blocking the wait lock. */
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->lock.wait_lock != NULL);
+
+ uint16_t wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->lock.wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->lock.wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
+ ULINT_UNDEFINED);
+
+ for (curr_lock = lock_queue_iterator_get_prev(&iter);
+ curr_lock != NULL;
+ curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+ if (lock_has_to_wait(trx->lock.wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->lock.wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the wait and waited
+ locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in nanoseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return whether the cache can be updated */
+static bool can_cache_be_updated(trx_i_s_cache_t* cache)
+{
+ /* cache->last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS;
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ cache->locks_hash.clear();
+
+ ha_storage_empty(&cache->storage);
+}
+
+
+/**
+ Add transactions to innodb_trx's cache.
+
+ We also add all locks that are relevant to each transaction into
+ innodb_locks' and innodb_lock_waits' caches.
+*/
+
+static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
+{
+ i_s_locks_row_t *requested_lock_row;
+
+#ifdef UNIV_DEBUG
+ {
+ const auto state= trx->state;
+
+ if (trx->is_autocommit_non_locking())
+ {
+ ut_ad(trx->read_only);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->mysql_thd);
+ ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE);
+ }
+ else
+ ut_ad(state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED ||
+ state == TRX_STATE_PREPARED_RECOVERED ||
+ state == TRX_STATE_COMMITTED_IN_MEMORY);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row))
+ {
+ if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>(
+ table_cache_create_empty_row(&cache->innodb_trx, cache)))
+ {
+ if (fill_trx_row(trx_row, trx, requested_lock_row, cache))
+ return;
+ --cache->innodb_trx.rows_used;
+ }
+ }
+
+ /* memory could not be allocated */
+ cache->is_truncated= true;
+}
+
+
+/**
+ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+ table cache buffer. Cache must be locked for write.
+*/
+
+static void fetch_data_into_cache(trx_i_s_cache_t *cache)
+{
+ ut_ad(lock_mutex_own());
+ trx_i_s_cache_clear(cache);
+
+ /* Capture the state of transactions */
+ trx_sys.trx_list.for_each([cache](trx_t &trx) {
+ if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+ &trx != purge_sys.query->trx)
+ {
+ mutex_enter(&trx.mutex);
+ if (trx.state != TRX_STATE_NOT_STARTED)
+ fetch_data_into_cache_low(cache, &trx);
+ mutex_exit(&trx.mutex);
+ }
+ });
+ cache->is_truncated= false;
+}
+
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+
+ lock_mutex_enter();
+ fetch_data_into_cache(cache);
+ lock_mutex_exit();
+
+ /* update cache last read time */
+ cache->last_read = my_interval_timer();
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire lock mutex
+ release lock mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ release trx_i_s_cache_t::rw_lock */
+
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = false;
+}
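For orientation, here is a minimal sketch (a hypothetical helper, not part of this file) of how a caller such as handler/i_s.cc might drive the cache, following the latching order documented in trx_i_s_cache_init() above: refresh under the exclusive latch, then read rows under the shared latch.

static void example_read_innodb_trx(trx_i_s_cache_t* cache)
{
	/* writer phase: update the cache if it has been idle long enough */
	trx_i_s_cache_start_write(cache);
	trx_i_s_possibly_fetch_data_into_cache(cache);
	trx_i_s_cache_end_write(cache);

	/* reader phase: iterate over the snapshot that was just built */
	trx_i_s_cache_start_read(cache);
	for (ulint i = 0;
	     i < trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX); i++) {
		const i_s_trx_row_t* row = static_cast<const i_s_trx_row_t*>(
			trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i));
		(void) row;	/* convert into an INFORMATION_SCHEMA row */
	}
	trx_i_s_cache_end_read(cache);
}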
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ rw_lock_free(&cache->rw_lock);
+
+ cache->locks_hash.free();
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->last_read = my_interval_timer();
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects a INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ ut_ad(rw_lock_own_flagged(&cache->rw_lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ return &cache->innodb_trx;
+ case I_S_INNODB_LOCKS:
+ return &cache->innodb_locks;
+ case I_S_INNODB_LOCK_WAITS:
+ return &cache->innodb_lock_waits;
+ }
+
+ ut_error;
+ return NULL;
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_index) {
+ /* record lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT
+ ":%u:%u:%u",
+ row->lock_trx_id, row->lock_page.space(),
+ row->lock_page.page_no(), row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT":" UINT64PF,
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+ /* the typecast is safe because snprintf(3) never returns
+ negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
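From the two snprintf() formats above, a record-lock id has the shape "trx_id:space:page_no:heap_no" and a table-lock id the shape "trx_id:table_id". A minimal (hypothetical) call would be:

	char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
	trx_i_s_create_lock_id(row, lock_id, sizeof lock_id);
	/* e.g. "1563:5:3:2" for a record lock, "1563:742" for a table lock */

where row points at an i_s_locks_row_t obtained from the innodb_locks cache.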
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000..28491853
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+#include "fsp0fsp.h"
+#include "fut0fut.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0thread.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0sync.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include <mysql/service_wsrep.h>
+
+#include <unordered_map>
+
+/** Maximum allowable purge history length. <=0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+purge_sys_t purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+trx_undo_rec_t trx_purge_dummy_rec;
+
+#ifdef UNIV_DEBUG
+my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/** Sentinel value */
+static const TrxUndoRsegs NullElement;
+
+/** Default constructor */
+TrxUndoRsegsIterator::TrxUndoRsegsIterator()
+ : m_rsegs(NullElement), m_iter(m_rsegs.begin())
+{
+}
+
+/** Sets the next rseg to purge in purge_sys.
+Executed in the purge coordinator thread.
+@return whether anything is to be purged */
+inline bool TrxUndoRsegsIterator::set_next()
+{
+ mutex_enter(&purge_sys.pq_mutex);
+
+ /* Only purge consumes events from the priority queue, user
+ threads only produce the events. */
+
+ /* Check if there are more rsegs to process in the
+ current element. */
+ if (m_iter != m_rsegs.end()) {
+		/* We are still processing a rollback segment from
+		the same transaction, so the expected transaction
+		number should not increase. Undo the increment of
+		the expected commit number done by the caller, which
+		assumed that the rollback segments from the given
+		transaction were done. */
+ purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
+ } else if (!purge_sys.purge_queue.empty()) {
+ m_rsegs = purge_sys.purge_queue.top();
+ purge_sys.purge_queue.pop();
+ ut_ad(purge_sys.purge_queue.empty()
+ || purge_sys.purge_queue.top() != m_rsegs);
+ m_iter = m_rsegs.begin();
+ } else {
+ /* Queue is empty, reset iterator. */
+ purge_sys.rseg = NULL;
+ mutex_exit(&purge_sys.pq_mutex);
+ m_rsegs = NullElement;
+ m_iter = m_rsegs.begin();
+ return false;
+ }
+
+ purge_sys.rseg = *m_iter++;
+ mutex_exit(&purge_sys.pq_mutex);
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+ ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no);
+
+ /* We assume in purge of externally stored fields that space id is
+ in the range of UNDO tablespace space ids */
+ ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(purge_sys.rseg->space->id));
+
+ ut_a(purge_sys.tail.trx_no <= purge_sys.rseg->last_trx_no());
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no();
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ mutex_exit(&purge_sys.rseg->mutex);
+
+ return(true);
+}
+
+/** Build a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+purge_graph_build()
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ trx_t* trx = trx_create();
+ ut_ad(!trx->id);
+ trx->start_time = time(NULL);
+ trx->start_time_micro = microsecond_interval_timer();
+ trx->state = TRX_STATE_ACTIVE;
+ trx->op_info = "purge trx";
+
+ mem_heap_t* heap = mem_heap_create(512);
+ que_fork_t* fork = que_fork_create(
+ NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = trx;
+
+ for (auto i = innodb_purge_threads_MAX; i; i--) {
+ que_thr_t* thr = que_thr_create(fork, heap, NULL);
+ thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t)))
+ purge_node_t(thr);
+ }
+
+ return(fork);
+}
+
+/** Initialise the purge system. */
+void purge_sys_t::create()
+{
+ ut_ad(this == &purge_sys);
+ ut_ad(!heap);
+ ut_ad(!enabled());
+ m_paused= 0;
+ query= purge_graph_build();
+ next_stored= false;
+ rseg= NULL;
+ page_no= 0;
+ offset= 0;
+ hdr_page_no= 0;
+ hdr_offset= 0;
+ rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH);
+ mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex);
+ truncate.current= NULL;
+ truncate.last= NULL;
+ heap= mem_heap_create(4096);
+}
+
+/** Close the purge subsystem on shutdown. */
+void purge_sys_t::close()
+{
+ ut_ad(this == &purge_sys);
+ if (!heap)
+ return;
+
+ ut_ad(!enabled());
+ trx_t* trx = query->trx;
+ que_graph_free(query);
+ ut_ad(!trx->id);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ trx->state= TRX_STATE_NOT_STARTED;
+ trx->free();
+ rw_lock_free(&latch);
+ mutex_free(&pq_mutex);
+ mem_heap_free(heap);
+ heap= nullptr;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
+{
+ DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
+ trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
+ ut_ad(undo == trx->rsegs.m_redo.undo);
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ buf_block_t* undo_page = trx_undo_set_state_at_finish(
+ undo, mtr);
+ trx_ulogf_t* undo_header = undo_page->frame + undo->hdr_offset;
+
+ ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ /* This database must have been upgraded from
+ before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ /* The undo log segment will not be reused */
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE
+ + TRX_RSEG
+ + rseg_header->frame);
+
+ ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST
+ + undo_page->frame));
+
+ mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame,
+ hist_size + undo->size);
+ mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_header->frame,
+ trx_sys.get_max_trx_id());
+ }
+
+ /* After the purge thread has been given permission to exit,
+ we may roll back transactions (trx->undo_no==0)
+ in THD::cleanup() invoked from unlink_thd() in fast shutdown,
+ or in trx_rollback_recovered() in slow shutdown.
+
+ Before any transaction-generating background threads or the
+ purge have been started, we can
+ start transactions in row_merge_drop_temp_indexes() and
+ fts_drop_orphaned_tables(), and roll back recovered transactions.
+
+ Arbitrary user transactions may be executed when all the undo log
+ related background processes (including purge) are disabled due to
+ innodb_force_recovery=2 or innodb_force_recovery=3.
+ DROP TABLE may be executed at any innodb_force_recovery level.
+
+ During fast shutdown, we may also continue to execute
+ user transactions. */
+ ut_ad(srv_undo_sources
+ || trx->undo_no == 0
+ || (!purge_sys.enabled()
+ && (srv_is_being_started
+ || trx_rollback_is_active
+ || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+ || ((trx->mysql_thd || trx->internal)
+ && srv_fast_shutdown));
+
+#ifdef WITH_WSREP
+ if (wsrep_is_wsrep_xid(trx->xid)) {
+ trx_rseg_update_wsrep_checkpoint(rseg_header, trx->xid, mtr);
+ }
+#endif
+
+ if (trx->mysql_log_file_name && *trx->mysql_log_file_name) {
+ /* Update the latest MySQL binlog name and offset info
+ in rollback segment header if MySQL binlogging is on
+		or the database server is a MySQL replication slave. */
+ trx_rseg_update_binlog_offset(rseg_header, trx, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+ flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ static_cast<uint16_t>(undo->hdr_offset
+ + TRX_UNDO_HISTORY_NODE), mtr);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
+ undo_header + TRX_UNDO_TRX_NO,
+ trx->rw_trx_hash_element->no);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header
+ + TRX_UNDO_NEEDS_PURGE, 1U);
+
+ if (rseg->last_page_no == FIL_NULL) {
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->set_last_commit(undo->hdr_offset,
+ trx->rw_trx_hash_element->no);
+ rseg->needs_purge = true;
+ }
+
+ trx_sys.rseg_history_len++;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+ ut_free(undo);
+ }
+
+ undo = NULL;
+}
+
+/** Remove undo log header from the history list.
+@param[in,out] rseg rollback segment header page
+@param[in] log undo log segment header page
+@param[in] offset byte offset in the undo log segment header page
+@param[in,out] mtr mini-transaction */
+static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+ uint16_t offset, mtr_t *mtr)
+{
+ flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY,
+ log, static_cast<uint16_t>(offset + TRX_UNDO_HISTORY_NODE), mtr);
+ trx_sys.rseg_history_len--;
+}
+
+/** Free an undo log segment, and remove the header from the history list.
+@param[in,out] rseg rollback segment
+@param[in] hdr_addr file address of log_hdr */
+static
+void
+trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
+{
+ mtr_t mtr;
+
+ mtr.start();
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+
+ /* Mark the last undo log totally purged, so that if the
+ system crashes, the tail of the undo log will not get accessed
+ again. The list of pages in the undo log tail gets
+ inconsistent during the freeing of the segment, and therefore
+ purge should not try to access them again. */
+ mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE, 0U);
+
+ while (!fseg_free_step_not_header(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr)) {
+ mutex_exit(&rseg->mutex);
+
+ mtr.commit();
+ mtr.start();
+
+ mutex_enter(&rseg->mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ const uint32_t seg_size = flst_get_len(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
+
+ do {
+
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.cc. */
+
+ } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr));
+
+ byte* hist = TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame;
+ ut_ad(mach_read_from_4(hist) >= seg_size);
+
+ mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/** Remove unnecessary history data from a rollback segment.
+@param[in,out] rseg rollback segment
+@param[in] limit truncate anything before this */
+static
+void
+trx_purge_truncate_rseg_history(
+ trx_rseg_t& rseg,
+ const purge_sys_t::iterator& limit)
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ mtr_t mtr;
+ trx_id_t undo_trx_no;
+
+ mtr.start();
+ ut_ad(rseg.is_persistent());
+ mutex_enter(&rseg.mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
+ hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+func_exit:
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ return;
+ }
+
+ buf_block_t* block = trx_undo_page_get(page_id_t(rseg.space->id,
+ hdr_addr.page),
+ &mtr);
+ undo_trx_no = mach_read_from_8(block->frame + hdr_addr.boffset
+ + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit.trx_no) {
+ if (undo_trx_no == limit.trx_no) {
+ trx_undo_truncate_start(
+ &rseg, hdr_addr.page,
+ hdr_addr.boffset, limit.undo_no);
+ }
+
+ goto func_exit;
+ }
+
+ prev_hdr_addr = flst_get_prev_addr(block->frame + hdr_addr.boffset
+ + TRX_UNDO_HISTORY_NODE);
+ prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+ if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame)
+ == TRX_UNDO_TO_PURGE
+ && !mach_read_from_2(block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEXT_LOG)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+
+ /* calls the trx_purge_remove_log_hdr()
+ inside trx_purge_free_segment(). */
+ trx_purge_free_segment(&rseg, hdr_addr);
+ } else {
+ /* Remove the log hdr from the rseg history. */
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset,
+ &mtr);
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ }
+
+ mtr.start();
+ mutex_enter(&rseg.mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/** Cleanse the purge queue to remove any rsegs that reside in the undo
+tablespace marked for truncation.
+@param[in] space undo tablespace being truncated */
+static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+{
+ typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
+ purge_elem_list_t purge_elem_list;
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+	/* Drain the purge queue before we start truncating the
+	corresponding undo tablespace. */
+ while (!purge_sys.purge_queue.empty()) {
+ purge_elem_list.push_back(purge_sys.purge_queue.top());
+ purge_sys.purge_queue.pop();
+ }
+
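+	/* Put the elements back into the queue, erasing any rseg that
+	belongs to the tablespace being truncated and dropping elements
+	that became empty. */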
+ for (purge_elem_list_t::iterator it = purge_elem_list.begin();
+ it != purge_elem_list.end();
+ ++it) {
+
+ for (TrxUndoRsegs::iterator it2 = it->begin();
+ it2 != it->end();
+ ++it2) {
+ if ((*it2)->space == &space) {
+ it->erase(it2);
+ break;
+ }
+ }
+
+ if (!it->empty()) {
+ purge_sys.purge_queue.push(*it);
+ }
+ }
+
+ mutex_exit(&purge_sys.pq_mutex);
+}
+
+/**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages!
+*/
+static void trx_purge_truncate_history()
+{
+ ut_ad(purge_sys.head <= purge_sys.tail);
+ purge_sys_t::iterator& head = purge_sys.head.trx_no
+ ? purge_sys.head : purge_sys.tail;
+
+ if (head.trx_no >= purge_sys.low_limit_no()) {
+ /* This is sometimes necessary. TODO: find out why. */
+ head.trx_no = purge_sys.low_limit_no();
+ head.undo_no = 0;
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->id == i);
+ trx_purge_truncate_rseg_history(*rseg, head);
+ }
+ }
+
+ if (srv_undo_tablespaces_active < 2) {
+ return;
+ }
+
+ while (srv_undo_log_truncate) {
+ if (!purge_sys.truncate.current) {
+ const ulint threshold = ulint(srv_max_undo_log_size
+ >> srv_page_size_shift);
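+			/* Scan the undo tablespaces in round-robin
+			order, starting from the one that was truncated
+			most recently (or from the first one), and pick
+			the first tablespace whose size exceeds the
+			threshold. */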
+ for (ulint i = purge_sys.truncate.last
+ ? purge_sys.truncate.last->id
+ - srv_undo_space_id_start
+ : 0, j = i;; ) {
+ ulint space_id = srv_undo_space_id_start + i;
+ ut_ad(srv_is_undo_tablespace(space_id));
+ fil_space_t* space= fil_space_get(space_id);
+
+ if (space && space->get_size() > threshold) {
+ purge_sys.truncate.current = space;
+ break;
+ }
+
+ ++i;
+ i %= srv_undo_tablespaces_active;
+ if (i == j) {
+ break;
+ }
+ }
+ }
+
+ if (!purge_sys.truncate.current) {
+ return;
+ }
+
+ fil_space_t& space = *purge_sys.truncate.current;
+		/* An undo tablespace always consists of a single file. */
+ ut_a(UT_LIST_GET_LEN(space.chain) == 1);
+ fil_node_t* file = UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+
+ DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ /* Once set, this rseg will
+ not be allocated to subsequent
+ transactions, but we will wait
+ for existing active
+ transactions to finish. */
+ rseg->skip_allocation = true;
+ }
+ }
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->skip_allocation);
+ if (rseg->trx_ref_count) {
+not_free:
+ mutex_exit(&rseg->mutex);
+ return;
+ }
+
+ if (rseg->curr_size != 1) {
+ /* Check if all segments are
+ cached and safe to remove. */
+ ulint cached = 0;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo;
+ undo = UT_LIST_GET_NEXT(undo_list,
+ undo)) {
+ if (head.trx_no < undo->trx_id) {
+ goto not_free;
+ } else {
+ cached += undo->size;
+ }
+ }
+
+ ut_ad(rseg->curr_size > cached);
+
+ if (rseg->curr_size > cached + 1) {
+ goto not_free;
+ }
+ }
+
+ mutex_exit(&rseg->mutex);
+ }
+
+ ib::info() << "Truncating " << file->name;
+ trx_purge_cleanse_purge_queue(space);
+
+ /* Flush all to-be-discarded pages of the tablespace.
+
+ During truncation, we do not want any writes to the
+ to-be-discarded area, because we must set the space.size
+ early in order to have deterministic page allocation.
+
+ If a log checkpoint was completed at LSN earlier than our
+ mini-transaction commit and the server was killed, then
+ discarding the to-be-trimmed pages without flushing would
+ break crash recovery. So, we cannot avoid the write. */
+ while (buf_flush_list_space(&space));
+
+ log_free_check();
+
+ /* Adjust the tablespace metadata. */
+ if (!fil_truncate_prepare(space.id)) {
+ ib::error() << "Failed to find UNDO tablespace "
+ << file->name;
+ return;
+ }
+
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ mtr_t mtr;
+ const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+ mtr.start();
+ mtr_x_lock_space(purge_sys.truncate.current, &mtr);
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(purge_sys.truncate.current);
+ mtr.trim_pages(page_id_t(space.id, size));
+ fsp_header_init(purge_sys.truncate.current, size, &mtr);
+ mutex_enter(&fil_system.mutex);
+ purge_sys.truncate.current->size = file->size = size;
+ mutex_exit(&fil_system.mutex);
+
+ buf_block_t* sys_header = trx_sysf_get(&mtr);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+ ut_d(const ulint old_page = rseg->page_no);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ purge_sys.truncate.current,
+ rseg->id, sys_header, &mtr);
+ ut_ad(rblock);
+ rseg->page_no = rblock
+ ? rblock->page.id().page_no() : FIL_NULL;
+ ut_ad(old_page == rseg->page_no);
+
+ /* Before re-initialization ensure that we
+ free the existing structure. There can't be
+ any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ trx_undo_t* next_undo;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo; undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ ut_free(undo);
+ }
+
+ UT_LIST_INIT(rseg->undo_list,
+ &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached,
+ &trx_undo_t::undo_list);
+
+ /* These were written by trx_rseg_header_create(). */
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rblock->frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rblock->frame));
+
+ /* Initialize the undo log lists according to
+ the rseg header */
+ rseg->curr_size = 1;
+ rseg->trx_ref_count = 0;
+ rseg->last_page_no = FIL_NULL;
+ rseg->last_commit_and_offset = 0;
+ rseg->needs_purge = false;
+ }
+
+ mtr.commit();
+ /* Write-ahead the redo log record. */
+ log_write_up_to(mtr.commit_lsn(), true);
+
+ /* Trim the file size. */
+ os_file_truncate(file->name, file->handle,
+ os_offset_t(size) << srv_page_size_shift,
+ true);
+
+ /* This is only executed by srv_purge_coordinator_thread. */
+ export_vars.innodb_undo_truncations++;
+
+ /* In MDEV-8319 (10.5) we will PUNCH_HOLE the garbage
+ (with write-ahead logging). */
+ mutex_enter(&fil_system.mutex);
+ ut_ad(&space == purge_sys.truncate.current);
+ ut_ad(space.is_being_truncated);
+ purge_sys.truncate.current->set_stopping(false);
+ purge_sys.truncate.current->is_being_truncated = false;
+ mutex_exit(&fil_system.mutex);
+
+ if (purge_sys.rseg != NULL
+ && purge_sys.rseg->last_page_no == FIL_NULL) {
+			/* If purge_sys.rseg points to a rseg that was
+			just truncated, move on to the next rseg.
+			Note: ideally purge_sys.rseg should be NULL,
+			because purge should have processed all the
+			records, but srv_purge_batch_size can force the
+			purge loop to exit before all records have been
+			purged; in that case purge_sys.rseg may still
+			point to a valid rseg waiting for the next
+			purge cycle. */
+ purge_sys.next_stored = false;
+ purge_sys.rseg = NULL;
+ }
+
+ DBUG_EXECUTE_IF("ib_undo_trunc",
+ ib::info() << "ib_undo_trunc";
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ rseg->skip_allocation = false;
+ }
+ }
+ }
+
+ ib::info() << "Truncated " << file->name;
+ purge_sys.truncate.last = purge_sys.truncate.current;
+ purge_sys.truncate.current = NULL;
+ }
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys.tail past the purged log. */
+static void trx_purge_rseg_get_next_history_log(
+ ulint* n_pages_handled)/*!< in/out: number of UNDO pages
+ handled */
+{
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ mtr_t mtr;
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no() + 1;
+ purge_sys.tail.undo_no = 0;
+ purge_sys.next_stored = false;
+
+ mtr.start();
+
+ const buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id,
+ purge_sys.rseg->last_page_no), &mtr);
+
+ const trx_ulogf_t* log_hdr = undo_page->frame
+ + purge_sys.rseg->last_offset();
+
+ /* Increase the purge page count by one for every handled log */
+
+ (*n_pages_handled)++;
+
+ prev_log_addr = flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+
+ const bool empty = prev_log_addr.page == FIL_NULL;
+
+ if (empty) {
+ /* No logs left in the history list */
+ purge_sys.rseg->last_page_no = FIL_NULL;
+ }
+
+ mutex_exit(&purge_sys.rseg->mutex);
+ mtr.commit();
+
+ if (empty) {
+ return;
+ }
+
+ /* Read the previous log header. */
+ mtr.start();
+
+ log_hdr = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
+ &mtr)->frame
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ purge_sys.rseg->last_page_no = prev_log_addr.page;
+ purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+ purge_sys.rseg->needs_purge = log_hdr[TRX_UNDO_NEEDS_PURGE + 1] != 0;
+
+	/* Purge can also produce events, but these are already ordered
+	in the rollback segment, and any user-generated event will be
+	greater than the events that purge produces; i.e. purge can never
+	produce events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+ purge_sys.purge_queue.push(*purge_sys.rseg);
+
+ mutex_exit(&purge_sys.pq_mutex);
+
+ mutex_exit(&purge_sys.rseg->mutex);
+}
+
+/** Position the purge sys "iterator" on the undo record to use for purging. */
+static void trx_purge_read_undo_rec()
+{
+ uint16_t offset;
+ uint32_t page_no;
+ ib_uint64_t undo_no;
+
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ if (purge_sys.rseg->needs_purge) {
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* undo_page;
+ if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
+ *purge_sys.rseg->space, purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, RW_S_LATCH,
+ undo_page, &mtr)) {
+
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = undo_page->page.id().page_no();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ mtr.commit();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ purge_sys.offset = offset;
+ purge_sys.page_no = page_no;
+ purge_sys.tail.undo_no = undo_no;
+
+ purge_sys.next_stored = true;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ut_ad(!purge_sys.next_stored);
+
+ if (purge_sys.rseg_iter.set_next()) {
+ trx_purge_read_undo_rec();
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ ulint* n_pages_handled,/*!< in/out: number of UNDO pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ mtr_t mtr;
+
+ ut_ad(purge_sys.next_stored);
+ ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
+
+ const ulint space = purge_sys.rseg->space->id;
+ const uint32_t page_no = purge_sys.page_no;
+ const uint16_t offset = purge_sys.offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ buf_block_t* rec2_page = undo_page;
+
+ const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
+ undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
+
+ if (rec2 == NULL) {
+ rec2 = trx_undo_get_next_rec(rec2_page, offset,
+ purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, &mtr);
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ } else {
+ purge_sys.offset = page_offset(rec2);
+ purge_sys.page_no = rec2_page->page.id().page_no();
+ purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2);
+
+ if (undo_page != rec2_page) {
+ /* We advance to a new page of the undo log: */
+ (*n_pages_handled)++;
+ }
+ }
+
+ trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->frame + offset,
+ heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+static MY_ATTRIBUTE((warn_unused_result))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */
+ ulint* n_pages_handled,/*!< in/out: number of UNDO log pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ if (!purge_sys.next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys.next_stored) {
+ DBUG_PRINT("ib_purge",
+ ("no logs left in the history list"));
+ return(NULL);
+ }
+ }
+
+ if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ /* row_purge_record_func() will later set
+ ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */
+ false,
+ purge_sys.rseg->id,
+ purge_sys.page_no, purge_sys.offset);
+
+ /* The following call will advance the stored values of the
+ purge iterator. */
+
+ return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/** Fetch the undo log records for a purge batch and attach them to the
+purge query threads.
+@param n_purge_threads number of purge threads
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(ulint n_purge_threads)
+{
+ que_thr_t* thr;
+ ulint i;
+ ulint n_pages_handled = 0;
+ ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs);
+
+ ut_a(n_purge_threads > 0);
+
+ purge_sys.head = purge_sys.tail;
+
+#ifdef UNIV_DEBUG
+ i = 0;
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_ad(node->undo_recs.empty());
+ ut_ad(!node->in_progress);
+ ut_d(node->in_progress = true);
+ }
+
+	/* There should never be fewer nodes than threads; the inverse,
+	however, is allowed because we only use purge threads as needed. */
+ ut_ad(i == n_purge_threads);
+#endif
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ ut_a(n_thrs > 0 && thr != NULL);
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ i = 0;
+
+ const ulint batch_size = srv_purge_batch_size;
+ std::unordered_map<table_id_t, purge_node_t*> table_id_map;
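+	/* Route all undo log records of a given table to the same purge
+	node, so that a single purge thread handles that table within
+	this batch. */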
+ mem_heap_empty(purge_sys.heap);
+
+ while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
+ purge_node_t* node;
+ trx_purge_rec_t purge_rec;
+
+ ut_a(!thr->is_active);
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (purge_sys.head <= purge_sys.tail) {
+ purge_sys.head = purge_sys.tail;
+ }
+
+ /* Fetch the next record, and advance the purge_sys.tail. */
+ purge_rec.undo_rec = trx_purge_fetch_next_rec(
+ &purge_rec.roll_ptr, &n_pages_handled,
+ purge_sys.heap);
+
+ if (purge_rec.undo_rec == NULL) {
+ break;
+ } else if (purge_rec.undo_rec == &trx_purge_dummy_rec) {
+ continue;
+ }
+
+ table_id_t table_id = trx_undo_rec_get_table_id(
+ purge_rec.undo_rec);
+
+ purge_node_t *& table_node = table_id_map[table_id];
+
+ if (table_node) {
+ node = table_node;
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(
+ purge_sys.query->thrs);
+ }
+
+ ut_a(thr != NULL);
+ table_node = node;
+ }
+
+ node->undo_recs.push(purge_rec);
+
+ if (n_pages_handled >= batch_size) {
+ break;
+ }
+ }
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ return(n_pages_handled);
+}
+
+/*******************************************************************//**
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how long data manipulation language (DML) statements
+	need to be delayed in order to reduce the lag of the purge
+	thread. */
+ ulint delay = 0; /* in microseconds; default: no delay */
+
+ /* If purge lag is set then calculate the new DML delay. */
+
+ if (srv_max_purge_lag > 0) {
+ double ratio = static_cast<double>(trx_sys.rseg_history_len) /
+ static_cast<double>(srv_max_purge_lag);
+
+ if (ratio > 1.0) {
+ /* If the history list length exceeds the
+ srv_max_purge_lag, the data manipulation
+ statements are delayed by at least 5000
+ microseconds. */
+ delay = (ulint) ((ratio - .5) * 10000);
+ }
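+		/* For example, if the history list is twice as long as
+		srv_max_purge_lag, then ratio = 2.0 and
+		delay = (2.0 - 0.5) * 10000 = 15000 microseconds,
+		subject to the srv_max_purge_lag_delay cap below. */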
+
+ if (delay > srv_max_purge_lag_delay) {
+ delay = srv_max_purge_lag_delay;
+ }
+
+ MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+ }
+
+ return(delay);
+}
+
+extern tpool::waitable_task purge_worker_task;
+
+/** Wait for pending purge jobs to complete. */
+static void trx_purge_wait_for_workers_to_complete()
+{
+ bool notify_wait = purge_worker_task.is_running();
+
+ if (notify_wait)
+ tpool::tpool_wait_begin();
+
+ purge_worker_task.wait();
+
+ if(notify_wait)
+ tpool::tpool_wait_end();
+
+ /* There should be no outstanding tasks as long
+ as the worker threads are active. */
+ ut_ad(srv_get_task_queue_length() == 0);
+}
+
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param truncate whether to truncate the history at the end of the batch
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, bool truncate)
+{
+ que_thr_t* thr = NULL;
+ ulint n_pages_handled;
+
+ ut_ad(n_tasks > 0);
+
+ srv_dml_needed_delay = trx_purge_dml_delay();
+
+ purge_sys.clone_oldest_view();
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Fetch the UNDO recs that need to be purged. */
+ n_pages_handled = trx_purge_attach_undo_recs(n_tasks);
+
+ /* Submit tasks to workers queue if using multi-threaded purge. */
+ for (ulint i = n_tasks; --i; ) {
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+ ut_a(thr);
+ srv_que_task_enqueue_low(thr);
+ srv_thread_pool->submit_task(&purge_worker_task);
+ }
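+	/* Only n_tasks - 1 tasks were submitted above; the coordinator
+	thread executes the remaining query thread itself below. */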
+
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+
+ que_run_threads(thr);
+
+ trx_purge_wait_for_workers_to_complete();
+
+ if (truncate) {
+ trx_purge_truncate_history();
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+ MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+ return(n_pages_handled);
+}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000..438dfcf9
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,2559 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+#include "row0mysql.h"
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
+const dtuple_t trx_undo_metadata = {
+ /* This also works for REC_INFO_METADATA_ALTER, because the
+ delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */
+ REC_INFO_METADATA_ADD, 0, 0,
+ NULL, 0, NULL
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+};
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/** Calculate the free space left for extending an undo log record.
+@param undo_block undo log page
+@param ptr current end of the undo page
+@return bytes left */
+static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
+{
+ ut_ad(ptr >= &undo_block->frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+ /* The 10 is supposed to be an extra safety margin (and needed for
+ compatibility with older versions) */
+ lint left= srv_page_size - (ptr - undo_block->frame) -
+ (10 + FIL_PAGE_DATA_END);
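+  /* For example, on a 16KiB page with ptr at byte offset 1000,
+  left = 16384 - 1000 - (10 + 8) = 15366 bytes, because
+  FIL_PAGE_DATA_END is 8. */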
+ ut_ad(left >= 0);
+ return left < 0 ? 0 : static_cast<ulint>(left);
+}
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+uint16_t
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ buf_block_t* undo_block, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(ptr) == undo_block->frame);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+ return 0;
+
+ byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_FREE +
+ undo_block->frame);
+
+ const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ memcpy(ptr, ptr_to_first_free, 2);
+ ptr += 2;
+
+ const uint16_t end_of_rec= static_cast<uint16_t>(ptr - undo_block->frame);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+ /* Write offset of the next undo log record */
+ memcpy(undo_block->frame + first_free, ptr_to_first_free, 2);
+ const byte *start= undo_block->frame + first_free + 2;
+
+ mtr->undo_append(*undo_block, start, ptr - start - 2);
+ return first_free;
+}
+
+/** Virtual column undo log version. To distinguish it from a length value
+in 5.7.8 undo log, it starts with 0xF1 */
+static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1;
+
+/** Write virtual column index info (index id and column position in index)
+to the undo log
+@param[in,out] undo_block undo log page
+@param[in] table the table
+@param[in] pos the virtual column position
+@param[in] ptr undo log record being written
+@param[in] first_v_col whether this is the first virtual column
+ which could start with a version marker
+@return new undo log pointer */
+static
+byte*
+trx_undo_log_v_idx(
+ buf_block_t* undo_block,
+ const dict_table_t* table,
+ ulint pos,
+ byte* ptr,
+ bool first_v_col)
+{
+ ut_ad(pos < table->n_v_def);
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos);
+ byte* old_ptr;
+
+ ut_ad(!vcol->v_indexes.empty());
+
+ ulint size = first_v_col ? 1 + 2 : 2;
+ const ulint avail = trx_undo_left(undo_block, ptr);
+
+ /* The mach_write_compressed(ptr, flen) in
+	trx_undo_page_report_modify() will consume an additional 1 to 5 bytes. */
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ulint n_idx = 0;
+ for (const auto& v_index : vcol->v_indexes) {
+ n_idx++;
+ /* FIXME: index->id is 64 bits! */
+ size += mach_get_compressed_size(uint32_t(v_index.index->id));
+ size += mach_get_compressed_size(v_index.nth_field);
+ }
+
+ size += mach_get_compressed_size(n_idx);
+
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ut_d(const byte* orig_ptr = ptr);
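+	/* The format written below is: an optional 0xF1 version marker
+	(first virtual column only), a 2-byte total length, the compressed
+	number of indexes, and for each index a compressed index id
+	followed by the compressed field position. */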
+
+ if (first_v_col) {
+ /* write the version marker */
+ mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1);
+
+ ptr += 1;
+ }
+
+ old_ptr = ptr;
+
+ ptr += 2;
+
+ ptr += mach_write_compressed(ptr, n_idx);
+
+ for (const auto& v_index : vcol->v_indexes) {
+ ptr += mach_write_compressed(
+ /* FIXME: index->id is 64 bits! */
+ ptr, uint32_t(v_index.index->id));
+
+ ptr += mach_write_compressed(ptr, v_index.nth_field);
+ }
+
+ ut_ad(orig_ptr + size == ptr);
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ return(ptr);
+}
+
+/** Read virtual column index from undo log, and verify the column is still
+indexed, and return its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[out] col_pos the column number or FIL_NULL
+ if the column is not indexed any more
+@return remaining part of undo log record after reading these values */
+static
+const byte*
+trx_undo_read_v_idx_low(
+ const dict_table_t* table,
+ const byte* ptr,
+ uint32_t* col_pos)
+{
+ ulint len = mach_read_from_2(ptr);
+ const byte* old_ptr = ptr;
+
+ *col_pos = FIL_NULL;
+
+ ptr += 2;
+
+ ulint num_idx = mach_read_next_compressed(&ptr);
+
+ ut_ad(num_idx > 0);
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ for (ulint i = 0; i < num_idx; i++) {
+ index_id_t id = mach_read_next_compressed(&ptr);
+ ulint pos = mach_read_next_compressed(&ptr);
+ dict_index_t* index = dict_table_get_next_index(clust_index);
+
+ while (index != NULL) {
+ /* Return if we find a matching index.
+			TODO: in the future, it might be worth adding
+			checks on other indexes */
+ if (index->id == id) {
+ const dict_col_t* col = dict_index_get_nth_col(
+ index, pos);
+ ut_ad(col->is_virtual());
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+ *col_pos = vcol->v_pos;
+ return(old_ptr + len);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+ }
+
+ return(old_ptr + len);
+}
+
+/** Read virtual column index from undo log or online log if the log
+contains such info, and in the undo log case, verify the column is
+still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out]	is_undo_log	this function parses both the undo log and
+				the online log for virtual columns; when
+				first_v_col is true, is_undo_log is an
+				output, otherwise it is an input
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no)
+{
+ /* Version marker only put on the first virtual column */
+ if (first_v_col) {
+ /* Undo log has the virtual undo log marker */
+ *is_undo_log = (mach_read_from_1(ptr)
+ == VIRTUAL_COL_UNDO_FORMAT_1);
+
+ if (*is_undo_log) {
+ ptr += 1;
+ }
+ }
+
+ if (*is_undo_log) {
+ ptr = trx_undo_read_v_idx_low(table, ptr, field_no);
+ } else {
+ *field_no -= REC_MAX_N_FIELDS;
+ }
+
+ return(ptr);
+}
+
+/** Writes to the undo log the virtual column values of an insert.
+@param[in] undo_block undo log page
+@param[in] table the table
+@param[in] row dtuple contains the virtual columns
+@param[in,out] ptr log ptr
+@return true if write goes well, false if out of space */
+static
+bool
+trx_undo_report_insert_virtual(
+ buf_block_t* undo_block,
+ dict_table_t* table,
+ const dtuple_t* row,
+ byte** ptr)
+{
+ byte* start = *ptr;
+ bool first_v_col = true;
+
+ if (trx_undo_left(undo_block, *ptr) < 2) {
+ return(false);
+ }
+
+ /* Reserve 2 bytes to write the number
+ of bytes the stored fields take in this
+ undo record */
+ *ptr += 2;
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+
+			/* make sure there is enough space to write
+			the length */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ ulint pos = col_no;
+ pos += REC_MAX_N_FIELDS;
+ *ptr += mach_write_compressed(*ptr, pos);
+
+ *ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, *ptr, first_v_col);
+ first_v_col = false;
+
+ if (*ptr == NULL) {
+ return(false);
+ }
+
+ const dfield_t* vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ switch (ulint flen = vfield->len) {
+ case 0: case UNIV_SQL_NULL:
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ *ptr += mach_write_compressed(*ptr, flen);
+ break;
+ default:
+ ulint max_len
+ = dict_max_v_field_len_store_undo(
+ table, col_no);
+
+ if (flen > max_len) {
+ flen = max_len;
+ }
+
+ if (trx_undo_left(undo_block, *ptr)
+ < flen + 5) {
+ return(false);
+ }
+ *ptr += mach_write_compressed(*ptr, flen);
+
+ memcpy(*ptr, vfield->data, flen);
+ *ptr += flen;
+ }
+ }
+ }
+
+	/* Always mark the end of the log with a 2-byte length field */
+ mach_write_to_2(start, ulint(*ptr - start));
+
+ return(true);
+}
+
+/**********************************************************************//**
+Writes to the undo log an insert of a clustered index record.
+@return offset of the inserted entry on the page if it succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_insert(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame));
+ byte* ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
+ /* Not enough space for writing the general parameters */
+ return(0);
+ }
+
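+	/* The insert undo record written below consists of: a 2-byte
+	next-record pointer, a 1-byte type (TRX_UNDO_INSERT_REC), the
+	compressed undo number, the compressed table id, and the unique
+	fields of the clustered index entry, each preceded by its
+	compressed length. */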
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
+ ut_ad(clust_entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(undo_block->frame[first_free + 2]
+ == TRX_UNDO_INSERT_REC);
+ undo_block->frame[first_free + 2] = TRX_UNDO_INSERT_METADATA;
+ goto done;
+ }
+
+ for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr) < flen) {
+
+ return(0);
+ }
+
+ memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ if (index->table->n_v_cols) {
+ if (!trx_undo_report_insert_virtual(
+ undo_block, index->table, clust_entry, &ptr)) {
+ return(0);
+ }
+ }
+
+done:
+ return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ const byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
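+	/* The type_cmpl byte packs the record type in its low bits,
+	the compiler info multiplied by TRX_UNDO_CMPL_INFO_MULT, and
+	the TRX_UNDO_UPD_EXTERN flag. */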
+
+ *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+ type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+ ut_ad(*type <= TRX_UNDO_DEL_MARK_REC);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_read_next_much_compressed(&ptr);
+ *table_id = mach_read_next_much_compressed(&ptr);
+ ut_ad(*table_id);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Read from an undo log record a non-virtual column value.
+@param[in,out] ptr pointer to remaining part of the undo record
+@param[in,out] field stored field
+@param[in,out] len length of the field, or UNIV_SQL_NULL
+@param[in,out] orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_col_val(
+ const byte* ptr,
+ const byte** field,
+ uint32_t* len,
+ uint32_t* orig_len)
+{
+ *len = mach_read_next_compressed(&ptr);
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_next_compressed(&ptr);
+ *len = mach_read_next_compressed(&ptr);
+ *field = ptr;
+ ptr += *len & ~SPATIAL_STATUS_MASK;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* we do not have access to index->table here
+ ut_ad(dict_table_has_atomic_blobs(index->table)
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += (*len - UNIV_EXTERN_STORAGE_FIELD)
+ & ~SPATIAL_STATUS_MASK;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ dtuple_t* tuple = dtuple_create(heap, ref_len);
+ *ref = tuple;
+
+ dict_index_copy_types(tuple, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+static
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/** Fetch a prefix of an externally stored column, for writing to the undo
+log of an update or delete marking of a clustered index record.
+@param[out] ext_buf buffer to hold the prefix data and BLOB pointer
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field an externally stored column
+@param[in,out] len input: length of field; output: used length of
+ext_buf
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/** Writes to the undo log a prefix of an externally stored column.
+@param[out] ptr undo log position, at least 15 bytes must be
+available
+@param[out] ext_buf a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT()
+ size, or NULL when should not fetch a longer
+ prefix
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] field the locally stored part of the externally
+stored column
+@param[in,out] len length of field, in bytes
+@param[in] spatial_status whether the column is used by spatial index or
+ regular index
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+ byte* ptr,
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte** field,
+ ulint* len,
+ spatial_status_t spatial_status)
+{
+ ulint spatial_len= 0;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ case SPATIAL_NONE:
+ break;
+
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ spatial_len = DATA_MBR_LEN;
+ break;
+ }
+
+ /* Encode spatial status into length. */
+ spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT;
+
+ if (spatial_status == SPATIAL_ONLY) {
+		/* If the column is only used by a spatial index,
+		logging its MBR is enough. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + spatial_len);
+
+ return(ptr);
+ }
+
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len,
+ zip_size, *field, len);
+
+ ptr += mach_write_compressed(ptr, *len + spatial_len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len + spatial_len);
+ }
+
+ return(ptr);
+}
+
+/** Get MBR from a Geometry column stored externally
+@param[out] mbr MBR to fill
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field field contain the geometry data
+@param[in,out] len length of field, in bytes
+*/
+static
+void
+trx_undo_get_mbr_from_ext(
+/*======================*/
+ double* mbr,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ uchar* dptr = NULL;
+ ulint dlen;
+ mem_heap_t* heap = mem_heap_create(100);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, field, zip_size, *len, heap);
+
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
+ for (uint i = 0; i < SPDIMS; ++i) {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE), SPDIMS, mbr);
+ }
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Writes to the undo log an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if it
+succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_modify(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const dtuple_t* row, /*!< in: clustered index row contains
+ virtual column info */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ byte* ptr_to_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+
+ const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+ byte *ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 50) {
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+ return 0;
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
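+	/* The record written below consists of: the type/cmpl_info byte,
+	the undo number, the table id, the info bits, the old DB_TRX_ID
+	and DB_ROLL_PTR values, the unique fields of the record, the old
+	values of the updated columns, and (unless cmpl_info contains
+	UPD_NODE_NO_ORD_CHANGE) the old values of all ordering columns. */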
+
+ dict_table_t* table = index->table;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+ bool first_v_col = true;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an over-write. */
+ if (trx_id == trx->id) {
+ ignore_prefix = false;
+ }
+ ptr += mach_u64_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+ ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN));
+
+ ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ /* The ordering columns must not be instant added columns. */
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ulint n_updated = upd_get_n_fields(update);
+
+		/* If this is an online update while an inplace ALTER TABLE
+		is in progress and the table has virtual columns, we need
+		to double-check whether any non-indexed columns are
+		registered in the update vector, in case they will be
+		indexed in the new table. */
+ if (dict_index_is_online_ddl(index) && table->n_v_cols > 0) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ upd_field_t* fld = upd_get_nth_field(
+ update, i);
+ ulint pos = fld->field_no;
+
+ /* These columns must not have an index
+ on them */
+ if (upd_fld_is_virtual_col(fld)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ n_updated--;
+ }
+ }
+ }
+
+ i = 0;
+
+ if (UNIV_UNLIKELY(update->is_alter_metadata())) {
+ ut_ad(update->n_fields >= 1);
+ ut_ad(!upd_fld_is_virtual_col(&update->fields[0]));
+ ut_ad(update->fields[0].field_no
+ == index->first_user_field());
+ ut_ad(!dfield_is_ext(&update->fields[0].new_val));
+ ut_ad(!dfield_is_null(&update->fields[0].new_val));
+ /* The instant ADD COLUMN metadata record does not
+ contain the BLOB. Do not write anything for it. */
+ i = !rec_is_alter_metadata(rec, *index);
+ n_updated -= i;
+ }
+
+ ptr += mach_write_compressed(ptr, n_updated);
+
+ for (; i < upd_get_n_fields(update); i++) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return 0;
+ }
+
+ upd_field_t* fld = upd_get_nth_field(update, i);
+
+ bool is_virtual = upd_fld_is_virtual_col(fld);
+ ulint max_v_log_len = 0;
+
+ ulint pos = fld->field_no;
+ const dict_col_t* col = NULL;
+
+ if (is_virtual) {
+ /* Skip the non-indexed column, during
+ an online alter table */
+ if (dict_index_is_online_ddl(index)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ continue;
+ }
+
+				/* Add REC_MAX_N_FIELDS to mark that
+				this is a virtual column */
+ ptr += mach_write_compressed(
+ ptr, pos + REC_MAX_N_FIELDS);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ ut_ad(fld->field_no < table->n_v_def);
+
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ fld->field_no, ptr,
+ first_v_col);
+ if (ptr == NULL) {
+ return(0);
+ }
+ first_v_col = false;
+
+ max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, fld->field_no);
+
+ field = static_cast<byte*>(
+ fld->old_v_val->data);
+ flen = fld->old_v_val->len;
+
+ /* Only log sufficient bytes for index
+ record update */
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ goto store_len;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(pos >= index->first_user_field());
+ ut_ad(rec_is_metadata(rec, *index));
+
+ if (rec_is_alter_metadata(rec, *index)) {
+ ut_ad(update->is_alter_metadata());
+
+ field = rec_offs_n_fields(offsets)
+ > pos
+ && !rec_offs_nth_default(
+ offsets, pos)
+ ? rec_get_nth_field(
+ rec, offsets,
+ pos, &flen)
+ : index->instant_field_value(
+ pos - 1, &flen);
+
+ if (pos == index->first_user_field()) {
+ ut_ad(rec_offs_nth_extern(
+ offsets, pos));
+ ut_ad(flen == FIELD_REF_SIZE);
+ goto write_field;
+ }
+ col = dict_index_get_nth_col(index,
+ pos - 1);
+ } else if (!update->is_alter_metadata()) {
+ goto get_field;
+ } else {
+ /* We are converting an ADD COLUMN
+ metadata record to an ALTER TABLE
+ metadata record, with BLOB. Subtract
+ the missing metadata BLOB field. */
+ ut_ad(pos > index->first_user_field());
+ --pos;
+ goto get_field;
+ }
+ } else {
+get_field:
+ col = dict_index_get_nth_col(index, pos);
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+ }
+write_field:
+ /* Write field number to undo log */
+ ptr += mach_write_compressed(ptr, pos);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ if (rec_offs_n_fields(offsets) > pos
+ && rec_offs_nth_extern(offsets, pos)) {
+ ut_ad(col || pos == index->first_user_field());
+ ut_ad(col || update->is_alter_metadata());
+ ut_ad(col
+ || rec_is_alter_metadata(rec, *index));
+ ulint prefix_len = col
+ ? dict_max_field_len_store_undo(
+ table, col)
+ : 0;
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col
+ && col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ table->space->zip_size(),
+ &field, &flen, SPATIAL_UNKNOWN);
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+store_len:
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ /* Also record the new value for virtual column */
+ if (is_virtual) {
+ field = static_cast<byte*>(fld->new_val.data);
+ flen = fld->new_val.len;
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+ }
+
+	/* Reset first_v_col so that the virtual column undo version
+	marker is written again when we log all the indexed columns. */
+ first_v_col = true;
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+	in the purge of old versions, where we use it to build and search
+	for the delete-marked index records, to see whether we can remove
+	them from the index tree. Note that starting from 4.0.14,
+	externally stored fields can also be ordering fields in some index.
+	Starting from 5.2, we no longer store the first
+	REC_MAX_INDEX_COL_LEN bytes in the undo log record, but we can
+	construct the column prefix fields in the index by fetching the
+	first page of the BLOB that is pointed to by the clustered index.
+	This also works in crash recovery, because all pages (including
+	BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+ double mbr[SPDIMS * 2];
+ mem_heap_t* row_heap = NULL;
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (!col->ord_part) {
+ continue;
+ }
+
+ const ulint pos = dict_index_get_nth_col_pos(
+ index, col_no, NULL);
+ /* All non-virtual columns must be present in
+ the clustered index. */
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ const bool is_ext = rec_offs_nth_extern(offsets, pos);
+ const spatial_status_t spatial_status = is_ext
+ ? dict_col_get_spatial_status(col)
+ : SPATIAL_NONE;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ /* Externally stored spatially indexed
+ columns will be (redundantly) logged
+ again, because we did not write the
+ MBR yet, that is, the previous call to
+ trx_undo_page_report_modify_ext()
+ was with SPATIAL_UNKNOWN. */
+ break;
+ case SPATIAL_NONE:
+ if (!update) {
+ /* This is a DELETE operation. */
+ break;
+ }
+ /* Avoid redundantly logging indexed
+ columns that were updated. */
+
+ for (i = 0; i < update->n_fields; i++) {
+ const ulint field_no
+ = upd_get_nth_field(update, i)
+ ->field_no;
+ if (field_no >= index->n_fields
+ || dict_index_get_nth_field(
+ index, field_no)->col
+ == col) {
+ goto already_logged;
+ }
+ }
+ }
+
+ if (true) {
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_block, ptr) < 5 + 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+
+ if (is_ext) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+ const ulint zip_size
+ = table->space->zip_size();
+
+ /* If there is a spatial index on it,
+ log its MBR */
+ if (spatial_status != SPATIAL_NONE) {
+ ut_ad(DATA_GEOMETRY_MTYPE(
+ col->mtype));
+
+ trx_undo_get_mbr_from_ext(
+ mbr, zip_size,
+ field, &flen);
+ }
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ zip_size,
+ &field, &flen,
+ spatial_status);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL
+ && spatial_status != SPATIAL_ONLY) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ if (spatial_status != SPATIAL_NONE) {
+ if (trx_undo_left(undo_block, ptr)
+ < DATA_MBR_LEN) {
+ return(0);
+ }
+
+ for (int i = 0; i < SPDIMS * 2;
+ i++) {
+ mach_double_write(
+ ptr, mbr[i]);
+ ptr += sizeof(double);
+ }
+ }
+ }
+
+already_logged:
+ continue;
+ }
+
+ for (col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ ulint pos = col_no;
+ ulint max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, pos);
+
+ /* Write field number to undo log.
+ Make sure there is enough space in the log */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
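+ /* Virtual column positions are logged shifted by
+ REC_MAX_N_FIELDS so that the undo log parser can tell
+ them apart from ordinary clustered index field numbers. */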
+ pos += REC_MAX_N_FIELDS;
+ ptr += mach_write_compressed(ptr, pos);
+
+ ut_ad(col_no < table->n_v_def);
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, ptr,
+ first_v_col);
+ first_v_col = false;
+
+ if (!ptr) {
+ return(0);
+ }
+
+ const dfield_t* vfield = NULL;
+
+ if (update) {
+ ut_ad(!row);
+ if (update->old_vrow == NULL) {
+ flen = UNIV_SQL_NULL;
+ } else {
+ vfield = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col->v_pos);
+ }
+ } else if (row) {
+ vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ } else {
+ ut_ad(0);
+ }
+
+ if (vfield) {
+ field = static_cast<byte*>(vfield->data);
+ flen = vfield->len;
+ } else {
+ ut_ad(flen == UNIV_SQL_NULL);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
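+ /* Fill in the 2 bytes reserved above with the length of the
+ index column data stored in this undo log record. */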
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_block, ptr) < 2) {
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ const uint16_t new_free = static_cast<uint16_t>(
+ ptr + 2 - undo_block->frame);
+ mach_write_to_2(undo_block->frame + first_free, new_free);
+
+ mach_write_to_2(ptr_to_first_free, new_free);
+
+ const byte* start = &undo_block->frame[first_free + 2];
+ mtr->undo_append(*undo_block, start, ptr - start);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = *ptr++;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_u64_read_next_compressed(&ptr);
+ *roll_ptr = mach_u64_read_next_compressed(&ptr);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+ ulint n_skip_field = 0;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ n_fields = mach_read_next_compressed(&ptr);
+ } else {
+ n_fields = 0;
+ }
+
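+ /* The update vector gets two extra slots for the DB_TRX_ID
+ and DB_ROLL_PTR system columns, in addition to the n_fields
+ columns that were logged. */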
+ *upd = update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ mach_write_to_6(buf, trx_id);
+
+ upd_field_set_field_no(upd_field, index->db_trx_id(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(upd_field, index->db_roll_ptr(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ upd_field = upd_get_nth_field(update, i);
+ uint32_t field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ /* If new version, we need to check index list to figure
+ out the correct virtual column position */
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ /* This column could be dropped or no longer indexed */
+ if (field_no >= index->n_fields) {
+ /* Mark this is no longer needed */
+ upd_field->field_no = REC_MAX_N_FIELDS;
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ n_skip_field++;
+ continue;
+ }
+
+ upd_field_set_v_field_no(
+ upd_field, static_cast<uint16_t>(field_no),
+ index);
+ } else if (UNIV_UNLIKELY((update->info_bits
+ & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
+ const uint32_t uf = index->first_user_field();
+ ut_ad(field_no >= uf);
+
+ if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
+ /* Generic instant ALTER TABLE */
+ if (field_no == uf) {
+ upd_field->new_val.type
+ .metadata_blob_init();
+ } else if (field_no >= index->n_fields) {
+ /* This is reachable during
+ purge if the table was emptied
+ and converted to the canonical
+ format on a later ALTER TABLE.
+ In this case,
+ row_purge_upd_exist_or_extern()
+ would only be interested in
+ freeing any BLOBs that were
+ updated, that is, the metadata
+ BLOB above. Other BLOBs in
+ the metadata record are never
+ updated; they are for the
+ initial DEFAULT values of the
+ instantly added columns, and
+ they will never change.
+
+ Note: if the table becomes
+ empty during ROLLBACK or is
+ empty during subsequent ALTER
+ TABLE, and btr_page_empty() is
+ called to re-create the root
+ page without the metadata
+ record, in that case we should
+ only free the latest version
+ of BLOBs in the record,
+ which purge would never touch. */
+ field_no = REC_MAX_N_FIELDS;
+ n_skip_field++;
+ } else {
+ dict_col_copy_type(
+ dict_index_get_nth_col(
+ index, field_no - 1),
+ &upd_field->new_val.type);
+ }
+ } else {
+ /* Instant ADD COLUMN...LAST */
+ dict_col_copy_type(
+ dict_index_get_nth_col(index,
+ field_no),
+ &upd_field->new_val.type);
+ }
+ upd_field->field_no = field_no
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (field_no < index->n_fields) {
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ index);
+ } else {
+ ib::error() << "Trying to access update undo rec"
+ " field " << field_no
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " but index has only "
+ << dict_index_get_n_fields(index)
+ << " fields " << BUG_REPORT_MSG
+ << ". Run also CHECK TABLE "
+ << index->table->name << "."
+ " n_fields = " << n_fields << ", i = " << i;
+
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = static_cast<uint16_t>(orig_len);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+
+ ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)
+ || field_no != index->first_user_field()
+ || (upd_field->new_val.ext
+ && upd_field->new_val.len == FIELD_REF_SIZE));
+
+ if (is_virtual) {
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ heap, sizeof *upd_field->old_v_val));
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(upd_field->old_v_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(
+ upd_field->old_v_val, field, len);
+ } else {
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* We may have to skip dropped indexed virtual columns.
+ Also, we may have to trim the update vector of a metadata record
+ if dict_index_t::clear_instant_alter() was invoked on the table
+ later, and the number of fields no longer matches. */
+
+ if (n_skip_field) {
+ upd_field_t* d = upd_get_nth_field(update, 0);
+ const upd_field_t* const end = d + n_fields + 2;
+
+ for (const upd_field_t* s = d; s != end; s++) {
+ if (s->field_no != REC_MAX_N_FIELDS) {
+ *d++ = *s;
+ }
+ }
+
+ ut_ad(d + n_skip_field == end);
+ update->n_fields = d - upd_get_nth_field(update, 0);
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: updated columns */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ ut_ad(index->is_primary());
+
+ *row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+
+ /* Mark all columns in the row uninitialized, so that
+ we can distinguish missing fields from fields that are SQL NULL. */
+ for (ulint i = 0; i < dict_table_get_n_cols(index->table); i++) {
+ dfield_get_type(dtuple_get_nth_field(*row, i))
+ ->mtype = DATA_MISSING;
+ }
+
+ dtuple_init_v_fld(*row);
+
+ for (const upd_field_t* uf = update->fields, * const ue
+ = update->fields + update->n_fields;
+ uf != ue; uf++) {
+ if (uf->old_v_val) {
+ continue;
+ }
+ const dict_col_t& c = *dict_index_get_nth_col(index,
+ uf->field_no);
+ if (!c.is_dropped()) {
+ *dtuple_get_nth_field(*row, c.ind) = uf->new_val;
+ }
+ }
+
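+ /* The stored index columns are preceded by a 2-byte total
+ length that was written by trx_undo_page_report_modify(). */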
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no;
+ const dict_col_t* col;
+ uint32_t len, orig_len;
+
+ field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ /* This column could be dropped or no longer indexed */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ index->table, field_no);
+ col = &vcol->m_col;
+ dfield = dtuple_get_nth_v_field(*row, vcol->v_pos);
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ } else {
+ col = dict_index_get_nth_col(index, field_no);
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(*row, col->ind);
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dict_col_type_assert_equal(col,
+ &dfield->type));
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dfield->len == len
+ || (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD));
+ dict_col_copy_type(col, dfield_get_type(dfield));
+ }
+
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ spatial_status_t spatial_status;
+
+ /* Decode spatial status. */
+ spatial_status = static_cast<spatial_status_t>(
+ (len & SPATIAL_STATUS_MASK)
+ >> SPATIAL_STATUS_SHIFT);
+ len &= ~SPATIAL_STATUS_MASK;
+
+ /* Keep compatible with 5.7.9 format. */
+ if (spatial_status == SPATIAL_UNKNOWN) {
+ spatial_status =
+ dict_col_get_spatial_status(col);
+ }
+
+ switch (spatial_status) {
+ case SPATIAL_ONLY:
+ ut_ad(len - UNIV_EXTERN_STORAGE_FIELD
+ == DATA_MBR_LEN);
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_MIXED:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD
+ - DATA_MBR_LEN);
+ break;
+
+ case SPATIAL_NONE:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ break;
+ }
+
+ dfield_set_ext(dfield);
+ dfield_set_spatial_status(dfield, spatial_status);
+
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part
+ && spatial_status != SPATIAL_ONLY) {
+ ut_a(dfield_get_len(dfield)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_has_atomic_blobs(index->table)
+ || dfield_get_len(dfield)
+ >= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@param[in,out] block undo page
+@param[in,out] mtr mini-transaction
+@return byte offset of the undo log record
+@retval 0 in case of failure */
+static
+uint16_t
+trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
+ buf_block_t* block, mtr_t* mtr)
+{
+ byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + block->frame);
+ const uint16_t first_free = mach_read_from_2(ptr_first_free);
+ ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+ byte* const start = block->frame + first_free;
+ size_t len = strlen(table->name.m_name);
+ const size_t fixed = 2 + 1 + 11 + 11 + 2;
+ ut_ad(len <= NAME_LEN * 2 + 1);
+ /* The -10 is used in trx_undo_left() */
+ compile_time_assert((NAME_LEN * 1) * 2 + fixed
+ + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+ < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);
+
+ if (trx_undo_left(block, start) < fixed + len) {
+ ut_ad(first_free > TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_HDR_SIZE);
+ return 0;
+ }
+
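+ /* The first 2 bytes of the record will hold the offset of the
+ next undo log record; they are filled in below, once the end
+ of this record is known. */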
+ byte* ptr = start + 2;
+ *ptr++ = TRX_UNDO_RENAME_TABLE;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+ memcpy(ptr, table->name.m_name, len);
+ ptr += len;
+ mach_write_to_2(ptr, first_free);
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->frame);
+ memcpy(start, ptr_first_free, 2);
+ mtr->undo_append(*block, start + 2, ptr - start - 2);
+ return first_free;
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+{
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ ut_ad(!table->is_temporary());
+
+ mtr_t mtr;
+ dberr_t err;
+ mtr.start();
+ if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) {
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ ut_ad(err == DB_SUCCESS);
+ ut_ad(undo);
+ for (ut_d(int loop_count = 0);;) {
+ ut_ad(loop_count++ < 2);
+ ut_ad(undo->last_page_no
+ == block->page.id().page_no());
+
+ if (uint16_t offset = trx_undo_page_report_rename(
+ trx, table, block, &mtr)) {
+ undo->top_page_no = undo->last_page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = block;
+ ut_ad(!undo->empty());
+
+ err = DB_SUCCESS;
+ break;
+ } else {
+ mtr.commit();
+ mtr.start();
+ block = trx_undo_add_page(undo, &mtr);
+ if (!block) {
+ err = DB_OUT_OF_FILE_SPACE;
+ break;
+ }
+ }
+ }
+ }
+
+ mtr.commit();
+ return err;
+}
+
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+{
+ trx_t* trx;
+ mtr_t mtr;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!update || rec);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ /* This function must not be invoked during rollback
+ (of a TRX_STATE_PREPARE transaction or otherwise). */
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->in_rollback);
+
+ mtr.start();
+ trx_undo_t** pundo;
+ trx_rseg_t* rseg;
+ const bool is_temp = index->table->is_temporary();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ rseg = trx->get_temp_rseg();
+ pundo = &trx->rsegs.m_noredo.undo;
+ } else {
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ pundo = &trx->rsegs.m_redo.undo;
+ rseg = trx->rsegs.m_redo.rseg;
+ }
+
+ dberr_t err;
+ buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo,
+ &err, &mtr);
+ trx_undo_t* undo = *pundo;
+
+ ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
+ if (UNIV_UNLIKELY(undo_block == NULL)) {
+ goto err_exit;
+ }
+
+ ut_ad(undo != NULL);
+
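+ /* Try to write the undo log record to the last undo log page.
+ If it does not fit, the undo log is extended by one page and
+ the write is retried. */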
+ do {
+ uint16_t offset = !rec
+ ? trx_undo_page_report_insert(
+ undo_block, trx, index, clust_entry, &mtr)
+ : trx_undo_page_report_modify(
+ undo_block, trx, index, rec, offsets, update,
+ cmpl_info, clust_entry, &mtr);
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ const uint16_t first_free = mach_read_from_2(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+ memset(undo_block->frame + first_free, 0,
+ (srv_page_size - FIL_PAGE_DATA_END)
+ - first_free);
+
+ if (first_free
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+
+ mtr.commit();
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_free_last_page(undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ err = DB_UNDO_RECORD_TOO_BIG;
+ goto err_exit;
+ } else {
+ /* Write log for clearing the unused
+ tail of the undo page. It might
+ contain some garbage from a previously
+ written record, and mtr_t::write()
+ will optimize away writes of unchanged
+ bytes. Failure to write this caused a
+ recovery failure when we avoided
+ reading the undo log page from the
+ data file and initialized it based on
+ redo log records (which included the
+ write of the previous garbage). */
+ mtr.memset(*undo_block, first_free,
+ srv_page_size - first_free
+ - FIL_PAGE_DATA_END, 0);
+ }
+
+ mtr.commit();
+ } else {
+ /* Success */
+ undo->top_page_no = undo_block->page.id().page_no();
+ mtr.commit();
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = undo_block;
+ ut_ad(!undo->empty());
+
+ if (!is_temp) {
+ const undo_no_t limit = undo->top_undo_no;
+ /* Determine if this is the first time
+ when this transaction modifies a
+ system-versioned column in this table. */
+ trx_mod_table_time_t& time
+ = trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(
+ index->table, limit))
+ .first->second;
+ ut_ad(time.valid(limit));
+
+ if (!time.is_versioned()
+ && index->table->versioned_by_id()
+ && (!rec /* INSERT */
+ || (update
+ && update->affects_versioned()))) {
+ time.set_versioned(limit);
+ }
+ }
+
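+ /* The DB_ROLL_PTR encodes whether this was an insert, the
+ rollback segment id, the undo log page number and the byte
+ offset of the record within that page. */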
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, rseg->id, undo->top_page_no, offset);
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr.start();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ undo_block = trx_undo_add_page(undo, &mtr);
+
+ DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
+ undo_block = NULL;);
+ } while (UNIV_LIKELY(undo_block != NULL));
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ DB_OUT_OF_FILE_SPACE,
+ //ER_INNODB_UNDO_LOG_FULL,
+ "No more space left over in %s tablespace for allocating UNDO"
+ " log pages. Please add new data file to the tablespace or"
+ " check if filesystem is full or enable auto-extension for"
+ " the tablespace",
+ undo->rseg->space == fil_system.sys_space
+ ? "system" : is_temp ? "temporary" : "undo");
+
+ /* Did not succeed: out of space */
+ err = DB_OUT_OF_FILE_SPACE;
+
+err_exit:
+ mtr_commit(&mtr);
+ return(err);
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to a record that exists
+@param[in,out] heap memory heap where copied */
+static
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ trx_rseg_t* rseg;
+ bool is_insert;
+ mtr_t mtr;
+
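+ /* Split the roll pointer into its components: insert flag,
+ rollback segment id, undo log page number and byte offset
+ within the page. */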
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ rseg = trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(rseg->space->id, page_no), &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page->frame + offset, heap);
+
+ mtr.commit();
+
+ return(undo_rec);
+}
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to record
+@param[in,out] heap memory heap where copied
+@param[in] trx_id id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction
+@param[in] name table name
+@param[out] undo_rec own: copy of the record
+@retval true if the undo log has been
+truncated and we cannot fetch the old version
+@retval false if the undo log record is available
+NOTE: the caller must have latches on the clustered index page. */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+trx_undo_get_undo_rec(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap,
+ trx_id_t trx_id,
+ const table_name_t& name,
+ trx_undo_rec_t** undo_rec)
+{
+ rw_lock_s_lock(&purge_sys.latch);
+
+ bool missing_history = purge_sys.changes_visible(trx_id, name);
+ if (!missing_history) {
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ return(missing_history);
+}
+
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG MY_ATTRIBUTE((unused))
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+or being purged, which means that it may have been removed */
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers,/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+ mem_heap_t* v_heap, /*!< in: memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ differs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection */
+ dtuple_t** vrow, /*!< out: virtual column info, if any */
+ ulint v_status)
+ /*!< in: status that determines whether this
+ function is being called by the purge thread,
+ and whether we read the "after image" of the
+ undo log */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ byte info_bits;
+ ulint cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
+ ut_ad(index_mtr->memo_contains_page_flagged(index_rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(index->is_primary());
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return(true);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ if (trx_undo_get_undo_rec(
+ roll_ptr, heap, rec_trx_id, index->table->name,
+ &undo_rec)) {
+ if (v_status & TRX_UNDO_PREV_IN_PURGE) {
+ /* We are fetching the record being purged */
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ } else {
+ /* The undo record may already have been purged,
+ during purge or semi-consistent read. */
+ return(false);
+ }
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys.view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys.view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ ut_a(ptr);
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ /* the row_upd_changes_disowned_external(update) call could be
+ omitted, but the synchronization on purge_sys.latch is likely
+ more expensive. */
+
+ if ((update->info_bits & REC_INFO_DELETED_FLAG)
+ && row_upd_changes_disowned_external(update)) {
+ bool missing_extern;
+
+ rw_lock_s_lock(&purge_sys.latch);
+
+ missing_extern = purge_sys.changes_visible(
+ trx_id, index->table->name);
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ if (missing_extern) {
+ /* treat as a fresh insert, not to
+ cause assertion error at the caller. */
+ return(true);
+ }
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ if (!row_upd_index_replace_new_col_vals(entry, *index, update,
+ heap)) {
+ ut_a(v_status & TRX_UNDO_PREV_IN_PURGE);
+ return false;
+ }
+
+ /* Get number of externally stored columns in updated record */
+ const ulint n_ext = index->is_primary()
+ ? dtuple_get_n_ext(entry) : 0;
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, true, offsets);
+ rec_set_bit_field_1(*old_vers, update->info_bits,
+ rec_offs_comp(offsets)
+ ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf)) {
+ /* There are no virtual columns in
+ a clustered index record. */
+ continue;
+ }
+ const ulint n = uf->field_no;
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+ ut_ad(!index->table->not_redundant());
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+ - l;
+ *b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ continue;
+ }
+
+ ulint len;
+ memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+ uf->new_val.data, uf->new_val.len);
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ ut_ad(uf->new_val.len
+ == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ *(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+ &= byte(~REC_1BYTE_SQL_NULL_MASK);
+ }
+ }
+ }
+
+ /* Copy the old values (which are the after-image of the update)
+ from the update vector into the dtuple vrow */
+ if (v_status & TRX_UNDO_GET_OLD_V_VALUE) {
+ row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update,
+ false, NULL, NULL);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_dbg);
+ ut_a(!rec_offs_any_null_extern(
+ *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap)));
+#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+
+ if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(*vrow)) {
+ *vrow = dtuple_create_with_vcol(
+ v_heap ? v_heap : heap,
+ dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+ dtuple_init_v_fld(*vrow);
+ }
+
+ ut_ad(index->table->n_v_cols);
+ trx_undo_read_v_cols(index->table, ptr, *vrow,
+ v_status & TRX_UNDO_PREV_IN_PURGE);
+ }
+
+ return(true);
+}
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge)
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+ while (ptr < end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(
+ const_cast<const byte**>(&ptr));
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+ /* The virtual column is no longer indexed or does not exist.
+ This check needs to be placed after trx_undo_rec_get_col_val()
+ so that the undo ptr advances */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ if (!in_purge
+ || dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+ }
+ }
+ }
+
+ ut_ad(ptr == end_ptr);
+}
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000..23aa950a
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,984 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#include <my_service_manager.h>
+#include <mysql/service_wsrep.h>
+
+#include "fsp0fsp.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif
+
+/** true if trx_rollback_all_recovered() thread is active */
+bool trx_rollback_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+const trx_t* trx_roll_crash_recv_trx;
+
+/** Finish transaction rollback.
+@return whether the rollback was completed normally
+@retval false if the rollback was aborted by shutdown */
+inline bool trx_t::rollback_finish()
+{
+ mod_tables.clear();
+ if (UNIV_LIKELY(error_state == DB_SUCCESS))
+ {
+ commit();
+ return true;
+ }
+
+ ut_a(error_state == DB_INTERRUPTED);
+ ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+ ut_a(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+ ut_d(in_rollback= false);
+ if (trx_undo_t *&undo= rsegs.m_redo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ commit_low();
+ lock.que_state= TRX_QUE_RUNNING;
+ return false;
+}
+
+/** Roll back an active transaction. */
+inline void trx_t::rollback_low(trx_savept_t *savept)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ roll_node_t *roll_node= roll_node_create(heap);
+ roll_node->savept= savept;
+
+ ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+ {
+ const auto s= state;
+ ut_ad(s == TRX_STATE_ACTIVE ||
+ s == TRX_STATE_PREPARED ||
+ s == TRX_STATE_PREPARED_RECOVERED);
+ if (savept)
+ {
+ ut_ad(s == TRX_STATE_ACTIVE);
+ ut_ad(mysql_thd);
+ ut_ad(!is_recovered);
+ }
+ }
+#endif
+
+ error_state = DB_SUCCESS;
+
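+ /* If the transaction has written any undo log records, build a
+ rollback query graph and run it to undo the logged changes. */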
+ if (has_logged())
+ {
+ ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+ que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+ nullptr);
+ ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
+ que_run_threads(thr);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+ }
+
+ if (!savept)
+ {
+ rollback_finish();
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ }
+ else
+ {
+ ut_a(error_state == DB_SUCCESS);
+ const undo_no_t limit= savept->least_undo_no;
+ for (trx_mod_tables_t::iterator i= mod_tables.begin();
+ i != mod_tables.end(); )
+ {
+ trx_mod_tables_t::iterator j= i++;
+ ut_ad(j->second.valid());
+ if (j->second.rollback(limit))
+ mod_tables.erase(j);
+ }
+ lock.que_state= TRX_QUE_RUNNING;
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ mem_heap_free(heap);
+
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
+
+/** Initiate rollback.
+@param savept savepoint
+@return error code or DB_SUCCESS */
+dberr_t trx_t::rollback(trx_savept_t *savept)
+{
+ ut_ad(!trx_mutex_own(this));
+ if (state == TRX_STATE_NOT_STARTED)
+ {
+ error_state= DB_SUCCESS;
+ return DB_SUCCESS;
+ }
+ ut_ad(state == TRX_STATE_ACTIVE);
+#ifdef WITH_WSREP
+ if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+ wsrep_handle_SR_rollback(nullptr, mysql_thd);
+#endif /* WITH_WSREP */
+ rollback_low(savept);
+ return error_state;
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions,
+ then the transaction object does not have an InnoDB session
+ object, and we set a dummy session that we use for all MySQL
+ transactions. */
+
+ trx->rollback_low();
+
+ trx->op_info = "";
+
+ return(trx->error_state);
+}
+
+/** Rollback a transaction used in MySQL
+@param[in, out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t trx_rollback_for_mysql(trx_t* trx)
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx->will_lock = false;
+ ut_ad(trx->mysql_thd);
+#ifdef WITH_WSREP
+ trx->wsrep= false;
+ trx->lock.was_chosen_as_wsrep_victim= false;
+#endif
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ ut_ad(!trx->is_autocommit_non_locking());
+ if (trx->rsegs.m_redo.undo) {
+ /* The XA ROLLBACK of a XA PREPARE transaction
+ will consist of multiple mini-transactions.
+
+ As the very first step of XA ROLLBACK, we must
+ change the undo log state back from
+ TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order
+ to ensure that recovery will complete the
+ rollback.
+
+ Failure to perform this step could cause a
+ situation where we would roll back part of
+ a XA PREPARE transaction, the server would be
+ killed, and finally, the transaction would be
+ recovered in XA PREPARE state, with some of
+ the actions already having been rolled back. */
+ ut_ad(trx->rsegs.m_redo.undo->rseg
+ == trx->rsegs.m_redo.rseg);
+ mtr_t mtr;
+ mtr.start();
+ mutex_enter(&trx->rsegs.m_redo.rseg->mutex);
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ trx_undo_set_state_at_prepare(trx, undo, true,
+ &mtr);
+ }
+ mutex_exit(&trx->rsegs.m_redo.rseg->mutex);
+ /* Write the redo log for the XA ROLLBACK
+ state change to the global buffer. It is
+ not necessary to flush the redo log. If
+ a durable log write of a later mini-transaction
+ takes place for whatever reason, then this state
+ change will be durable as well. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ }
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx->rollback(&trx->last_sql_stat_start);
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_rollback_last_stmt(trx);
+ }
+
+ /* The following call should not be needed,
+ but we play it safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+ if (!strcmp(savep->name, name)) {
+ return(savep);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+static
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+}
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: free all savepoints starting
+ with this savepoint */
+{
+ while (savep != NULL) {
+ trx_named_savept_t* next_savep;
+
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_named_savept_t* savep, /*!< in/out: savepoint */
+ int64_t* mysql_binlog_cache_pos)
+ /*!< out: the MySQL binlog
+ cache position corresponding
+ to this savepoint; MySQL needs
+ this information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ dberr_t err;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->mysql_thd);
+
+ /* Free all savepoints strictly later than savep. */
+
+ trx_roll_savepoints_free(
+ trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx->rollback(&savep->savept);
+
+ /* Store the current undo_no of the transaction so that
+ we know where to roll back if we have to roll back the
+ next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+#ifdef WITH_WSREP
+ trx->lock.was_chosen_as_wsrep_victim = false;
+#endif
+ return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the savepoint rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep == NULL) {
+ return(DB_NO_SAVEPOINT);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ib::error() << "Transaction has a savepoint "
+ << savep->name
+ << " though it is not started";
+ return(DB_ERROR);
+
+ case TRX_STATE_ACTIVE:
+
+ return(trx_rollback_to_savepoint_for_mysql_low(
+ trx, savep, mysql_binlog_cache_pos));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The savepoint rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ trx_start_if_not_started_xa(trx, false);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = static_cast<trx_named_savept_t*>(
+ ut_malloc_nokey(sizeof(*savep)));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true)
+ || trx_state_eq(trx, TRX_STATE_PREPARED, true));
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep != NULL) {
+ trx_roll_savepoint_free(trx, savep);
+ }
+
+ return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
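+ /* A savepoint records the transaction's current undo number;
+ rolling back to it undoes the undo log records written from
+ this point on. */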
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ const trx_id_t trx_id = trx->id;
+
+ ut_ad(trx_id);
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+
+ const bool dictionary_locked = trx_get_dict_operation(trx)
+ != TRX_DICT_OP_NONE;
+
+ if (dictionary_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ que_graph_free(
+ static_cast<que_t*>(roll_node->undo_thr->common.parent));
+
+ if (UNIV_UNLIKELY(!trx->rollback_finish())) {
+ ut_ad(!dictionary_locked);
+ goto func_exit;
+ }
+
+ ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
+
+ if (!dictionary_locked || !trx->table_id) {
+ } else if (dict_table_t* table = dict_table_open_on_id(
+ trx->table_id, TRUE, DICT_TABLE_OP_NORMAL)) {
+ ib::info() << "Dropping table " << table->name
+ << ", with id " << trx->table_id
+ << " in recovery";
+
+ dict_table_close_and_drop(trx, table);
+
+ trx_commit_for_mysql(trx);
+ }
+
+ ib::info() << "Rolled back recovered transaction " << trx_id;
+
+func_exit:
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+
+struct trx_roll_count_callback_arg
+{
+ uint32_t n_trx;
+ uint64_t n_rows;
+ trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {}
+};
+
+
+static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
+ trx_roll_count_callback_arg *arg)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
+ {
+ arg->n_trx++;
+ arg->n_rows+= trx->undo_no;
+ }
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress()
+{
+ time_t now = time(NULL);
+ mutex_enter(&recv_sys.mutex);
+ bool report = recv_sys.report(now);
+ mutex_exit(&recv_sys.mutex);
+
+ if (report) {
+ trx_roll_count_callback_arg arg;
+
+ /* Get number of recovered active transactions and number of
+ rows they modified. Numbers must be accurate, because only this
+ thread is allowed to touch recovered transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(
+ trx_roll_count_callback, &arg);
+
+ if (arg.n_rows > 0) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "To roll back: " UINT32PF " transactions, "
+ UINT64PF " rows", arg.n_trx, arg.n_rows);
+ }
+
+ ib::info() << "To roll back: " << arg.n_trx
+ << " transactions, " << arg.n_rows << " rows";
+
+ }
+}
+
+
+static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
+ std::vector<trx_t*> *trx_list)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ mutex_enter(&trx->mutex);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered)
+ trx_list->push_back(trx);
+ mutex_exit(&trx->mutex);
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+
+/**
+ Rollback any incomplete transactions which were encountered in crash recovery.
+
+ If the transaction already was committed, then we clean up a possible insert
+ undo log. If the transaction was not yet committed, then we roll it back.
+
+ Note: For XA recovered transactions, we rely on MySQL to
+ do the rollback. They will be in TRX_STATE_PREPARED state. If the server
+ is shut down while they are still lingering in trx_sys_t::trx_list,
+ the shutdown will hang.
+
+ @param[in] all true=roll back all recovered active transactions;
+ false=roll back any incomplete dictionary transaction
+*/
+
+void trx_rollback_recovered(bool all)
+{
+ std::vector<trx_t*> trx_list;
+
+ ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+ /*
+ Collect list of recovered ACTIVE transaction ids first. Once collected, no
+ other thread is allowed to modify or remove these transactions from
+ rw_trx_hash.
+ */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
+ &trx_list);
+
+ while (!trx_list.empty())
+ {
+ trx_t *trx= trx_list.back();
+ trx_list.pop_back();
+
+ ut_ad(trx);
+ ut_d(trx_mutex_enter(trx));
+ ut_ad(trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_d(trx_mutex_exit(trx));
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
+ srv_fast_shutdown)
+ goto discard;
+
+ if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ || trx->has_stats_table_lock())
+ {
+ trx_rollback_active(trx);
+ if (trx->error_state != DB_SUCCESS)
+ {
+ ut_ad(trx->error_state == DB_INTERRUPTED);
+ trx->error_state= DB_SUCCESS;
+ ut_ad(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+discard:
+ /* Note: before kill_server() invoked innobase_end() via
+ unireg_end(), it invoked close_connections(), which should initiate
+ the rollback of any user transactions via THD::cleanup() in the
+ connection threads, and wait for all THD::cleanup() to complete.
+ So, no active user transactions should exist at this point.
+
+ srv_undo_sources=false was cleared early in innobase_end().
+
+ Generally, the server guarantees that all connections using
+ InnoDB must be disconnected by the time we are reaching this code,
+ be it during shutdown or UNINSTALL PLUGIN.
+
+ Because there is no possible race condition with any
+ concurrent user transaction, we do not have to invoke
+ trx->commit_state() or wait for !trx->is_referenced()
+ before trx_sys.deregister_rw(trx). */
+ trx_sys.deregister_rw(trx);
+ trx_free_at_shutdown(trx);
+ }
+ else
+ trx->free();
+ }
+ }
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C"
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_all_recovered)(void*)
+{
+ my_thread_init();
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ if (trx_sys.rw_trx_hash.size()) {
+ ib::info() << "Starting in background the rollback of"
+ " recovered transactions";
+ trx_rollback_recovered(true);
+ ib::info() << "Rollback of non-prepared transactions"
+ " completed";
+ }
+
+ trx_rollback_is_active = false;
+
+ my_thread_end();
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit();
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+static
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ ut_ad(trx_mutex_own(trx));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
+static
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t roll_limit) /*!< in: rollback to undo no (for
+ partial undo), 0 if we are rolling back
+ the entire transaction */
+{
+ ut_ad(trx_mutex_own(trx));
+
+ /* Initialize the rollback field in the transaction */
+
+ ut_ad(!trx->roll_limit);
+ ut_ad(!trx->in_rollback);
+
+ trx->roll_limit = roll_limit;
+ trx->in_rollback = true;
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ que_t* roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+
+ trx->lock.que_state = TRX_QUE_ROLLING_BACK;
+
+ return(que_fork_start_command(roll_graph));
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->state = ROLL_NODE_SEND;
+
+ node->common.type = QUE_NODE_ROLLBACK;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ trx_t* trx;
+ ib_id_t roll_limit;
+
+ trx = thr_get_trx(thr);
+
+ trx_mutex_enter(trx);
+
+ node->state = ROLL_NODE_WAIT;
+
+ ut_a(node->undo_thr == NULL);
+
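+		/* When rolling back to a savepoint, only undo records
+		starting from the savepoint's least_undo_no; 0 means a
+		complete rollback. */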
+ roll_limit = node->savept ? node->savept->least_undo_no : 0;
+
+ trx_commit_or_rollback_prepare(trx);
+
+ node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+ trx_mutex_exit(trx);
+
+ } else {
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000..307f8757
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,768 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+
+#ifdef UNIV_DEBUG
+/** The latest known WSREP XID sequence number */
+static long long wsrep_seqno = -1;
+#endif /* UNIV_DEBUG */
+/** The latest known WSREP XID UUID */
+static unsigned char wsrep_uuid[16];
+
+/** Write the WSREP XID information into rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini transaction */
+static void
+trx_rseg_write_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ DBUG_ASSERT(xid->gtrid_length >= 0);
+ DBUG_ASSERT(xid->bqual_length >= 0);
+ DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame,
+ uint32_t(xid->formatID));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame,
+ uint32_t(xid->gtrid_length));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame,
+ uint32_t(xid->bqual_length));
+
+ const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ + xid->bqual_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame,
+ xid->data, xid_length);
+ if (xid_length < XIDDATASIZE
+ && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, field_ref_zero,
+ XIDDATASIZE - xid_length)) {
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
+ XIDDATASIZE - xid_length, 0);
+ }
+}
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ ut_ad(wsrep_is_wsrep_xid(xid));
+
+#ifdef UNIV_DEBUG
+ /* Check that seqno is monotonically increasing */
+ long long xid_seqno = wsrep_xid_seqno(xid);
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+
+ if (xid_seqno != -1
+ && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
+ ut_ad(xid_seqno > wsrep_seqno);
+ } else {
+ memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
+ }
+ wsrep_seqno = xid_seqno;
+#endif /* UNIV_DEBUG */
+ trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
+}
+
+/** Clear the WSREP XID information from rollback segment header.
+@param[in,out] block rollback segment header
+@param[in,out] mtr mini-transaction */
+static void trx_rseg_clear_wsrep_checkpoint(buf_block_t *block, mtr_t *mtr)
+{
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO,
+ 0);
+}
+
+static void
+trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
+{
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+	/* We must check against wsrep_uuid here, because
+	trx_rseg_update_wsrep_checkpoint() overwrites wsrep_uuid with the
+	xid contents in debug mode, and then the memcmp() would never give
+	a nonzero result. */
+ const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid,
+ sizeof wsrep_uuid);
+ const trx_rseg_t* rseg = trx_sys.rseg_array[0];
+
+ buf_block_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
+ mtr);
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs) {
+ /* Because the UUID part of the WSREP XID differed
+ from current_xid_uuid, the WSREP group UUID was
+ changed, and we must reset the XID in all rollback
+ segment headers. */
+ for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
+ if (const trx_rseg_t* rseg =
+ trx_sys.rseg_array[rseg_id]) {
+ trx_rseg_clear_wsrep_checkpoint(
+ trx_rsegf_get(rseg->space,
+ rseg->page_no, mtr),
+ mtr);
+ }
+ }
+ }
+}
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match the UUIDs of the XIDs already
+stored in the rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid)
+{
+ mtr_t mtr;
+ mtr.start();
+ trx_rseg_update_wsrep_checkpoint(xid, &mtr);
+ mtr.commit();
+}
+
+/** Read the WSREP XID information in rollback segment header.
+@param[in] rseg_header Rollback segment header
+@param[out] xid Transaction XID
+@return whether the WSREP XID was present */
+static
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
+{
+ int formatID = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame));
+ if (formatID == 0) {
+ return false;
+ }
+
+ xid.formatID = formatID;
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame));
+
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame));
+
+ memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, XIDDATASIZE);
+
+ return true;
+}
+
+/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
+@param[in] page TRX_SYS page
+@param[out] xid WSREP XID (if present)
+@return whether the WSREP XID is present */
+static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD
+ + page)
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ return false;
+ }
+
+ xid.formatID = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_FORMAT + page));
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_GTRID_LEN + page));
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_BQUAL_LEN + page));
+ memcpy(xid.data,
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
+ return true;
+}
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid)
+{
+ mtr_t mtr;
+ long long max_xid_seqno = -1;
+ bool found = false;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
+ rseg_id++, mtr.commit()) {
+ mtr.start();
+ const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ const buf_block_t* rseg_header = trx_rsegf_get_new(
+ trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr);
+
+ if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame)) {
+ continue;
+ }
+
+ XID tmp_xid;
+ long long tmp_seqno = 0;
+ if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
+ && (tmp_seqno = wsrep_xid_seqno(&tmp_xid))
+ > max_xid_seqno) {
+ found = true;
+ max_xid_seqno = tmp_seqno;
+ xid = tmp_xid;
+ memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid),
+ sizeof wsrep_uuid);
+ }
+ }
+
+ return found;
+}
+#endif /* WITH_WSREP */
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
+{
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+	/* Also clear any possible garbage at the end of the page. Old
+	InnoDB versions did not initialize unused parts of pages. */
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8),
+ 0);
+}
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
+@param[in,out] mtr mini-transaction
+@return the created rollback segment
+@retval NULL on failure */
+buf_block_t*
+trx_rseg_header_create(
+ fil_space_t* space,
+ ulint rseg_id,
+ buf_block_t* sys_header,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(mtr->memo_contains(*space));
+ ut_ad(!sys_header == (space == fil_system.temp_space));
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+ return block;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG
+ + block->frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG
+ + block->frame));
+
+ /* Initialize the history list */
+ flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+ /* Reset the undo log slots */
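+	/* Filling with 0xff stores FIL_NULL in each slot, marking it
+	unused. */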
+ mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
+ TRX_RSEG_N_SLOTS * 4, 0xff);
+
+ if (sys_header) {
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, space->id);
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, block->page.id().page_no());
+ }
+
+ return block;
+}
+
+/** Free a rollback segment in memory. */
+void
+trx_rseg_mem_free(trx_rseg_t* rseg)
+{
+ trx_undo_t* undo;
+ trx_undo_t* next_undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ undo != NULL;
+ undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ ut_free(undo);
+ }
+
+ ut_free(rseg);
+}
+
+/** Create a rollback segment object.
+@param[in] id rollback segment id
+@param[in] space space where the segment is placed
+@param[in] page_no page number of the segment header */
+static
+trx_rseg_t*
+trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no)
+{
+ trx_rseg_t* rseg = static_cast<trx_rseg_t*>(
+ ut_zalloc_nokey(sizeof *rseg));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->page_no = page_no;
+ rseg->last_page_no = FIL_NULL;
+ rseg->curr_size = 1;
+
+ mutex_create(rseg->is_persistent()
+ ? LATCH_ID_REDO_RSEG : LATCH_ID_NOREDO_RSEG,
+ &rseg->mutex);
+
+ UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list);
+
+ return(rseg);
+}
+
+/** Read the undo log lists.
+@param[in,out] rseg rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in] rseg_header rollback segment header
+@return error code */
+static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ const buf_block_t *rseg_header)
+{
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++)
+ {
+ uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i);
+ if (page_no != FIL_NULL)
+ {
+ const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no,
+ max_trx_id);
+ if (!undo)
+ return DB_CORRUPTION;
+ rseg->curr_size+= undo->size;
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Restore the state of a persistent rollback segment.
+@param[in,out] rseg persistent rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ mtr_t *mtr)
+{
+ buf_block_t* rseg_hdr = trx_rsegf_get_new(
+ rseg->space->id, rseg->page_no, mtr);
+
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) {
+ trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_hdr->frame);
+
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME
+ + rseg_hdr->frame;
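+		/* A nonzero first byte of the name indicates that binlog
+		metadata was written to this rollback segment header. */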
+ if (*binlog_name) {
+ lsn_t lsn = mach_read_from_8(my_assume_aligned<8>(
+ FIL_PAGE_LSN
+ + rseg_hdr
+ ->frame));
+ compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
+ trx_sys.recovered_binlog_filename);
+ if (lsn > trx_sys.recovered_binlog_lsn) {
+ trx_sys.recovered_binlog_lsn = lsn;
+ trx_sys.recovered_binlog_offset
+ = mach_read_from_8(
+ TRX_RSEG
+ + TRX_RSEG_BINLOG_OFFSET
+ + rseg_hdr->frame);
+ memcpy(trx_sys.recovered_binlog_filename,
+ binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_read_wsrep_checkpoint(
+ rseg_hdr, trx_sys.recovered_wsrep_xid);
+#endif
+ }
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
+ }
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_hdr->frame)
+ + 1;
+ if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_hdr)) {
+ return err;
+ }
+
+ if (auto len = flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame)) {
+ trx_sys.rseg_history_len += len;
+
+ fil_addr_t node_addr = flst_get_last(TRX_RSEG
+ + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
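+		/* The list node resides at offset TRX_UNDO_HISTORY_NODE
+		within the undo log header; convert the node address so
+		that it points to the start of the undo log header. */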
+ node_addr.boffset = static_cast<uint16_t>(
+ node_addr.boffset - TRX_UNDO_HISTORY_NODE);
+
+ rseg->last_page_no = node_addr.page;
+
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, node_addr.page), mtr);
+
+ trx_id_t id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_ID);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_NO);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ rseg->set_last_commit(node_addr.boffset, id);
+ unsigned purge = mach_read_from_2(block->frame
+ + node_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE);
+ ut_ad(purge <= 1);
+ rseg->needs_purge = purge != 0;
+
+ if (rseg->last_page_no != FIL_NULL) {
+
+			/* There is no need to protect this operation with
+			the purge mutex because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Read binlog metadata from the TRX_SYS page, in case we are upgrading
+from MySQL or a MariaDB version older than 10.3.5. */
+static void trx_rseg_init_binlog_info(const page_t* page)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+ + page)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+ memcpy(trx_sys.recovered_binlog_filename,
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME
+ + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN);
+ trx_sys.recovered_binlog_offset = mach_read_from_8(
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET
+ + TRX_SYS + page);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
+#endif
+}
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init()
+{
+ trx_id_t max_trx_id = 0;
+
+ *trx_sys.recovered_binlog_filename = '\0';
+ trx_sys.recovered_binlog_offset = 0;
+#ifdef WITH_WSREP
+ trx_sys.recovered_wsrep_xid.null();
+ XID wsrep_sys_xid;
+ wsrep_sys_xid.null();
+ bool wsrep_xid_in_rseg_found = false;
+#endif
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ mtr.start();
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (rseg_id == 0) {
+ /* In case this is an upgrade from
+ before MariaDB 10.3.5, fetch the base
+ information from the TRX_SYS page. */
+ max_trx_id = mach_read_from_8(
+ TRX_SYS + TRX_SYS_TRX_ID_STORE
+ + sys->frame);
+ trx_rseg_init_binlog_info(sys->frame);
+#ifdef WITH_WSREP
+ wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+#endif
+ }
+
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+ if (page_no != FIL_NULL) {
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ rseg_id,
+ fil_space_get(trx_sysf_rseg_get_space(
+ sys, rseg_id)),
+ page_no);
+ ut_ad(rseg->is_persistent());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(!trx_sys.rseg_array[rseg_id]);
+ trx_sys.rseg_array[rseg_id] = rseg;
+ if ((err = trx_rseg_mem_restore(
+ rseg, max_trx_id, &mtr))
+ != DB_SUCCESS) {
+ mtr.commit();
+ break;
+ }
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null() &&
+ !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
+ wsrep_xid_in_rseg_found = true;
+ ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
+ wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
+ sizeof wsrep_uuid)
+ || wsrep_xid_seqno(
+ &wsrep_sys_xid)
+ <= wsrep_xid_seqno(
+ &trx_sys.recovered_wsrep_xid));
+ }
+#endif
+ }
+ }
+
+ mtr.commit();
+ }
+
+ if (err != DB_SUCCESS) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_rseg_t*& rseg = trx_sys.rseg_array[rseg_id]) {
+ while (trx_undo_t* u= UT_LIST_GET_FIRST(
+ rseg->undo_list)) {
+ UT_LIST_REMOVE(rseg->undo_list, u);
+ ut_free(u);
+ }
+ trx_rseg_mem_free(rseg);
+ rseg = NULL;
+ }
+ }
+ return err;
+ }
+
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null()) {
+ /* Upgrade from a version prior to 10.3.5,
+ where WSREP XID was stored in TRX_SYS page.
+ If no rollback segment has a WSREP XID set,
+ we must copy the XID found in TRX_SYS page
+ to rollback segments. */
+ mtr.start();
+
+ if (!wsrep_xid_in_rseg_found) {
+ trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
+ }
+
+ /* Finally, clear WSREP XID in TRX_SYS page. */
+ mtr.memset(trx_sysf_get(&mtr),
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+ TRX_SYS_WSREP_XID_LEN, 0);
+ mtr.commit();
+ }
+#endif
+
+ trx_sys.init_max_trx_id(max_trx_id + 1);
+ return DB_SUCCESS;
+}
+
+/** Create a persistent rollback segment.
+@param[in] space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval NULL on failure */
+trx_rseg_t*
+trx_rseg_create(ulint space_id)
+{
+ trx_rseg_t* rseg = NULL;
+ mtr_t mtr;
+
+ mtr.start();
+
+ fil_space_t* space = mtr_x_lock_space(space_id, &mtr);
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+ if (buf_block_t* sys_header = trx_sysf_get(&mtr)) {
+ ulint rseg_id = trx_sys_rseg_find_free(sys_header);
+ if (buf_block_t* rblock = rseg_id == ULINT_UNDEFINED
+ ? NULL
+ : trx_rseg_header_create(space, rseg_id, sys_header,
+ &mtr)) {
+ ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id)
+ == space_id);
+ rseg = trx_rseg_mem_create(rseg_id, space,
+ rblock->page.id().
+ page_no());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(rseg->is_persistent());
+ ut_ad(!trx_sys.rseg_array[rseg->id]);
+ trx_sys.rseg_array[rseg->id] = rseg;
+ }
+ }
+
+ mtr.commit();
+
+ return(rseg);
+}
+
+/** Create the temporary rollback segments. */
+void
+trx_temp_rseg_create()
+{
+ mtr_t mtr;
+
+ for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
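+		/* Changes to the temporary tablespace are not redo-logged. */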
+ mtr_x_lock_space(fil_system.temp_space, &mtr);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ fil_system.temp_space, i, NULL, &mtr);
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ i, fil_system.temp_space, rblock->page.id().page_no());
+ ut_ad(!rseg->is_persistent());
+ ut_ad(!trx_sys.temp_rsegs[i]);
+ trx_sys.temp_rsegs[i] = rseg;
+ mtr.commit();
+ }
+}
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] trx committing transaction
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ mtr_t *mtr)
+{
+ DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset);
+
+ const size_t len = strlen(trx->mysql_log_file_name) + 1;
+
+ ut_ad(len > 1);
+
+ if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) {
+ return;
+ }
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
+ + rseg_header->frame,
+ trx->mysql_log_offset);
+
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
+
+ if (memcmp(trx->mysql_log_file_name, name, len)) {
+ mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
+ }
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000..3064645f
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,339 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+#include "mysqld.h"
+#include "sql_error.h"
+
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** The transaction system */
+trx_sys_t trx_sys;
+
+/** Check whether transaction id is valid.
+@param[in] id transaction id to check
+@param[in] name table name */
+void
+ReadViewBase::check_trx_id_sanity(
+ trx_id_t id,
+ const table_name_t& name)
+{
+ if (id >= trx_sys.get_max_trx_id()) {
+
+ ib::warn() << "A transaction id"
+ << " in a record of table "
+ << name
+ << " is newer than the"
+ << " system-wide maximum.";
+ ut_ad(0);
+ THD *thd = current_thd;
+ if (thd != NULL) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ name.m_name);
+
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_SIGNAL_WARN,
+ "InnoDB: Transaction id"
+ " in a record of table"
+ " %s is newer than system-wide"
+ " maximum.", table_name);
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset()
+{
+ if (!*trx_sys.recovered_binlog_filename) {
+ return;
+ }
+
+ ib::info() << "Last binlog file '"
+ << trx_sys.recovered_binlog_filename
+ << "', position "
+ << trx_sys.recovered_binlog_offset;
+}
+
+/** Find an available rollback segment.
+@param[in]	sys_header	the TRX_SYS page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header)
+{
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_sysf_rseg_get_page_no(sys_header, rseg_id)
+ == FIL_NULL) {
+ return rseg_id;
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Count the number of initialized persistent rollback segment slots. */
+static
+void
+trx_sysf_get_n_rseg_slots()
+{
+ mtr_t mtr;
+ mtr.start();
+
+ srv_available_undo_logs = 0;
+ if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ srv_available_undo_logs
+ += trx_sysf_rseg_get_page_no(sys_header,
+ rseg_id)
+ != FIL_NULL;
+ }
+ }
+
+ mtr.commit();
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock_space(fil_system.sys_space, mtr);
+ compile_time_assert(TRX_SYS_SPACE == 0);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(fil_system.sys_space,
+ TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame,
+ FIL_PAGE_TYPE_TRX_SYS);
+
+ ut_ad(!mach_read_from_4(block->frame
+ + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC));
+
+	/* Reset the rollback segment slots. Old versions of InnoDB
+	(before MySQL 5.5) defined TRX_SYS_N_RSEGS as 256 and expected
+	the whole array to be initialized. */
+ compile_time_assert(256 >= TRX_SYS_N_RSEGS);
+ compile_time_assert(TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE
+ <= UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END);
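+	/* 0xff bytes store FIL_NULL in every rollback segment slot. */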
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS,
+ 256 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
+ /* Initialize all of the page. This part used to be uninitialized. */
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE),
+ 0);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ slot_no = trx_sys_rseg_find_free(block);
+ buf_block_t* rblock = trx_rseg_header_create(fil_system.sys_space,
+ slot_no, block, mtr);
+
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(rblock->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+}
+
+/** Create the instance */
+void
+trx_sys_t::create()
+{
+ ut_ad(this == &trx_sys);
+ ut_ad(!is_initialised());
+ m_initialised = true;
+ trx_list.create();
+ rseg_history_len= 0;
+
+ rw_trx_hash.init();
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+}
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs()
+{
+ /* srv_available_undo_logs reflects the number of persistent
+ rollback segments that have been initialized in the
+ transaction system header page. */
+ ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
+
+ if (high_level_read_only) {
+ srv_available_undo_logs = 0;
+ return(true);
+ }
+
+	/* This is executed in single-threaded mode, so it is not
+	necessary to use the same mtr in trx_rseg_create(). n_used cannot
+	change while the function is executing. */
+ trx_sysf_get_n_rseg_slots();
+
+ ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The first persistent rollback segment is always initialized
+ in the system tablespace. */
+ ut_a(srv_available_undo_logs > 0);
+
+ for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+ i++, srv_available_undo_logs++) {
+ /* Tablespace 0 is the system tablespace.
+ Dedicated undo log tablespaces start from 1. */
+ ulint space = srv_undo_tablespaces > 0
+ ? (i % srv_undo_tablespaces)
+ + srv_undo_space_id_start
+ : TRX_SYS_SPACE;
+
+ if (!trx_rseg_create(space)) {
+ ib::error() << "Unable to allocate the"
+ " requested innodb_undo_logs";
+ return(false);
+ }
+
+		/* Increase the number of active undo tablespaces in case
+		a new rollback segment was assigned to a new undo
+		tablespace. */
+ if (space > srv_undo_tablespaces_active) {
+ srv_undo_tablespaces_active++;
+
+ ut_ad(srv_undo_tablespaces_active == space);
+ }
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ ib::info info;
+ info << srv_available_undo_logs;
+ if (srv_undo_tablespaces_active) {
+ info << " rollback segments in " << srv_undo_tablespaces_active
+ << " undo tablespaces are active.";
+ } else {
+ info << " rollback segments are active.";
+ }
+
+ return(true);
+}
+
+/** Close the transaction system on shutdown */
+void
+trx_sys_t::close()
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+ if (!is_initialised()) {
+ return;
+ }
+
+ if (size_t size = view_count()) {
+ ib::error() << "All read views were not closed before"
+ " shutdown: " << size << " read views open";
+ }
+
+ rw_trx_hash.destroy();
+
+ /* There can't be any active transactions. */
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = rseg_array[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+
+ if (trx_rseg_t* rseg = temp_rsegs[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+ }
+
+ ut_a(trx_list.empty());
+ trx_list.close();
+ m_initialised = false;
+}
+
+/** @return total number of active (non-prepared) transactions */
+ulint trx_sys_t::any_active_transactions()
+{
+ uint32_t total_trx= 0;
+
+ trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
+ if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ (trx.state == TRX_STATE_ACTIVE && trx.id))
+ total_trx++;
+ });
+
+ return total_trx;
+}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000..cf8fa17c
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2300 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
+
+#include <mysql/service_thd_error_context.h>
+
+#include "btr0sea.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "trx0xa.h"
+#include "ut0pool.h"
+#include "ut0vec.h"
+
+#include <set>
+#include <new>
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+const byte trx_id_max_bytes[8] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/** The bit pattern corresponding to max timestamp */
+const byte timestamp_max_bytes[7] = {
+ 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
+};
+
+
+static const ulint MAX_DETAILED_ERROR_LEN = 256;
+
+/** Set of table_id */
+typedef std::set<
+ table_id_t,
+ std::less<table_id_t>,
+ ut_allocator<table_id_t> > table_id_set;
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
+ trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
+}
+
+/********************************************************************//**
+Initialize transaction object.
+@param trx trx to initialize */
+static
+void
+trx_init(
+/*=====*/
+ trx_t* trx)
+{
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->is_recovered = false;
+
+ trx->op_info = "";
+
+ trx->active_commit_ordered = false;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->check_foreigns = true;
+
+ trx->check_unique_secondary = true;
+
+ trx->lock.n_rec_locks = 0;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+
+ trx->table_id = 0;
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->error_key_num = ULINT_UNDEFINED;
+
+ trx->undo_no = 0;
+
+ trx->rsegs.m_redo.rseg = NULL;
+
+ trx->rsegs.m_noredo.rseg = NULL;
+
+ trx->read_only = false;
+
+ trx->auto_commit = false;
+
+ trx->will_lock = false;
+
+ trx->ddl = false;
+
+ trx->internal = false;
+
+ ut_d(trx->start_file = 0);
+
+ ut_d(trx->start_line = 0);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.rec_cached = 0;
+
+ trx->lock.table_cached = 0;
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep);
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+}
+
+/** For managing the life-cycle of the trx_t instance that we get
+from the pool. */
+struct TrxFactory {
+
+ /** Initializes a transaction object. It must be explicitly started
+ with trx_start_if_not_started() before using it. The default isolation
+ level is TRX_ISO_REPEATABLE_READ.
+ @param trx Transaction instance to initialise */
+ static void init(trx_t* trx)
+ {
+ /* Explicitly call the constructor of the already
+ allocated object. trx_t objects are allocated by
+ ut_zalloc_nokey() in Pool::Pool() which would not call
+ the constructors of the trx_t members. */
+ new(&trx->mod_tables) trx_mod_tables_t();
+
+ new(&trx->lock.table_locks) lock_list();
+
+ new(&trx->read_view) ReadView();
+
+ trx->rw_trx_hash_pins = 0;
+ trx_init(trx);
+
+ trx->dict_operation_lock_mode = 0;
+
+ trx->xid = UT_NEW_NOKEY(xid_t());
+
+ trx->detailed_error = reinterpret_cast<char*>(
+ ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 1024, MEM_HEAP_FOR_LOCK_HEAP);
+
+ lock_trx_lock_list_init(&trx->lock.trx_locks);
+
+ UT_LIST_INIT(trx->lock.evicted_tables,
+ &dict_table_t::table_LRU);
+
+ UT_LIST_INIT(
+ trx->trx_savepoints,
+ &trx_named_savept_t::trx_savepoints);
+
+ mutex_create(LATCH_ID_TRX, &trx->mutex);
+ }
+
+ /** Release resources held by the transaction object.
+ @param trx the transaction for which to release resources */
+ static void destroy(trx_t* trx)
+ {
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer */
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->mysql_thd);
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock.lock_heap != NULL) {
+ mem_heap_free(trx->lock.lock_heap);
+ trx->lock.lock_heap = NULL;
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ UT_DELETE(trx->xid);
+ ut_free(trx->detailed_error);
+
+ mutex_free(&trx->mutex);
+
+ trx->mod_tables.~trx_mod_tables_t();
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.table_locks.~lock_list();
+
+ trx->read_view.~ReadView();
+ }
+};
+
+/** The lock strategy for TrxPool */
+struct TrxPoolLock {
+ TrxPoolLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** The lock strategy for the TrxPoolManager */
+struct TrxPoolManagerLock {
+ TrxPoolManagerLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** Use explicit mutexes for the trx_t pool and its manager. */
+typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
+typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
+
+/** The trx_t pool manager */
+static trx_pools_t* trx_pools;
+
+/** Size of one trx_t pool in bytes. */
+static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
+
+/** Create the trx_t pool */
+void
+trx_pool_init()
+{
+ trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
+
+ ut_a(trx_pools != 0);
+}
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close()
+{
+ UT_DELETE(trx_pools);
+
+ trx_pools = 0;
+}
+
+/** @return an allocated transaction */
+trx_t *trx_create()
+{
+ trx_t* trx = trx_pools->get();
+
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer.
+ It may have been poisoned in trx_t::free().*/
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ trx->assert_freed();
+
+ mem_heap_t* heap;
+ ib_alloc_t* alloc;
+
+	/* We just got the trx from the pool; it should be non-locking. */
+ ut_ad(!trx->will_lock);
+ ut_ad(!trx->rw_trx_hash_pins);
+
+ DBUG_LOG("trx", "Create: " << trx);
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+
+ alloc = ib_heap_allocator_create(heap);
+
+ trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
+
+ ut_ad(trx->mod_tables.empty());
+ ut_ad(trx->lock.n_rec_locks == 0);
+ ut_ad(trx->lock.table_cached == 0);
+ ut_ad(trx->lock.rec_cached == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+
+ trx_sys.register_trx(trx);
+
+ return(trx);
+}
+
+/** Free the memory to trx_pools */
+void trx_t::free()
+{
+ MEM_CHECK_DEFINED(this, sizeof *this);
+
+ ut_ad(!n_mysql_tables_in_use);
+ ut_ad(!mysql_log_file_name);
+ ut_ad(!mysql_n_tables_locked);
+ ut_ad(!internal);
+ ut_ad(!will_lock);
+ ut_ad(error_state == DB_SUCCESS);
+ ut_ad(magic_n == TRX_MAGIC_N);
+ ut_ad(!read_only);
+ ut_ad(!lock.wait_lock);
+
+ dict_operation= TRX_DICT_OP_NONE;
+ trx_sys.deregister_trx(this);
+ assert_freed();
+ trx_sys.rw_trx_hash.put_pins(this);
+
+ mysql_thd= nullptr;
+
+ // FIXME: We need to avoid this heap free/alloc for each commit.
+ if (autoinc_locks)
+ {
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(autoinc_locks);
+ autoinc_locks= NULL;
+ }
+
+ mod_tables.clear();
+
+ MEM_NOACCESS(&n_ref, sizeof n_ref);
+ /* do not poison mutex */
+ MEM_NOACCESS(&id, sizeof id);
+ MEM_NOACCESS(&state, sizeof state);
+ MEM_NOACCESS(&is_recovered, sizeof is_recovered);
+#ifdef WITH_WSREP
+ MEM_NOACCESS(&wsrep, sizeof wsrep);
+#endif
+ read_view.mem_noaccess();
+ MEM_NOACCESS(&lock, sizeof lock);
+ MEM_NOACCESS(&op_info, sizeof op_info);
+ MEM_NOACCESS(&isolation_level, sizeof isolation_level);
+ MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&is_registered, sizeof is_registered);
+ MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
+ MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
+ MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
+ MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
+ MEM_NOACCESS(&duplicates, sizeof duplicates);
+ MEM_NOACCESS(&dict_operation, sizeof dict_operation);
+ MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
+ MEM_NOACCESS(&start_time, sizeof start_time);
+ MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
+ MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
+ MEM_NOACCESS(&table_id, sizeof table_id);
+ MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
+ MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
+ MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
+ MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
+ MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
+ MEM_NOACCESS(&error_state, sizeof error_state);
+ MEM_NOACCESS(&error_info, sizeof error_info);
+ MEM_NOACCESS(&error_key_num, sizeof error_key_num);
+ MEM_NOACCESS(&graph, sizeof graph);
+ MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
+ MEM_NOACCESS(&undo_no, sizeof undo_no);
+ MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
+ MEM_NOACCESS(&rsegs, sizeof rsegs);
+ MEM_NOACCESS(&roll_limit, sizeof roll_limit);
+ MEM_NOACCESS(&in_rollback, sizeof in_rollback);
+ MEM_NOACCESS(&pages_undone, sizeof pages_undone);
+ MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
+ MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
+ MEM_NOACCESS(&read_only, sizeof read_only);
+ MEM_NOACCESS(&auto_commit, sizeof auto_commit);
+ MEM_NOACCESS(&will_lock, sizeof will_lock);
+ MEM_NOACCESS(&fts_trx, sizeof fts_trx);
+ MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
+ MEM_NOACCESS(&flush_tables, sizeof flush_tables);
+ MEM_NOACCESS(&ddl, sizeof ddl);
+ MEM_NOACCESS(&internal, sizeof internal);
+#ifdef UNIV_DEBUG
+ MEM_NOACCESS(&start_line, sizeof start_line);
+ MEM_NOACCESS(&start_file, sizeof start_file);
+#endif /* UNIV_DEBUG */
+ MEM_NOACCESS(&xid, sizeof xid);
+ MEM_NOACCESS(&mod_tables, sizeof mod_tables);
+ MEM_NOACCESS(&detailed_error, sizeof detailed_error);
+#ifdef WITH_WSREP
+ ut_ad(!wsrep_UK_scan);
+ MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
+#endif /* WITH_WSREP */
+ MEM_NOACCESS(&magic_n, sizeof magic_n);
+ trx_pools->mem_free(this);
+}
+
+/** Transition to committed state, to release implicit locks. */
+inline void trx_t::commit_state()
+{
+ ut_ad(state == TRX_STATE_PREPARED
+ || state == TRX_STATE_PREPARED_RECOVERED
+ || state == TRX_STATE_ACTIVE);
+ /* This makes the transaction committed in memory and makes its
+ changes to data visible to other transactions. NOTE that there is a
+ small discrepancy from the strict formal visibility rules here: a
+ user of the database can see modifications made by another
+ transaction T even before the necessary redo log segment has been
+ flushed to the disk. If the database happens to crash before the
+ flush, the user has seen modifications from T which will never be a
+ committed transaction. However, any transaction T2 which sees the
+ modifications of the committing transaction T, and which also itself
+ makes modifications to the database, will get an lsn larger than the
+ committing transaction T. In the case where the log flush fails, and
+ T never gets committed, also T2 will never get committed. */
+ trx_mutex_enter(this);
+ state= TRX_STATE_COMMITTED_IN_MEMORY;
+ trx_mutex_exit(this);
+ ut_ad(id || !is_referenced());
+}
+
+/** Release any explicit locks of a committing transaction. */
+inline void trx_t::release_locks()
+{
+ DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!is_referenced());
+
+ if (UT_LIST_GET_LEN(lock.trx_locks))
+ {
+ lock_release(this);
+ lock.n_rec_locks = 0;
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap);
+ }
+
+ lock.table_locks.clear();
+}
+
+/** At shutdown, frees a transaction object. */
+void
+trx_free_at_shutdown(trx_t *trx)
+{
+ ut_ad(trx->is_recovered);
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || (trx_state_eq(trx, TRX_STATE_ACTIVE)
+ && (!srv_was_started
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || (!srv_is_being_started
+ && !srv_undo_sources && srv_fast_shutdown))));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx->commit_state();
+ trx->release_locks();
+ trx_undo_free_at_shutdown(trx);
+
+ ut_a(!trx->read_only);
+
+ DBUG_LOG("trx", "Free prepared: " << trx);
+ trx->state = TRX_STATE_NOT_STARTED;
+ ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
+ trx->id = 0;
+ trx->free();
+}
+
+
+/**
+ Disconnect a prepared transaction from MySQL
+ @param[in,out] trx transaction
+*/
+void trx_disconnect_prepared(trx_t *trx)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->mysql_log_file_name);
+ trx->read_view.close();
+ trx->is_recovered= true;
+ trx->mysql_thd= NULL;
+ /* todo/fixme: suggest to do it at innodb prepare */
+ trx->will_lock= false;
+ trx_sys.rw_trx_hash.put_pins(trx);
+}
+
+/****************************************************************//**
+Resurrect the table locks for a resurrected transaction. */
+static
+void
+trx_resurrect_table_locks(
+/*======================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const trx_undo_t* undo) /*!< in: undo log */
+{
+ mtr_t mtr;
+ table_id_set tables;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo->empty()) {
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ /* trx_rseg_mem_create() may have acquired an X-latch on this
+ page, so we cannot acquire an S-latch. */
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo->top_page_no), &mtr);
+ buf_block_t* undo_block = block;
+ trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
+
+ do {
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint cmpl_info;
+ bool updated_extern;
+
+ if (undo_block != block) {
+ mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
+ undo_block = block;
+ }
+
+ trx_undo_rec_get_pars(
+ undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.insert(table_id);
+
+ undo_rec = trx_undo_get_prev_rec(
+ block, page_offset(undo_rec), undo->hdr_page_no,
+ undo->hdr_offset, false, &mtr);
+ } while (undo_rec);
+
+ mtr_commit(&mtr);
+
+ for (table_id_set::const_iterator i = tables.begin();
+ i != tables.end(); i++) {
+ if (dict_table_t* table = dict_table_open_on_id(
+ *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+ if (!table->is_readable()) {
+ mutex_enter(&dict_sys.mutex);
+ dict_table_close(table, TRUE, FALSE);
+ dict_sys.remove(table);
+ mutex_exit(&dict_sys.mutex);
+ continue;
+ }
+
+ if (trx->state == TRX_STATE_PREPARED) {
+ trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(table,
+ 0));
+ }
+ lock_table_ix_resurrect(table, trx);
+
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id)
+ << " IX lock on " << table->name);
+
+ dict_table_close(table, FALSE, FALSE);
+ }
+ }
+}
+
+
+/**
+  Resurrect the transactions that were doing inserts/updates at the time of
+  the crash; they need to be undone.
+*/
+
+static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
+{
+ trx_state_t state;
+ /*
+    This is single-threaded startup code; we do not need the
+    protection of trx->mutex here.
+ */
+ switch (undo->state)
+ {
+ case TRX_UNDO_ACTIVE:
+ state= TRX_STATE_ACTIVE;
+ break;
+ case TRX_UNDO_PREPARED:
+ /*
+ Prepared transactions are left in the prepared state
+ waiting for a commit or abort decision from MySQL
+ */
+ ib::info() << "Transaction " << undo->trx_id
+ << " was in the XA prepared state.";
+
+ state= TRX_STATE_PREPARED;
+ break;
+ default:
+ return;
+ }
+
+ trx_t *trx= trx_create();
+ trx->state= state;
+ ut_d(trx->start_file= __FILE__);
+ ut_d(trx->start_line= __LINE__);
+
+ trx->rsegs.m_redo.undo= undo;
+ trx->undo_no= undo->top_undo_no + 1;
+ trx->rsegs.m_redo.rseg= rseg;
+ /*
+ Transactions with active data will not have rseg size = 1, nor will they
+ qualify for the purge limit criteria, so it is safe to increment this
+ trx_ref_count without mutex protection.
+ */
+ ++trx->rsegs.m_redo.rseg->trx_ref_count;
+ *trx->xid= undo->xid;
+ trx->id= undo->trx_id;
+ trx->is_recovered= true;
+ trx->start_time= start_time;
+ trx->start_time_micro= start_time_micro;
+
+ if (undo->dict_operation)
+ {
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ if (!trx->table_id)
+ trx->table_id= undo->table_id;
+ }
+
+ trx_sys.rw_trx_hash.insert(trx);
+ trx_sys.rw_trx_hash.put_pins(trx);
+ trx_resurrect_table_locks(trx, undo);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE))
+ *rows_to_undo+= trx->undo_no;
+}
+
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start()
+{
+ ut_a(srv_is_being_started);
+ ut_ad(!srv_was_started);
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return trx_rseg_array_init();
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ return DB_SUCCESS;
+ }
+
+ purge_sys.create();
+ if (dberr_t err = trx_rseg_array_init()) {
+ ib::info() << "Retry with innodb_force_recovery=5";
+ return err;
+ }
+
+ /* Look from the rollback segments if there exist undo logs for
+ transactions. */
+ const time_t start_time = time(NULL);
+ const ulonglong start_time_micro= microsecond_interval_timer();
+ uint64_t rows_to_undo = 0;
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_undo_t* undo;
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+
+ /* Some rollback segment may be unavailable,
+ especially if the server was previously run with a
+ non-default value of innodb_undo_logs. */
+ if (rseg == NULL) {
+ continue;
+ }
+ /* Resurrect other transactions. */
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t *trx = trx_sys.find(0, undo->trx_id, false);
+ if (!trx) {
+ trx_resurrect(undo, rseg, start_time,
+ start_time_micro, &rows_to_undo);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->start_time == start_time);
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->rsegs.m_redo.rseg == rseg);
+ ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count);
+
+ trx->rsegs.m_redo.undo = undo;
+ if (undo->top_undo_no >= trx->undo_no) {
+ if (trx_state_eq(trx,
+ TRX_STATE_ACTIVE)) {
+ rows_to_undo -= trx->undo_no;
+ rows_to_undo +=
+ undo->top_undo_no + 1;
+ }
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+ trx_resurrect_table_locks(trx, undo);
+ }
+ }
+ }
+
+ if (const auto size = trx_sys.rw_trx_hash.size()) {
+ ib::info() << size
+ << " transaction(s) which must be rolled back or"
+ " cleaned up in total " << rows_to_undo
+ << " row operations to undo";
+ ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
+ }
+
+ purge_sys.clone_oldest_view();
+ return DB_SUCCESS;
+}
+
+/** Assign a persistent rollback segment in a round-robin fashion,
+evenly distributed between 0 and innodb_undo_logs-1
+@return persistent rollback segment
+@retval NULL if innodb_read_only */
+static trx_rseg_t* trx_assign_rseg_low()
+{
+ if (high_level_read_only) {
+ ut_ad(!srv_available_undo_logs);
+ return(NULL);
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ /* The first slot is always assigned to the system tablespace. */
+ ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
+
+ /* Choose a rollback segment evenly distributed between 0 and
+ innodb_undo_logs-1 in a round-robin fashion, skipping those
+ undo tablespaces that are scheduled for truncation. */
+ static Atomic_counter<unsigned> rseg_slot;
+ unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+ ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ trx_rseg_t* rseg;
+
+#ifdef UNIV_DEBUG
+ ulint start_scan_slot = slot;
+ bool look_for_rollover = false;
+#endif /* UNIV_DEBUG */
+
+ bool allocated = false;
+
+ do {
+ for (;;) {
+ rseg = trx_sys.rseg_array[slot];
+
+#ifdef UNIV_DEBUG
+ /* Ensure that we are not revisiting the same
+ slot that we have already inspected. */
+ if (look_for_rollover) {
+ ut_ad(start_scan_slot != slot);
+ }
+ look_for_rollover = true;
+#endif /* UNIV_DEBUG */
+
+ ut_d(if (!trx_rseg_n_slots_debug))
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
+
+ if (rseg == NULL) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+
+ if (rseg->space != fil_system.sys_space) {
+ if (rseg->skip_allocation
+ || !srv_undo_tablespaces) {
+ continue;
+ }
+ } else if (trx_rseg_t* next
+ = trx_sys.rseg_array[slot]) {
+ if (next->space != fil_system.sys_space
+ && srv_undo_tablespaces > 0) {
+ /* If dedicated
+ innodb_undo_tablespaces have
+ been configured, try to use them
+ instead of the system tablespace. */
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we are ensuring that it will
+ never be selected for UNDO truncate purge. */
+ mutex_enter(&rseg->mutex);
+ if (!rseg->skip_allocation) {
+ rseg->trx_ref_count++;
+ allocated = true;
+ }
+ mutex_exit(&rseg->mutex);
+ } while (!allocated);
+
+ ut_ad(rseg->trx_ref_count > 0);
+ ut_ad(rseg->is_persistent());
+ return(rseg);
+}
+
+/** Assign a rollback segment for modifying temporary tables.
+@return the assigned rollback segment */
+trx_rseg_t *trx_t::assign_temp_rseg()
+{
+ ut_ad(!rsegs.m_noredo.rseg);
+ ut_ad(!is_autocommit_non_locking());
+ compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
+
+ /* Choose a temporary rollback segment between 0 and 127
+ in a round-robin fashion. */
+ static Atomic_counter<unsigned> rseg_slot;
+ trx_rseg_t* rseg = trx_sys.temp_rsegs[
+ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
+ ut_ad(!rseg->is_persistent());
+ rsegs.m_noredo.rseg = rseg;
+
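+ /* Even a transaction that only modifies temporary tables needs a
+ nonzero transaction id; assign one now if none has been assigned yet. */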
+ if (id == 0) {
+ trx_sys.register_rw(this);
+ }
+
+ ut_ad(!rseg->is_persistent());
+ return(rseg);
+}
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read-write transaction */
+{
+ ut_ad(!trx->in_rollback);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->start_line != 0);
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->roll_limit == 0);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ ut_ad(trx->rsegs.m_redo.rseg == NULL);
+ ut_ad(trx->rsegs.m_noredo.rseg == NULL);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only = srv_read_only_mode
+ || (!trx->ddl && !trx->internal
+ && thd_trx_is_read_only(trx->mysql_thd));
+
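+ /* An autocommit transaction that is not going to acquire any locks is
+ treated as read-only, so that it can run as a cheap non-locking
+ autocommit transaction. */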
+ if (!trx->auto_commit) {
+ trx->will_lock = true;
+ } else if (!trx->will_lock) {
+ trx->read_only = true;
+ }
+
+#ifdef WITH_WSREP
+ trx->xid->null();
+#endif /* WITH_WSREP */
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(trx->lock.table_locks.empty());
+
+ /* No other thread can access this trx object through rw_trx_hash,
+ but it can still be found through trx_sys.trx_list. Sometimes it's
+ possible to indirectly protect trx_t::state by freezing
+ trx_sys.trx_list.
+
+ For now we update it without mutex protection, because original code
+ did it this way. It has to be reviewed and fixed properly. */
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* By default all transactions are in the read-only list unless they
+ are non-locking auto-commit read only transactions or background
+ (internal) transactions. Note: Transactions marked explicitly as
+ read only can write to temporary tables; we put those on the RO
+ list too. */
+
+ if (!trx->read_only
+ && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
+
+ /* Temporary rseg is assigned only if the transaction
+ updates a temporary table */
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+ trx_sys.register_rw(trx);
+ } else {
+ if (!trx->is_autocommit_non_locking()) {
+
+ /* If this is a read-only transaction that is writing
+ to a temporary table then it needs a transaction id
+ to write to the temporary table. */
+
+ if (read_write) {
+ ut_ad(!srv_read_only_mode);
+ trx_sys.register_rw(trx);
+ }
+ } else {
+ ut_ad(!read_write);
+ }
+ }
+
+ trx->start_time = time(NULL);
+ trx->start_time_micro = trx->mysql_thd
+ ? thd_query_start_micro(trx->mysql_thd)
+ : microsecond_interval_timer();
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/** Set the serialisation number for a persistent committed transaction.
+@param[in,out] trx committed transaction with persistent changes */
+static
+void
+trx_serialise(trx_t* trx)
+{
+ trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(rseg);
+ ut_ad(mutex_own(&rseg->mutex));
+
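+ /* If the rollback segment is currently empty, this commit will produce
+ the segment's next purge event. Latch purge_sys.pq_mutex before assigning
+ the commit number, so that the number assignment and the purge queue
+ insertion below happen atomically. */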
+ if (rseg->last_page_no == FIL_NULL) {
+ mutex_enter(&purge_sys.pq_mutex);
+ }
+
+ trx_sys.assign_new_trx_no(trx);
+
+ /* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+ if (rseg->last_page_no == FIL_NULL) {
+ purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
+ *rseg));
+ mutex_exit(&purge_sys.pq_mutex);
+ }
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. */
+static
+void
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to some
+ other state: these modifications to the file data structure define
+ the transaction as committed in the file based domain, at the
+ serialization point of the log sequence number lsn obtained below. */
+
+ /* We have to hold the rseg mutex because update log headers have
+ to be put to the history list in the (serialisation) order of the
+ UNDO trx number. This is required for the purge in-memory data
+ structures too. */
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ /* Undo log for temporary tables is discarded at transaction
+ commit. There is no purge for temporary tables, and also no
+ MVCC, because they are private to a session. */
+
+ mtr_t temp_mtr;
+ temp_mtr.start();
+ temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
+ trx_undo_set_state_at_finish(undo, &temp_mtr);
+ mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
+ temp_mtr.commit();
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ if (!rseg) {
+ ut_ad(!trx->rsegs.m_redo.undo);
+ return;
+ }
+
+ trx_undo_t*& undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ return;
+ }
+
+ ut_ad(!trx->read_only);
+ ut_ad(!undo || undo->rseg == rseg);
+ mutex_enter(&rseg->mutex);
+
+ /* Assign the transaction serialisation number and add any
+ undo log to the purge queue. */
+ trx_serialise(trx);
+ if (undo) {
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ trx_purge_add_undo_to_history(trx, undo, mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+ fts_trx_table_t* ftt) /* in: FTS trx table */
+{
+ fts_t* fts = ftt->table->fts;
+ fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
+
+ ut_a(fts->add_wq);
+
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
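+ /* Hand the accumulated doc ids over to the table's FTS add queue
+ (fts_t::add_wq); ownership of the list is transferred along with it. */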
+ ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+ /* fts_trx_table_t no longer owns the list. */
+ ftt->added_doc_ids = NULL;
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool is_commit) /*!< in: true if the transaction was
+ committed, false if it was rolled back. */
+{
+ if (is_commit) {
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables);
+ node;
+ node = rbt_next(tables, node)) {
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ if ((*ftt)->added_doc_ids) {
+ trx_finalize_for_fts_table(*ftt);
+ }
+ }
+ }
+
+ fts_trx_free(trx->fts_trx);
+ trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn up to which logs are to be
+ flushed. */
+{
+ bool flush = srv_file_flush_method != SRV_NOSYNC;
+
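+ /* innodb_flush_log_at_trx_commit=0: do nothing here;
+ 2: write the log up to lsn, but do not flush it to disk;
+ 1 or 3: write the log and also flush it, unless
+ srv_file_flush_method == SRV_NOSYNC, in which case only the write is done. */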
+ switch (srv_flush_log_at_trx_commit) {
+ case 2:
+ /* Write the log but do not flush it to disk */
+ flush = false;
+ /* fall through */
+ case 1:
+ case 3:
+ /* Write the log and optionally flush it to disk */
+ log_write_up_to(lsn, flush);
+ srv_inc_activity_count();
+ return;
+ case 0:
+ /* Do nothing */
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed(
+/*====================*/
+ lsn_t lsn, /*!< in: lsn up to which logs are to be
+ flushed. */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "flushing log";
+ trx_flush_log_if_needed_low(lsn);
+ trx->op_info = "";
+}
+
+/**********************************************************************//**
+For each table that has been modified by the given transaction: update
+its dict_table_t::update_time with the current timestamp. Clear the list
+of the modified tables at the end. */
+static
+void
+trx_update_mod_tables_timestamp(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ /* consider using trx->start_time if calling time() is too
+ expensive here */
+ const time_t now = time(NULL);
+
+ trx_mod_tables_t::const_iterator end = trx->mod_tables.end();
+
+ for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
+ it != end;
+ ++it) {
+
+ /* This could be executed by multiple threads concurrently
+ on the same table object. This is fine because time_t is
+ word size or less. And _purely_ _theoretically_, even if
+ time_t write is not atomic, likely the value of 'now' is
+ the same in all threads and even if it is not, getting a
+ "garbage" in table->update_time is justified because
+ protecting it with a latch here would be too performance
+ intrusive. */
+ dict_table_t* table = it->first;
+ table->update_time = now;
+ }
+
+ trx->mod_tables.clear();
+}
+
+/** Evict a table definition due to the rollback of ALTER TABLE.
+@param[in] table_id table identifier */
+void trx_t::evict_table(table_id_t table_id)
+{
+ ut_ad(in_rollback);
+
+ dict_table_t* table = dict_table_open_on_id(
+ table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+ if (!table) {
+ return;
+ }
+
+ if (!table->release()) {
+ /* This must be a DDL operation that is being rolled
+ back in an active connection. */
+ ut_a(table->get_ref_count() == 1);
+ ut_ad(!is_recovered);
+ ut_ad(mysql_thd);
+ return;
+ }
+
+ /* This table should only be locked by this transaction, if at all. */
+ ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
+ const bool locked = UT_LIST_GET_LEN(table->locks);
+ ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
+ dict_sys.remove(table, true, locked);
+ if (locked) {
+ UT_LIST_ADD_FIRST(lock.evicted_tables, table);
+ }
+}
+
+/** Mark a transaction committed in the main memory data structures. */
+inline void trx_t::commit_in_memory(const mtr_t *mtr)
+{
+ must_flush_log_later= false;
+ read_view.close();
+
+ if (is_autocommit_non_locking())
+ {
+ ut_ad(id == 0);
+ ut_ad(read_only);
+ ut_ad(!will_lock);
+ ut_a(!is_recovered);
+ ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(mysql_thd);
+ ut_ad(state == TRX_STATE_ACTIVE);
+
+ /* Note: We are asserting without holding the lock mutex. But
+ that is OK because this transaction is not waiting and cannot
+ be rolled back and no new locks can (or should) be added
+ because it is flagged as a non-locking read-only transaction. */
+ ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the freezing of trx_sys.trx_list will protect the trx_t
+ instance and it cannot be removed from the trx_list and freed
+ without first unfreezing trx_list. */
+ state= TRX_STATE_NOT_STARTED;
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+
+ DBUG_LOG("trx", "Autocommit in memory: " << this);
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!UT_LIST_GET_LEN(lock.trx_locks))
+ for (auto l : lock.table_locks)
+ ut_ad(!l);
+#endif /* UNIV_DEBUG */
+ commit_state();
+
+ if (id)
+ {
+ trx_sys.deregister_rw(this);
+
+ /* Wait for any implicit-to-explicit lock conversions to cease,
+ so that there will be no race condition in lock_release(). */
+ while (UNIV_UNLIKELY(is_referenced()))
+ ut_delay(srv_spin_wait_delay);
+ }
+ else
+ ut_ad(read_only || !rsegs.m_redo.rseg);
+
+ if (read_only || !rsegs.m_redo.rseg)
+ {
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ }
+ else
+ {
+ trx_update_mod_tables_timestamp(this);
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ is_recovered= false;
+ }
+
+ release_locks();
+ id= 0;
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
+
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ }
+
+ ut_ad(!rsegs.m_redo.undo);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+
+ if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
+ {
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->trx_ref_count > 0);
+ --rseg->trx_ref_count;
+ mutex_exit(&rseg->mutex);
+ }
+
+ if (mtr)
+ {
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == rsegs.m_noredo.rseg);
+ trx_undo_commit_cleanup(undo);
+ undo= nullptr;
+ }
+
+ /* NOTE that we could possibly make a group commit more efficient
+ here: call os_thread_yield here to allow other transactions to also
+ reach their commit! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if the OS
+ does not crash. We may also flush the log files to disk, making
+ the transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group commit is that a group of transactions
+ gather behind a trx doing a physical disk write to log files, and
+ when that physical write has been completed, one of those
+ transactions does a write which commits the whole group. Note that
+ this group commit will only bring benefit if there are > 2 users
+ in the database. Then at least 2 users can gather behind one doing
+ the physical log write to disk.
+
+ If we are calling trx_t::commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the group commit
+ algorithm to work. Otherwise, the prepare_commit mutex would
+ serialize all commits and prevent a group of transactions from
+ gathering. */
+
+ commit_lsn= mtr->commit_lsn();
+ if (!commit_lsn)
+ /* Nothing to be done. */;
+ else if (flush_log_later)
+ /* Do nothing yet */
+ must_flush_log_later= true;
+ else if (srv_flush_log_at_trx_commit)
+ trx_flush_log_if_needed(commit_lsn, this);
+ }
+
+ ut_ad(!rsegs.m_noredo.undo);
+
+ /* Free all savepoints, starting from the first. */
+ trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);
+
+ trx_roll_savepoints_free(this, savep);
+
+ if (fts_trx)
+ trx_finalize_for_fts(this, undo_no != 0);
+
+#ifdef WITH_WSREP
+ /* Serialization history has been written and the transaction is
+ committed in memory, which makes this commit ordered. Release commit
+ order critical section. */
+ if (wsrep)
+ {
+ wsrep= false;
+ wsrep_commit_ordered(mysql_thd);
+ }
+ lock.was_chosen_as_wsrep_victim= false;
+#endif /* WITH_WSREP */
+ trx_mutex_enter(this);
+ dict_operation= TRX_DICT_OP_NONE;
+
+ DBUG_LOG("trx", "Commit in memory: " << this);
+ state= TRX_STATE_NOT_STARTED;
+
+ assert_freed();
+ trx_init(this);
+ trx_mutex_exit(this);
+
+ ut_a(error_state == DB_SUCCESS);
+ if (!srv_read_only_mode)
+ srv_wake_purge_thread_if_not_active();
+}
+
+/** Commit the transaction in a mini-transaction.
+@param mtr mini-transaction (if there are any persistent modifications) */
+void trx_t::commit_low(mtr_t *mtr)
+{
+ ut_ad(!mtr || mtr->is_active());
+ ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
+ ut_ad(!mtr == (aborted || !has_logged()));
+ ut_ad(!mtr || !aborted);
+
+ /* undo_no is non-zero if we're doing the final commit. */
+ if (fts_trx && undo_no)
+ {
+ ut_a(!is_autocommit_non_locking());
+ /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
+ dying. This is a possible scenario if there is a crash between
+ insert into the DELETED table committing and the transaction committing.
+ The proper fix would be to return the error from this function. */
+ if (dberr_t error= fts_commit(this))
+ ut_a(error == DB_DUPLICATE_KEY);
+ }
+
+#ifndef DBUG_OFF
+ const bool debug_sync= mysql_thd && has_logged_persistent();
+#endif
+
+ if (mtr)
+ {
+ trx_write_serialisation_history(this, mtr);
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this log
+ sequence number. The transaction becomes 'durable' when we write
+ the log to disk, but in the logical sense the commit in the
+ file-based data structures (undo logs etc.) happens here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come in
+ exactly the same order as commit lsn's, if the transactions have
+ different rollback segments. To get exactly the same order we
+ should hold the kernel mutex up to this point, adding to the
+ contention of the kernel mutex. However, if a transaction T2 is
+ able to see modifications made by a transaction T1, T2 will always
+ get a bigger transaction number and a bigger commit lsn than T1. */
+
+ mtr->commit();
+ }
+#ifndef DBUG_OFF
+ if (debug_sync)
+ DEBUG_SYNC_C("before_trx_state_committed_in_memory");
+#endif
+
+ commit_in_memory(mtr);
+}
+
+
+void trx_t::commit()
+{
+ mtr_t *mtr= nullptr;
+ mtr_t local_mtr;
+
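+ /* A mini-transaction is needed only if this transaction has generated
+ any undo log (persistent or temporary); otherwise the commit does not
+ modify any pages. */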
+ if (has_logged())
+ {
+ mtr= &local_mtr;
+ local_mtr.start();
+ }
+ commit_low(mtr);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, true);
+ /* fall through */
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ /* If the trx is in a lock wait state, moves the waiting
+ query thread to the suspended state */
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ ut_a(trx->lock.wait_thr != NULL);
+ trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+ trx->lock.wait_thr = NULL;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+ }
+
+ ut_ad(trx->lock.n_active_thrs == 1);
+ return;
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ trx_t* trx;
+
+ node->state = COMMIT_NODE_WAIT;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+ trx_commit_or_rollback_prepare(trx);
+
+ trx->lock.que_state = TRX_QUE_COMMITTING;
+ trx->commit();
+ ut_ad(trx->lock.wait_thr == NULL);
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ thr = NULL;
+ } else {
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Because we do not commit by sending an InnoDB signal to the
+ transaction, we must make sure here that trx has been started. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+
+ trx_start_low(trx, true);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->op_info = "committing";
+ trx->commit();
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+ trx->op_info = "";
+ return(DB_SUCCESS);
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ if (trx->id != 0
+ || !trx->must_flush_log_later
+ || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
+
+ return;
+ }
+
+ trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+ trx->must_flush_log_later = false;
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ trx->undo_no = 0;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
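+ /* Remember the current undo number as the starting point of the next
+ statement, so that rolling back the latest SQL statement will undo
+ exactly the rows modified by that statement. */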
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+ ibool newline;
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ fputs(", not started", f);
+ goto state_ok;
+ case TRX_STATE_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ goto state_ok;
+ }
+ fprintf(f, ", state %lu", (ulong) trx->state);
+ ut_ad(0);
+state_ok:
+ const char* op_info = trx->op_info;
+
+ if (*op_info) {
+ putc(' ', f);
+ fputs(op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ /* trx->lock.que_state of an ACTIVE transaction may change
+ while we are not holding trx->mutex. We perform a dirty read
+ for performance reasons. */
+
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+ }
+
+ if (n_trx_locks > 0 || heap_size > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) n_trx_locks,
+ (ulong) heap_size,
+ (ulong) n_rec_locks);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(
+ f, trx->mysql_thd, static_cast<uint>(max_query_len));
+ }
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys.mutex.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ut_ad(lock_mutex_own());
+
+ trx_print_low(f, trx, max_query_len,
+ lock_number_of_rows_locked(&trx->lock),
+ UT_LIST_GET_LEN(trx->lock.trx_locks),
+ mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.mutex. */
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ trx_print_low(f, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+}
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+bool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: transaction to be compared */
+ const trx_t* b) /*!< in: transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit != b_notrans_edit) {
+
+ return(a_notrans_edit);
+ }
+
+ /* Either both had edited non-transactional tables or both had
+ not, we fall back to comparing the number of altered/locked
+ rows. */
+
+ return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
+
+/** Prepare a transaction.
+@return log sequence number that makes the XA PREPARE durable
+@retval 0 if no changes needed to be made durable */
+static lsn_t trx_prepare_low(trx_t *trx)
+{
+ ut_ad(!trx->is_recovered);
+
+ mtr_t mtr;
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
+
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&undo->rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&undo->rseg->mutex);
+
+ mtr.commit();
+ }
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ /* There were no changes to persistent tables. */
+ return(0);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+
+ mtr.start();
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to
+ TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the file-based
+ world, at the serialization point of lsn. */
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ /* Make the XA PREPARE durable. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ return(mtr.commit_lsn());
+}
+
+/****************************************************************//**
+Prepares a transaction. */
+static
+void
+trx_prepare(
+/*========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Only fresh user transactions can be prepared.
+ Recovered transactions cannot. */
+ ut_a(!trx->is_recovered);
+
+ lsn_t lsn = trx_prepare_low(trx);
+
+ DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
+
+ ut_a(trx->state == TRX_STATE_ACTIVE);
+ trx_mutex_enter(trx);
+ trx->state = TRX_STATE_PREPARED;
+ trx_mutex_exit(trx);
+
+ if (lsn) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ We must not be holding any mutexes or latches here. */
+
+ trx_flush_log_if_needed(lsn, trx);
+ }
+}
+
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx)
+{
+ trx_start_if_not_started_xa(trx, false);
+
+ trx->op_info = "preparing";
+
+ trx_prepare(trx);
+
+ trx->op_info = "";
+}
+
+
+struct trx_recover_for_mysql_callback_arg
+{
+ XID *xid_list;
+ uint len;
+ uint count;
+};
+
+
+static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
+ trx_recover_for_mysql_callback_arg *arg)
+{
+ DBUG_ASSERT(arg->len > 0);
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ /*
+ The state of a read-write transaction can only change from ACTIVE to
+ PREPARED while we are holding the element->mutex. But since this code
+ is executed only at startup, no state change should occur here.
+ */
+ if (trx_state_eq(trx, TRX_STATE_PREPARED))
+ {
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->id);
+ if (arg->count == 0)
+ ib::info() << "Starting recovery for XA transactions...";
+ XID& xid= arg->xid_list[arg->count];
+ if (arg->count++ < arg->len)
+ {
+ trx->state= TRX_STATE_PREPARED_RECOVERED;
+ ib::info() << "Transaction " << trx->id
+ << " in prepared state after recovery";
+ ib::info() << "Transaction contains changes to " << trx->undo_no
+ << " rows";
+ xid= *trx->xid;
+ }
+ }
+ }
+ mutex_exit(&element->mutex);
+ /* Do not terminate upon reaching arg->len; count all transactions */
+ return false;
+}
+
+
+static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
+ void*)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
+ trx->state= TRX_STATE_PREPARED;
+ }
+ mutex_exit(&element->mutex);
+ return false;
+}
+
+
+/**
+ Find prepared transaction objects for recovery.
+
+ @param[out] xid_list prepared transactions
+ @param[in] len number of slots in xid_list
+
+ @return number of prepared transactions stored in xid_list
+*/
+
+int trx_recover_for_mysql(XID *xid_list, uint len)
+{
+ trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+ /* Fill xid_list with PREPARED transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
+ if (arg.count)
+ {
+ ib::info() << arg.count
+ << " transactions in prepared state after recovery";
+ /* After returning the full list, reset the state, because
+ init_server_components() wants to recover the collection of
+ transactions twice, by first calling tc_log->open() and then
+ ha_recover() directly. */
+ if (arg.count <= len)
+ trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
+ }
+ return int(std::min(arg.count, len));
+}
+
+
+struct trx_get_trx_by_xid_callback_arg
+{
+ const XID *xid;
+ trx_t *trx;
+};
+
+
+static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
+ trx_get_trx_by_xid_callback_arg *arg)
+{
+ my_bool found= 0;
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ trx_mutex_enter(trx);
+ if (trx->is_recovered &&
+ (trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
+ arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
+ {
+#ifdef WITH_WSREP
+ /* The commit of a prepared recovered Galera
+ transaction needs a valid trx->xid for
+ invoking trx_sys_update_wsrep_checkpoint(). */
+ if (!wsrep_is_wsrep_xid(trx->xid))
+#endif /* WITH_WSREP */
+ /* Invalidate the XID, so that subsequent calls will not find it. */
+ trx->xid->null();
+ arg->trx= trx;
+ found= 1;
+ }
+ trx_mutex_exit(trx);
+ }
+ mutex_exit(&element->mutex);
+ return found;
+}
+
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid)
+{
+ trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
+
+ if (xid)
+ trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
+ return arg.trx;
+}
+
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (trx->id == 0 && read_write) {
+ /* If the transaction is tagged as read-only then
+ it can only write to temp tables and for such
+ transactions we don't want to move them to the
+ trx_sys_t::rw_trx_hash. */
+ if (!trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ }
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (read_write && trx->id == 0 && !trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ return;
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts a transaction for internal processing. */
+void
+trx_start_internal_low(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, true);
+}
+
+/** Starts a read-only transaction for internal processing.
+@param[in,out] trx transaction to be started */
+void
+trx_start_internal_read_only_low(
+ trx_t* trx)
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, false);
+}
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op) /*!< in: dictionary operation type */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+
+ trx_set_dict_operation(trx, op);
+ trx->ddl= true;
+ trx_start_internal_low(trx);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Set the transaction as a read-write transaction if it is not already
+tagged as such. Read-only transactions that are writing to temporary
+tables are assigned an ID and a rollback segment but are not added
+to the trx read-write list because their updates should not be visible
+to other transactions and therefore their changes can be ignored by
+MVCC. */
+void
+trx_set_rw_mode(
+/*============*/
+ trx_t* trx) /*!< in/out: transaction that is RW */
+{
+ ut_ad(trx->rsegs.m_redo.rseg == 0);
+ ut_ad(!trx->is_autocommit_non_locking());
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id == 0);
+
+ if (high_level_read_only) {
+ return;
+ }
+
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0);
+
+ trx_sys.register_rw(trx);
+
+ /* So that we can see our own changes. */
+ if (trx->read_view.is_open()) {
+ trx->read_view.set_creator_trx_id(trx->id);
+ }
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (lock_list::const_iterator it= lock.table_locks.begin(),
+ end= lock.table_locks.end(); it != end; ++it)
+ {
+ const lock_t *lock= *it;
+ if (lock && lock->un_member.tab_lock.table->is_stats_table())
+ return true;
+ }
+
+ return false;
+}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000..3d2d9752
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,1401 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "log.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+ In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete-mark
+undo records also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+When a transaction does its first insert or modify in the clustered index, an
+undo log is assigned for it. Then we must have an x-latch on the rollback
+segment header.
+ When the transaction performs modifications or rolls back, its
+undo log is protected by undo page latches.
+Only the thread that is associated with the transaction may hold multiple
+undo page latches at a time. Undo pages are always private to a single
+transaction. Other threads that are performing MVCC reads
+or checking for implicit locks will lock at most one undo page at a time
+in trx_undo_get_undo_rec_low().
+ When the transaction commits, its persistent undo log is added
+to the history list. If it is not suitable for reuse, its slot is reset.
+In both cases, an x-latch must be acquired on the rollback segment header page.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset);/*!< in: undo log header byte offset on page */
+
+/** Determine the start offset of undo log records of an undo log page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return start offset */
+static
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ return page_no == block->page.id().page_no()
+ ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame)
+ : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+}
+
+/** Get the first undo log record on a page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to first record
+@retval NULL if none exists */
+static trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t start= trx_undo_page_get_start(block, page_no, offset);
+ return start == trx_undo_page_get_end(block, page_no, offset)
+ ? nullptr : block->frame + start;
+}
+
+/** Get the last undo log record on a page.
+@param[in] page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to last record
+@retval NULL if none exists */
+static
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+ return trx_undo_page_get_start(block, page_no, offset) == end
+ ? nullptr : block->frame + mach_read_from_2(block->frame + end - 2);
+}
+
+/** Get the previous record in an undo log from the previous page.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset,
+ bool shared, mtr_t *mtr)
+{
+ uint32_t prev_page_no= flst_get_prev_addr(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+
+ if (prev_page_no == FIL_NULL)
+ return nullptr;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+ 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_last_rec(block, page_no, offset);
+}
+
+/** Get the previous undo log record.
+@param[in] block undo log page
+@param[in] rec undo log record
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to record
+@retval NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+ uint32_t page_no, uint16_t offset)
+{
+ ut_ad(block->frame == page_align(rec));
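+ /* Each undo log record is terminated by two bytes that hold the page
+ offset of that record's start, so the two bytes immediately preceding
+ 'rec' locate the start of the preceding record. */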
+ return rec == block->frame + trx_undo_page_get_start(block, page_no, offset)
+ ? nullptr
+ : block->frame + mach_read_from_2(rec - 2);
+}
+
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
+ block->frame + rec,
+ page_no, offset))
+ return prev;
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset,
+ shared, mtr);
+}
+
+/** Get the next record in an undo log from the next page.
+@param[in,out] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
+ uint16_t offset, ulint mode, mtr_t *mtr)
+{
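+ /* If this is the undo log header page and a later log header has been
+ created on it (TRX_UNDO_NEXT_LOG is nonzero), then this undo log does
+ not extend to any later page, and there is no next record. */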
+ if (page_no == block->page.id().page_no() &&
+ mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG))
+ return NULL;
+
+ uint32_t next= flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+ if (next == FIL_NULL)
+ return NULL;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_first_rec(block, page_no, offset);
+}
+
+/** Get the next record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
+ offset))
+ return next;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset,
+ RW_S_LATCH, mtr);
+}
+
+/** Get the first record in an undo log.
+@param[in] space undo log header space
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out] block undo log page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+ uint16_t offset, ulint mode, buf_block_t*& block,
+ mtr_t *mtr)
+{
+ block = buf_page_get(page_id_t(space.id, page_no), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+ return rec;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+ mtr);
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out] block undo log page */
+void trx_undo_page_init(const buf_block_t &block)
+{
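+ /* These are plain, unlogged writes to the page frame; the redo log
+ coverage for this initialization is the single record that the caller
+ writes via mtr_t::undo_create() (see the note above). */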
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.frame),
+ FIL_PAGE_UNDO_LOG);
+ static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.frame,
+ 0, 2);
+ mach_write_to_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame),
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame, 2);
+ /* The following corresponds to flst_zero_both(), but without writing log. */
+ memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+ TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+ /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + block.frame, 0,
+ srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ FIL_PAGE_DATA_END));
+}
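+
+#if 0
+/* Minimal sketch (not compiled) of the state that trx_undo_page_init()
+leaves behind; the checking helper is hypothetical and only spells out the
+header fields initialized above. */
+static void trx_undo_page_init_expectation(const buf_block_t &block)
+{
+  ut_ad(mach_read_from_2(FIL_PAGE_TYPE + block.frame) == FIL_PAGE_UNDO_LOG);
+  /* No records yet: both the start and free pointers point just past the
+  undo page header. */
+  ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START
+                         + block.frame)
+        == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+  ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+                         + block.frame)
+        == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+  /* The page list node is detached: both neighbours are FIL_NULL. */
+  ut_ad(flst_get_prev_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
+                           + block.frame).page == FIL_NULL);
+  ut_ad(flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
+                           + block.frame).page == FIL_NULL);
+}
+#endif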
+
+/** Look for a free slot for an undo log segment.
+@param rseg_header rollback segment header
+@return slot index
+@retval ULINT_UNDEFINED if not found */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
+{
+ ulint max_slots= TRX_RSEG_N_SLOTS;
+
+#ifdef UNIV_DEBUG
+ if (trx_rseg_n_slots_debug)
+ max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
+
+ for (ulint i= 0; i < max_slots; i++)
+ if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL)
+ return i;
+
+ return ULINT_UNDEFINED;
+}
+
+/** Create an undo log segment.
+@param[in,out] space tablespace
+@param[in,out] rseg_hdr rollback segment header (x-latched)
+@param[out] id undo slot number
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+ dberr_t *err, mtr_t *mtr)
+{
+ buf_block_t* block;
+ uint32_t n_reserved;
+ bool success;
+
+ const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ib::warn() << "Cannot find a free slot for an undo log. Do"
+ " you have too many active transactions running"
+ " concurrently?";
+
+ *err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return NULL;
+ }
+
+ ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ mtr, true);
+
+ space->release_free_extents(n_reserved);
+
+ if (block == NULL) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ mtr->undo_create(*block);
+ trx_undo_page_init(*block);
+
+ mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame, 0U);
+
+ flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
+ mtr);
+
+ flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ *id = slot_no;
+ mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame,
+ block->page.id().page_no());
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/** Initialize an undo log header.
+@param[in,out] undo_page undo log segment header page
+@param[in] trx_id transaction identifier
+@param[in,out] mtr mini-transaction
+@return header byte offset on page */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+ mtr_t* mtr)
+{
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+ ut_ad(mach_read_from_2(undo_type) <= 2);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->frame);
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* Both TRX_UNDO_PAGE_START and TRX_UNDO_PAGE_FREE get the same value.
+ A WRITE record of 2 bytes is never longer than a MEMMOVE record, so
+ logging the two fields as WRITE 2+2 bytes would beat WRITE+MEMMOVE.
+ But a MEMSET record with a 2-byte pattern covers both fields in
+ 1+2 bytes, that is, one byte shorter still. */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->frame);
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy(*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+ buf, 4);
+ if (prev_log)
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
+ free);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+ undo_page->frame, trx_id);
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+ undo_page->frame, buf, 4);
+ /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
+ }
+ else
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+ return free;
+}
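+
+#if 0
+/* Minimal sketch (not compiled) of what trx_undo_header_create() writes at
+the returned offset; the checking helper is hypothetical and merely restates
+the fields initialized above. */
+static void trx_undo_header_create_expectation(const buf_block_t *undo_page,
+                                               uint16_t free, trx_id_t trx_id)
+{
+  const byte *hdr= undo_page->frame + free;
+  ut_ad(mach_read_from_8(hdr + TRX_UNDO_TRX_ID) == trx_id);
+  ut_ad(mach_read_from_2(hdr + TRX_UNDO_NEEDS_PURGE) == 1);
+  /* The XID is not written here; trx_undo_write_xid() may set it later. */
+  ut_ad(!hdr[TRX_UNDO_XID_EXISTS]);
+  /* The segment header now points to this log as the last one, and the
+  log is active. */
+  ut_ad(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+                         + undo_page->frame) == TRX_UNDO_ACTIVE);
+  ut_ad(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+                         + undo_page->frame) == free);
+}
+#endif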
+
+/** Write X/Open XA Transaction Identifier (XID) to undo log header
+@param[in,out] block undo header page
+@param[in] offset undo header record offset
+@param[in] xid distributed transaction identifier
+@param[in,out] mtr mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+ const XID &xid, mtr_t *mtr)
+{
+ DBUG_ASSERT(xid.gtrid_length > 0);
+ DBUG_ASSERT(xid.bqual_length >= 0);
+ DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+ DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+ static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+ "gtrid and bqual don't fit xid data");
+ DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ block->frame) == offset);
+
+ trx_ulogf_t* log_hdr= block->frame + offset;
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+ static_cast<uint32_t>(xid.formatID));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+ static_cast<uint32_t>(xid.gtrid_length));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ static_cast<uint32_t>(xid.bqual_length));
+ const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ + xid.bqual_length);
+ mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
+ if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+ mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+ XIDDATASIZE - xid_length, 0);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identifier (XID) from an undo log header */
+static
+void
+trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
+{
+ xid->formatID=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_FORMAT));
+
+ xid->gtrid_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_TRID_LEN));
+
+ xid->bqual_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_BQUAL_LEN));
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@return X-latched block if success
+@retval NULL on failure */
+buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ buf_block_t* new_block = NULL;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* header_block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1,
+ FSP_UNDO, mtr)) {
+ goto func_exit;
+ }
+
+ new_block = fseg_alloc_free_page_general(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ undo->top_page_no + 1, FSP_UP, true, mtr, mtr);
+
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block) {
+ goto func_exit;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
+ undo->last_page_no = new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+
+ flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return(new_block);
+}
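+
+#if 0
+/* Minimal sketch (not compiled): growing an undo log when its last page has
+no room for another record. The caller shown here is hypothetical; in the
+real code this happens while reporting a row operation. */
+static buf_block_t *trx_undo_grow_example(trx_undo_t *undo, mtr_t *mtr)
+{
+  /* trx_undo_add_page() acquires the rseg mutex itself, reserves a free
+  extent, allocates the page and appends it to TRX_UNDO_PAGE_LIST. */
+  buf_block_t *new_block= trx_undo_add_page(undo, mtr);
+  /* NULL means the tablespace is full; the caller would typically report
+  DB_OUT_OF_FILE_SPACE. */
+  return new_block;
+}
+#endif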
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+uint32_t
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ bool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ uint32_t hdr_page_no, /*!< in: header page number */
+ uint32_t page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ const ulint space = rseg->space->id;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* undo_block = trx_undo_page_get(page_id_t(space, page_no),
+ mtr);
+ buf_block_t* header_block = trx_undo_page_get(page_id_t(space,
+ hdr_page_no),
+ mtr);
+
+ flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ rseg->space, page_no, mtr);
+ buf_page_free(rseg->space, page_no, mtr, __FILE__, __LINE__);
+
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_block->frame);
+ rseg->curr_size--;
+
+ if (in_history) {
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame;
+ uint32_t hist_size = mach_read_from_4(rseg_hist_size);
+ ut_ad(hist_size > 0);
+ mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ }
+
+ return(last_addr.page);
+}
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page latch,
+ or that has allocated the undo log page
+void
+trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+
+ undo->last_page_no = trx_undo_free_page(
+ undo->rseg, false, undo->hdr_page_no, undo->last_page_no, mtr);
+
+ undo->size--;
+}
+
+/** Truncate the tail of an undo log during rollback.
+@param[in,out] undo undo log
+@param[in] limit all undo records with undo number >= limit are discarded
+@param[in] is_temp whether this is a temporary undo log
+void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
+{
+ mtr_t mtr;
+ ut_ad(is_temp == !undo.rseg->is_persistent());
+
+ for (;;) {
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ trx_undo_rec_t* trunc_here = NULL;
+ mutex_enter(&undo.rseg->mutex);
+ buf_block_t* undo_block = trx_undo_page_get(
+ page_id_t(undo.rseg->space->id, undo.last_page_no),
+ &mtr);
+ trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ undo_block, undo.hdr_page_no, undo.hdr_offset);
+ while (rec) {
+ if (trx_undo_rec_get_undo_no(rec) < limit) {
+ goto func_exit;
+ }
+ /* Truncate at least this record off, maybe more */
+ trunc_here = rec;
+
+ rec = trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no,
+ undo.hdr_offset);
+ }
+
+ if (undo.last_page_no != undo.hdr_page_no) {
+ trx_undo_free_last_page(&undo, &mtr);
+ mutex_exit(&undo.rseg->mutex);
+ mtr.commit();
+ continue;
+ }
+
+func_exit:
+ mutex_exit(&undo.rseg->mutex);
+
+ if (trunc_here) {
+ mtr.write<2>(*undo_block,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame,
+ ulint(trunc_here - undo_block->frame));
+ }
+
+ mtr.commit();
+ return;
+ }
+}
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated) */
+void
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (!limit) {
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ if (!rseg->is_persistent()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ buf_block_t* undo_page;
+ rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+ RW_X_LATCH, undo_page, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+done:
+ mtr.commit();
+ return;
+ }
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+ goto done;
+ }
+
+ if (undo_page->page.id().page_no() == hdr_page_no) {
+ uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
+ + undo_page->frame);
+ if (end == 0) {
+ end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_page->frame);
+ }
+
+ mtr.write<2>(*undo_page, undo_page->frame + hdr_offset
+ + TRX_UNDO_LOG_START, end);
+ } else {
+ trx_undo_free_page(rseg, true, hdr_page_no,
+ undo_page->page.id().page_no(), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/** Frees an undo log segment which is not in the history list.
+@param undo temporary undo log */
+static void trx_undo_seg_free(const trx_undo_t *undo)
+{
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ trx_rseg_t* const rseg = undo->rseg;
+ bool finished;
+ mtr_t mtr;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ do {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), &mtr);
+
+ fseg_header_t* file_seg = TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER + block->frame;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, &mtr);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ memset(TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE +
+ rseg_header->frame, 0xff, 4);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+
+ mtr.commit();
+ } while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@param[in,out] max_trx_id the largest observed transaction ID
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
+ trx_id_t &max_trx_id)
+{
+ mtr_t mtr;
+ XID xid;
+
+ ut_ad(id < TRX_RSEG_N_SLOTS);
+
+ mtr.start();
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, page_no), &mtr);
+ const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE
+ + block->frame);
+ if (UNIV_UNLIKELY(type > 2)) {
+corrupted_type:
+ sql_print_error("InnoDB: unsupported undo header type %u",
+ type);
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
+ offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+ sql_print_error("InnoDB: invalid undo header offset %u",
+ offset);
+ goto corrupted;
+ }
+
+ const trx_ulogf_t* const undo_header = block->frame + offset;
+ uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame);
+ switch (state) {
+ case TRX_UNDO_ACTIVE:
+ case TRX_UNDO_PREPARED:
+ if (UNIV_LIKELY(type != 1)) {
+ break;
+ }
+ sql_print_error("InnoDB: upgrade from a version older than"
+ " MariaDB 10.3 requires a clean shutdown");
+ goto corrupted;
+ default:
+ sql_print_error("InnoDB: unsupported undo header state %u",
+ state);
+ goto corrupted;
+ case TRX_UNDO_TO_PURGE:
+ if (UNIV_UNLIKELY(type == 1)) {
+ goto corrupted_type;
+ }
+ /* fall through */
+ case TRX_UNDO_CACHED:
+ trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header);
+ if (id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_NO %llx", id);
+ goto corrupted;
+ }
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ }
+
+ /* Read X/Open XA transaction identification if it exists, or
+ set it to NULL. */
+
+ if (undo_header[TRX_UNDO_XID_EXISTS]) {
+ trx_undo_read_xid(undo_header, &xid);
+ } else {
+ xid.null();
+ }
+
+ trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+ if (trx_id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
+ goto corrupted;
+ }
+ if (trx_id > max_trx_id) {
+ max_trx_id = trx_id;
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_t* undo = trx_undo_mem_create(
+ rseg, id, trx_id, &xid, page_no, offset);
+ mutex_exit(&rseg->mutex);
+ if (!undo) {
+ return undo;
+ }
+
+ undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
+ undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
+ undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->frame);
+
+ fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ const buf_block_t* last = trx_undo_page_get(
+ page_id_t(rseg->space->id, undo->last_page_no), &mtr);
+
+ if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ last, page_no, offset)) {
+ undo->top_offset = static_cast<uint16_t>(rec - last->frame);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ undo->state = state;
+
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(rseg->undo_list, undo);
+ } else {
+ UT_LIST_ADD_LAST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+
+ mtr.commit();
+ return undo;
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ ut_a(id < TRX_RSEG_N_SLOTS);
+
+ undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo)));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->top_undo_no = IB_ID_MAX;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+ ut_ad(undo->empty());
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+}
+
+/** Create an undo log.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] undo undo log object
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ulint id;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* block = trx_undo_seg_create(
+ rseg->space,
+ trx_rsegf_get(rseg->space, rseg->page_no, mtr), &id, err, mtr);
+
+ if (!block) {
+ return NULL;
+ }
+
+ rseg->curr_size++;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
+ block->page.id().page_no(), offset);
+ if (*undo == NULL) {
+ *err = DB_OUT_OF_MEMORY;
+ /* FIXME: this will not free the undo block to the file */
+ return NULL;
+ } else if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ (*undo)->table_id = trx->table_id;
+ (*undo)->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/** Reuse a cached undo log block.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] pundo the undo log memory object
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL if none cached */
+static
+buf_block_t*
+trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
+ mtr_t* mtr)
+{
+ ut_ad(mutex_own(&rseg->mutex));
+
+ trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ if (!undo) {
+ return NULL;
+ }
+
+ ut_ad(undo->size == 1);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id,
+ undo->hdr_page_no),
+ 0, RW_X_LATCH, mtr);
+ if (!block) {
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ *pundo = undo;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);
+
+ if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ return block;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ undo->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ return block;
+}
+
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+{
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (undo) {
+ return buf_page_get_gen(
+ page_id_t(undo->rseg->space->id, undo->last_page_no),
+ 0, RW_X_LATCH, undo->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+
+ mutex_enter(&rseg->mutex);
+ buf_block_t* block = trx_undo_reuse_cached(
+ trx, rseg, &trx->rsegs.m_redo.undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo,
+ err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
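+
+#if 0
+/* Minimal usage sketch (not compiled): obtaining an x-latched undo log page
+for a persistent transaction. The helper name is illustrative; the
+surrounding transaction setup and the writing of the undo record itself are
+assumed. */
+static void trx_undo_assign_example(trx_t *trx)
+{
+  mtr_t mtr;
+  dberr_t err= DB_SUCCESS;
+  mtr.start();
+  if (buf_block_t *undo_block= trx_undo_assign(trx, &err, &mtr))
+  {
+    /* undo_block is the last page of trx->rsegs.m_redo.undo, x-latched;
+    an undo log record could now be appended within this mini-transaction. */
+    ut_ad(err == DB_SUCCESS);
+  }
+  mtr.commit();
+}
+#endif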
+
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ const bool is_temp __attribute__((unused)) = rseg == trx->rsegs.m_noredo.rseg;
+
+ ut_ad(rseg == trx->rsegs.m_redo.rseg
+ || rseg == trx->rsegs.m_noredo.rseg);
+ ut_ad(undo == (is_temp
+ ? &trx->rsegs.m_noredo.undo
+ : &trx->rsegs.m_redo.undo));
+ ut_ad(mtr->get_log_mode()
+ == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+
+ if (*undo) {
+ return buf_page_get_gen(
+ page_id_t(rseg->space->id, (*undo)->last_page_no),
+ 0, RW_X_LATCH, (*undo)->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
+ );
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, undo, err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+buf_block_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ const uint16_t state = undo->size == 1
+ && TRX_UNDO_PAGE_REUSE_LIMIT
+ > mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame)
+ ? TRX_UNDO_CACHED
+ : TRX_UNDO_TO_PURGE;
+
+ undo->state = state;
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, state);
+ return block;
+}
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (rollback) {
+ ut_ad(undo->state == TRX_UNDO_PREPARED);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, TRX_UNDO_ACTIVE);
+ return;
+ }
+
+ /*------------------------------*/
+ ut_ad(undo->state == TRX_UNDO_ACTIVE);
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = *trx->xid;
+ /*------------------------------*/
+
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame,
+ undo->state);
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ mtr->write<1>(*block, block->frame + offset + TRX_UNDO_XID_EXISTS, 1U);
+
+ trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ mutex_enter(&rseg->mutex);
+
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+ /* First free the undo log segment in the file */
+ trx_undo_seg_free(undo);
+
+ ut_ad(rseg->curr_size > undo->size);
+ rseg->curr_size -= undo->size;
+
+ ut_free(undo);
+ }
+
+ mutex_exit(&rseg->mutex);
+}
+
+/** At shutdown, frees the undo logs of a transaction. */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+ if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+ switch (undo->state) {
+ case TRX_UNDO_PREPARED:
+ break;
+ case TRX_UNDO_CACHED:
+ case TRX_UNDO_TO_PURGE:
+ ut_ad(trx_state_eq(trx,
+ TRX_STATE_COMMITTED_IN_MEMORY));
+ /* fall through */
+ case TRX_UNDO_ACTIVE:
+ /* trx_t::commit_state() assigns
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+ ut_a(!srv_was_started
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || srv_fast_shutdown);
+ break;
+ default:
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+ if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+ ut_a(undo->state == TRX_UNDO_PREPARED);
+
+ UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+}