summaryrefslogtreecommitdiffstats
path: root/storage/innobase/trx
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/trx')
-rw-r--r--storage/innobase/trx/trx0i_s.cc1471
-rw-r--r--storage/innobase/trx/trx0purge.cc1480
-rw-r--r--storage/innobase/trx/trx0rec.cc2448
-rw-r--r--storage/innobase/trx/trx0roll.cc933
-rw-r--r--storage/innobase/trx/trx0rseg.cc727
-rw-r--r--storage/innobase/trx/trx0sys.cc370
-rw-r--r--storage/innobase/trx/trx0trx.cc2292
-rw-r--r--storage/innobase/trx/trx0undo.cc1478
8 files changed, 11199 insertions, 0 deletions
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000..2dc39118
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1471 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#include "trx0i_s.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "sql_class.h"
+
+/** Initial number of rows in the table cache, i.e. the number of rows
+in the first chunk allocated by table_cache_create_empty_row() */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then 39th chunk would accommodate 1677416425 rows (half of the
+3354832851 rows held by the 38 chunks before it), so all 39 chunks
+together would accommodate 5032249276 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table, duplicates may appear
+if this is enabled, also if this is enabled searching into the hash is
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim() and
+ha_storage_put_str_memlim(): what is left of TRX_I_S_MEM_LIMIT after
+subtracting the bytes already accounted in cache->mem_allocd.
+@param cache pointer to the trx_i_s_cache_t being filled
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row(): what is left of
+TRX_I_S_MEM_LIMIT after subtracting both cache->mem_allocd and the
+current size of cache->storage.
+@param cache pointer to the trx_i_s_cache_t being filled
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. A chunk whose base is NULL has not
+been allocated yet. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ /* i.e. the index, within the flat array of rows, of the first row
+ stored in this chunk */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+ /* NULL until table_cache_create_empty_row() allocates the chunk */
+};
+
+/** This represents one table's cache: a growable array of fixed-size
+rows stored in up to MEM_CHUNKS_IN_TABLE_CACHE chunks. Clearing the
+cache only resets rows_used; the chunks stay allocated for reuse. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ /* summed over all allocated chunks */
+ ulint row_size; /*!< size of a single row */
+ /* in bytes; set by table_cache_init() */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
+
+/** This structure describes the intermediate buffer: one table cache
+per INFORMATION SCHEMA table, a hash table for de-duplicating
+innodb_locks rows, and a string storage for copied volatile data. */
+struct trx_i_s_cache_t {
+ srw_lock rw_lock; /*!< read-write lock protecting this */
+ Atomic_relaxed<ulonglong> last_read;
+ /*!< last time the cache was read;
+ measured in nanoseconds */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys.latch */
+ ulint mem_allocd; /*!< the amount of memory, in bytes,
+ allocated for table cache chunks
+ in table_cache_create_empty_row() */
+ bool is_truncated; /*!< this is true if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. This object is the actual storage; it has
+internal linkage and is reached through trx_i_s_cache below. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** Externally visible pointer to the intermediate buffer
+(trx_i_s_cache_static) where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/** Determine the heap number to use for a waiting lock.
+@param lock   lock to examine
+@return the heap number reported by lock_rec_find_set_bit() for a
+record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
+{
+  if (lock->is_table())
+    return uint16_t{0xFFFF};
+
+  return static_cast<uint16_t>(lock_rec_find_set_bit(lock));
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. No memory is allocated here;
+chunks are allocated on demand in table_cache_create_empty_row().
+Every chunk descriptor is fully reset (offset, size and base), so the
+result does not depend on the cache object having been zero-filled
+beforehand: table_cache_create_empty_row() reads chunks[0].offset
+without ever assigning it. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (ulint i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+  i_s_mem_chunk_t& chunk = table_cache->chunks[i];
+
+  /* the first chunk starts at row 0; the offsets of the
+  following chunks are overwritten when their predecessor
+  is allocated */
+  chunk.offset = 0;
+  chunk.rows_allocd = 0;
+
+  /* the memory is actually allocated in
+  table_cache_create_empty_row() */
+  chunk.base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Releases every allocated chunk of a table cache and marks it as
+unallocated. Chunks that were never allocated are left untouched. */
+static
+void
+table_cache_free(
+/*=============*/
+  i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+  for (i_s_mem_chunk_t& chunk : table_cache->chunks) {
+
+    if (chunk.base == NULL) {
+      /* never allocated by
+      table_cache_create_empty_row() */
+      continue;
+    }
+
+    ut_free(chunk.base);
+    chunk.base = NULL;
+  }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+
+Chunk sizing: the first chunk holds TABLE_CACHE_INITIAL_ROWSNUM rows and
+every later chunk holds half of the rows allocated so far. A new chunk
+is refused when its size in bytes exceeds MAX_ALLOWED_FOR_ALLOC(cache).
+The bytes of every allocated chunk are added to cache->mem_allocd.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+ (rows_used == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+ /* Memory is increased by the formula
+ new = old + old / 2; We are trying not to be
+ aggressive here (= using the common new = old * 2)
+ because the allocated memory will not be freed
+ until InnoDB exit (it is reused). So it is better
+ to once allocate the memory in more steps, but
+ have less unused/wasted memory than to use less
+ steps in allocation (which is done once in a
+ lifetime) but end up with lots of unused/wasted
+ memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ /* refuse to grow beyond the memory limit; the caller sees
+ NULL and rows_used is left unchanged */
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ /* exactly req_bytes are requested from the allocator, so
+ got_bytes and got_rows always equal req_bytes and req_rows */
+ got_bytes = req_bytes;
+ chunk->base = ut_malloc_nokey(req_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu,"
+ " row size=%lu,"
+ " req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ /* NOTE(review): the offset of chunks[0] is read here but never
+ assigned in this function; presumably it is zero because the
+ cache object starts out zeroed - confirm */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* translate the flat row index rows_used into a row index
+ relative to the start of the chunk that was found */
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Debug-only consistency check of a row in the locks cache. All checks
+are debug assertions; the function itself always reports success so
+that it can be wrapped in ut_ad().
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+  const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+  /* fields that every row must have */
+  ut_ad(row->lock_mode);
+  ut_ad(row->lock_table != NULL);
+  ut_ad(row->lock_table_id != 0);
+
+  if (row->lock_index != NULL) {
+    /* record lock: nothing more to check, because
+    row->lock_data == NULL if buf_page_try_get() == NULL */
+    return(TRUE);
+  }
+
+  /* table lock: all record-related fields must be unset */
+  ut_ad(!row->lock_data);
+  ut_ad(row->lock_page == page_id_t(0, 0));
+  ut_ad(!row->lock_rec);
+
+  return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+The caller must hold lock_sys.latch (asserted below). The query text
+and the foreign key error text are copied into cache->storage because
+they may change or disappear later; copying either of them can fail
+when the memory limit is reached.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* s;
+
+ lock_sys.assert_locked();
+
+ const lock_t* wait_lock = trx->lock.wait_lock;
+
+ row->trx_id = trx->id;
+ row->trx_started = trx->start_time;
+ /* the state string is chosen by priority: rolling back, then
+ committing, then waiting for a lock, otherwise running */
+ if (trx->in_rollback) {
+ row->trx_state = "ROLLING BACK";
+ } else if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
+ row->trx_state = "COMMITTING";
+ } else if (wait_lock) {
+ row->trx_state = "LOCK WAIT";
+ } else {
+ row->trx_state = "RUNNING";
+ }
+
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ /* a requested lock row must be passed exactly when the
+ transaction is waiting */
+ ut_ad(!wait_lock == !requested_lock_row);
+
+ const my_hrtime_t suspend_time= trx->lock.suspend_time;
+ row->trx_wait_started = wait_lock ? hrtime_to_time(suspend_time) : 0;
+
+ row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions e.g., purge and transactions
+ being recovered at startup there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+ if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query,
+ sizeof query)) {
+ /* stmt_len + 1 bytes are stored, i.e. the statement text
+ plus one trailing byte */
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ /* note: trx_query_cs is assigned only on this path, i.e.
+ only when a non-empty statement was obtained */
+ row->trx_query_cs = thd_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+/* everything below does not need the session (mysql_thd) */
+thd_done:
+ row->trx_operation_state = trx->op_info;
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
+
+ /* These are protected by lock_sys.latch (which we are holding)
+ and sometimes also trx->mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = trx->lock.n_rec_locks;
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_isolation_level = trx->isolation_level;
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ /* copy the detailed error text only when it is non-empty;
+ a NULL result of the copy is treated as allocation failure */
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking();
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Formats the nth field of "rec" into "buf". For every field but the
+first, ", " is written before the field. Whenever buf_size is nonzero
+the result is NUL-terminated. If a separator is needed but fewer than
+3 bytes are available, only an empty string is written.
+@return the number of bytes written to "buf", including the
+terminating NUL; 0 if buf_size is 0 */
+static
+ulint
+put_nth_field(
+/*==========*/
+  char* buf, /*!< out: buffer */
+  ulint buf_size,/*!< in: buffer size in bytes */
+  ulint n, /*!< in: number of field */
+  const dict_index_t* index, /*!< in: index */
+  const rec_t* rec, /*!< in: record */
+  const rec_offs* offsets)/*!< in: record offsets, returned
+  by rec_get_offsets() */
+{
+  ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+  if (buf_size == 0) {
+    /* not even room for the terminating NUL */
+    return(0);
+  }
+
+  ulint separator_len = 0;
+
+  if (n > 0) {
+    /* a separator precedes every field except the first */
+    if (buf_size < 3) {
+      /* no room for ", " and a NUL: emit an empty string */
+      buf[0] = '\0';
+      return(1);
+    }
+
+    /* copies the two separator characters and a NUL; the NUL is
+    overwritten by the field data below */
+    memcpy(buf, ", ", 3);
+
+    separator_len = 2;
+    buf += separator_len;
+    buf_size -= separator_len;
+  }
+
+  /* at this point buf_size >= 1 */
+
+  ulint field_len;
+  const byte* field_data = rec_get_nth_field(rec, offsets, n,
+                                             &field_len);
+  dict_field_t* field_def = dict_index_get_nth_field(index, n);
+
+  return(separator_len
+         + row_raw_format(reinterpret_cast<const char*>(field_data),
+                          field_len, field_def, buf, buf_size));
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+For the infimum and supremum heap numbers a fixed descriptive string is
+stored. Otherwise the page of the record lock is looked up with
+buf_page_try_get(); if that returns NULL, *lock_data is set to NULL and
+TRUE is still returned. If the page is obtained, the first
+dict_index_get_n_unique(index) fields of the record are formatted and
+the text is copied into cache->storage.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ /* only record locks have lock data */
+ ut_a(!lock->is_table());
+
+ switch (heap_no) {
+ case PAGE_HEAP_NO_INFIMUM:
+ case PAGE_HEAP_NO_SUPREMUM:
+ /* pseudo-records: store a fixed description instead of
+ reading the page */
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage,
+ heap_no == PAGE_HEAP_NO_INFIMUM
+ ? "infimum pseudo-record"
+ : "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ return(*lock_data != NULL);
+ }
+
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+ ulint n_fields;
+ mem_heap_t* heap;
+ rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
+
+ if (block == NULL) {
+
+ /* the page could not be obtained: this is not an error,
+ the row simply carries no lock data */
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = reinterpret_cast<const page_t*>(buf_block_get_frame(block));
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ const dict_index_t* index = lock->index;
+ ut_ad(index->is_primary() || !dict_index_is_online_ddl(index));
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ n_fields, &heap);
+
+ /* format and store the data */
+
+ /* put_nth_field() counts the terminating NUL in its return value;
+ subtracting 1 makes the next field overwrite that NUL, so buf_used
+ is the length of the text formatted so far, without the NUL */
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ /* buf_used + 1: store the text together with its NUL */
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (heap != NULL) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/** Look up the table that a lock belongs to.
+@param lock   table lock or record lock
+@return the locked table for a table lock, or the table of the
+locked index for a record lock */
+static const dict_table_t *lock_get_table(const lock_t &lock)
+{
+  if (!lock.is_table())
+  {
+    ut_ad(lock.index->is_primary() || !dict_index_is_online_ddl(lock.index));
+    return lock.index->table;
+  }
+
+  return lock.un_member.tab_lock.table;
+}
+
+/*******************************************************************//**
+Fills an i_s_locks_row_t object from a lock. The table name, the index
+name and the lock data are copied into the cache storage; any of these
+copies can fail when the memory limit is reached.
+@return false if allocation fails */
+static bool fill_locks_row(
+  i_s_locks_row_t* row, /*!< out: result object that's filled */
+  const lock_t* lock, /*!< in: lock to get data from */
+  uint16_t heap_no,/*!< in: lock's record number; not used
+  for a table lock, for which the
+  callers in this file pass 0xFFFF */
+  trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+  volatile strings */
+{
+  row->lock_trx_id = lock->trx->id;
+
+  const bool gap = lock->is_gap();
+  ut_ad(!gap || !lock->is_table());
+
+  /* numeric lock mode: S=1, X=3, IS=5, IX=7, each incremented by
+  one for a gap lock; AUTO_INC=9; 0 for anything unexpected */
+  switch (lock->mode()) {
+  case LOCK_S:
+    row->lock_mode = static_cast<uint8_t>(gap ? 2 : 1);
+    break;
+  case LOCK_X:
+    row->lock_mode = static_cast<uint8_t>(gap ? 4 : 3);
+    break;
+  case LOCK_IS:
+    row->lock_mode = static_cast<uint8_t>(gap ? 6 : 5);
+    break;
+  case LOCK_IX:
+    row->lock_mode = static_cast<uint8_t>(gap ? 8 : 7);
+    break;
+  case LOCK_AUTO_INC:
+    row->lock_mode = 9;
+    break;
+  default:
+    ut_ad("unknown lock mode" == 0);
+    row->lock_mode = 0;
+  }
+
+  const dict_table_t* const locked_table = lock_get_table(*lock);
+
+  row->lock_table = ha_storage_put_str_memlim(
+    cache->storage, locked_table->name.m_name,
+    MAX_ALLOWED_FOR_STORAGE(cache));
+
+  if (!row->lock_table) {
+    /* memory could not be allocated */
+    return false;
+  }
+
+  if (lock->is_table()) {
+    /* a table lock has no index, page, record or data */
+    row->lock_index = NULL;
+    row->lock_page = page_id_t(0, 0);
+    row->lock_rec = 0;
+    row->lock_data = NULL;
+  } else {
+    row->lock_index = ha_storage_put_str_memlim(
+      cache->storage, lock->index->name,
+      MAX_ALLOWED_FOR_STORAGE(cache));
+
+    if (!row->lock_index) {
+      /* memory could not be allocated */
+      return false;
+    }
+
+    row->lock_page = lock->un_member.rec_lock.page_id;
+    row->lock_rec = heap_no;
+
+    if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+      /* memory could not be allocated */
+      return false;
+    }
+  }
+
+  row->lock_table_id = locked_table->id;
+
+  /* let the embedded hash chain node point back at its row */
+  row->hash_chain.value = row;
+  ut_ad(i_s_locks_row_validate(row));
+
+  return true;
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object by linking it to the requested and
+the blocking rows of innodb_locks. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+  i_s_lock_waits_row_t* row, /*!< out: result object
+  that's filled */
+  const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+  relevant requested lock
+  row in innodb_locks */
+  const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+  relevant blocking lock
+  row in innodb_locks */
+{
+  /* both referenced rows must already be completely filled */
+  ut_ad(i_s_locks_row_validate(requested_lock_row));
+  ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+  row->blocking_lock_row = blocking_lock_row;
+  row->requested_lock_row = requested_lock_row;
+
+  return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold combines
+the transaction id, the fold of the page id and the record number,
+which together identify a lock at a given point in time. For a table
+lock the fold is the table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+  const lock_t* lock, /*!< in: lock object to fold */
+  ulint heap_no)/*!< in: lock's record number
+  or 0xFFFF if the lock
+  is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+  static ulint fold = 0;
+
+  return(fold++);
+#else
+  if (lock->is_table()) {
+    /* this check is actually not necessary for continuing
+    correct operation, but something must have gone wrong if
+    it fails. */
+    ut_a(heap_no == 0xFFFF);
+
+    return((ulint) lock_get_table(*lock)->id);
+  }
+
+  ut_a(heap_no != 0xFFFF);
+
+  const ulint trx_and_page = ut_fold_ulint_pair(
+    (ulint) lock->trx->id,
+    lock->un_member.rec_lock.page_id.fold());
+
+  return(ut_fold_ulint_pair(trx_and_page, heap_no));
+#endif
+}
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+Record locks are compared by transaction id, page id and record
+number; table locks by transaction id and table id.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+  const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+  const lock_t* lock, /*!< in: lock object */
+  ulint heap_no)/*!< in: lock's record number
+  or 0xFFFF if the lock
+  is a table lock */
+{
+  ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+  return(0);
+#else
+  if (lock->is_table()) {
+    /* this check is actually not necessary for continuing
+    correct operation, but something must have gone wrong if
+    it fails. */
+    ut_a(heap_no == 0xFFFF);
+
+    return(row->lock_trx_id == lock->trx->id
+           && row->lock_table_id == lock_get_table(*lock)->id);
+  }
+
+  ut_a(heap_no != 0xFFFF);
+
+  return(row->lock_trx_id == lock->trx->id
+         && row->lock_page == lock->un_member.rec_lock.page_id
+         && row->lock_rec == heap_no);
+#endif
+}
+
+/*******************************************************************//**
+Looks up the innodb_locks cache row that represents the given lock,
+using the locks hash table with the fold computed by fold_lock() and
+locks_row_eq_lock() as the match test.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+  trx_i_s_cache_t* cache, /*!< in: cache */
+  const lock_t* lock, /*!< in: lock to search for */
+  uint16_t heap_no)/*!< in: lock's record number
+  or 0xFFFF if the lock
+  is a table lock */
+{
+  i_s_hash_chain_t* hash_chain;
+
+  /* arguments, in order: name of the "next" member, hash table,
+  fold, type of the chain node, iteration variable, assertion run
+  on every visited node, match condition */
+  HASH_SEARCH(
+    next,
+    &cache->locks_hash,
+    fold_lock(lock, heap_no),
+    i_s_hash_chain_t*,
+    hash_chain,
+    ut_ad(i_s_locks_row_validate(hash_chain->value)),
+    locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+  if (hash_chain != NULL) {
+    /* found: hand out the row that owns the chain node */
+    return(hash_chain->value);
+  }
+
+  return(NULL);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+The TEST_* macros compiled into this function are testing aids only:
+they disable the duplicate search, disable the hash insertion, or
+repeat the addition many times.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ /* give back the row that was just taken from the table
+ cache, so that the half-filled row is not counted */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Appends one (requested lock, blocking lock) pair to the lock waits
+cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+  trx_i_s_cache_t* cache, /*!< in/out: cache */
+  const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+  relevant requested lock
+  row in innodb_locks */
+  const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+  relevant blocking lock
+  row in innodb_locks */
+{
+  void* const slot = table_cache_create_empty_row(
+    &cache->innodb_lock_waits, cache);
+
+  if (slot == NULL) {
+    /* memory could not be allocated */
+    return(FALSE);
+  }
+
+  fill_lock_waits_row(static_cast<i_s_lock_waits_row_t*>(slot),
+                      requested_lock_row, blocking_lock_row);
+
+  return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+A transaction that is not waiting contributes nothing and
+requested_lock_row is set to NULL. For a waiting transaction the wait
+lock is added to innodb_locks and returned in requested_lock_row; then
+every lock queued before it that it has to wait for is added to
+innodb_locks, together with a matching innodb_lock_waits row.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+  trx_i_s_cache_t* cache, /*!< in/out: cache */
+  const trx_t* trx, /*!< in: transaction */
+  i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+  requested lock row, or NULL or
+  undefined */
+{
+  lock_sys.assert_locked();
+
+  const lock_t* const wait_lock = trx->lock.wait_lock;
+
+  if (wait_lock == NULL) {
+    /* not waiting: there is no requested lock */
+    *requested_lock_row = NULL;
+    return(TRUE);
+  }
+
+  /* heap_no is the same for the wait lock and the locks it waits
+  for */
+  const uint16_t heap_no = wait_lock_get_heap_no(wait_lock);
+
+  /* add the requested lock */
+  *requested_lock_row = add_lock_to_cache(cache, wait_lock, heap_no);
+
+  if (*requested_lock_row == NULL) {
+    /* memory could not be allocated */
+    return(FALSE);
+  }
+
+  /* walk the lock queue backwards from the wait lock and record
+  every lock that is blocking it */
+  lock_queue_iterator_t iter;
+  lock_queue_iterator_reset(&iter, wait_lock, ULINT_UNDEFINED);
+
+  while (const lock_t* prev_lock = lock_queue_iterator_get_prev(&iter)) {
+
+    if (!lock_has_to_wait(wait_lock, prev_lock)) {
+      continue;
+    }
+
+    i_s_locks_row_t* const blocking_lock_row
+      = add_lock_to_cache(cache, prev_lock, heap_no);
+
+    if (blocking_lock_row == NULL) {
+      /* memory could not be allocated */
+      return(FALSE);
+    }
+
+    /* add the relation between both locks to
+    innodb_lock_waits */
+    if (!add_lock_wait_to_cache(cache, *requested_lock_row,
+                                blocking_lock_row)) {
+      /* memory could not be allocated */
+      return(FALSE);
+    }
+  }
+
+  return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in nanoseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */
+
+/*******************************************************************//**
+Checks if the cache can safely be updated, i.e. whether more than
+CACHE_MIN_IDLE_TIME_NS nanoseconds have passed since it was last read.
+@return whether the cache can be updated */
+static bool can_cache_be_updated(trx_i_s_cache_t* cache)
+{
+  /* cache->last_read is only updated when a shared rw lock on the
+  whole cache is being held (see trx_i_s_cache_end_read()) and
+  we are currently holding an exclusive rw lock on the cache.
+  So it is not possible for last_read to be updated while we are
+  reading it. */
+  const auto idle_ns = my_interval_timer() - cache->last_read;
+
+  return idle_ns > CACHE_MIN_IDLE_TIME_NS;
+}
+
+/*******************************************************************//**
+Mark a cache as empty so that it can be filled again. Only the used-row
+counters, the locks hash table and the string storage are reset; the
+remaining resources are kept because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+  trx_i_s_cache_t*  cache)  /*!< out: cache to clear */
+{
+  /* the three per-table row caches */
+  i_s_table_cache_t* const tables[] = {
+    &cache->innodb_trx,
+    &cache->innodb_locks,
+    &cache->innodb_lock_waits
+  };
+
+  for (i_s_table_cache_t* table_cache : tables) {
+    table_cache->rows_used = 0;
+  }
+
+  cache->locks_hash.clear();
+
+  ha_storage_empty(&cache->storage);
+}
+
+
+/**
+  Capture one transaction into innodb_trx's cache.
+
+  The locks that are relevant to the transaction are added to
+  innodb_locks' and innodb_lock_waits' caches first; then a new
+  innodb_trx row is created and filled in. If any of these steps fails,
+  the cache is flagged as truncated.
+*/
+
+static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
+{
+#ifdef UNIV_DEBUG
+  {
+    const auto state= trx->state;
+
+    if (!trx->is_autocommit_non_locking())
+    {
+      ut_ad(state == TRX_STATE_ACTIVE ||
+            state == TRX_STATE_PREPARED ||
+            state == TRX_STATE_PREPARED_RECOVERED ||
+            state == TRX_STATE_COMMITTED_IN_MEMORY);
+    }
+    else
+    {
+      ut_ad(trx->read_only);
+      ut_ad(!trx->is_recovered);
+      ut_ad(trx->mysql_thd);
+      ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE);
+    }
+  }
+#endif /* UNIV_DEBUG */
+
+  i_s_locks_row_t *requested_lock_row;
+  i_s_trx_row_t *trx_row= nullptr;
+
+  /* A row for the transaction is only requested once its locks
+  have been cached successfully. */
+  if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row))
+    trx_row= reinterpret_cast<i_s_trx_row_t*>(
+      table_cache_create_empty_row(&cache->innodb_trx, cache));
+
+  if (trx_row)
+  {
+    if (fill_trx_row(trx_row, trx, requested_lock_row, cache))
+      return;
+    /* The row could not be filled in; give it back. */
+    --cache->innodb_trx.rows_used;
+  }
+
+  /* memory could not be allocated */
+  cache->is_truncated= true;
+}
+
+
+/**
+  Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+  table cache buffer. Cache must be locked for write.
+
+  The cache is cleared first. Then every transaction in trx_sys.trx_list
+  that has been started, other than the purge transaction
+  (purge_sys.query->trx), is captured while holding its trx mutex.
+
+  On return, cache->is_truncated tells whether capturing stopped early
+  because fetch_data_into_cache_low() could not store a transaction.
+*/
+
+static void fetch_data_into_cache(trx_i_s_cache_t *cache)
+{
+  LockMutexGuard g{SRW_LOCK_CALL};
+  trx_i_s_cache_clear(cache);
+
+  /* Reset the flag BEFORE the scan. Resetting it after the scan would
+  discard the truncation that fetch_data_into_cache_low() reports, and
+  trx_i_s_cache_is_truncated() could never return true. */
+  cache->is_truncated= false;
+
+  /* Capture the state of transactions */
+  trx_sys.trx_list.for_each([cache](trx_t &trx) {
+    /* Once the cache is truncated, the remaining transactions are skipped. */
+    if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+        &trx != (purge_sys.query ? purge_sys.query->trx : nullptr))
+    {
+      trx.mutex_lock();
+      /* Re-check the state now that the trx mutex is held. */
+      if (trx.state != TRX_STATE_NOT_STARTED)
+        fetch_data_into_cache_low(cache, &trx);
+      trx.mutex_unlock();
+    }
+  });
+}
+
+
+/*******************************************************************//**
+Refresh the transactions cache, unless it has been read too recently
+(see can_cache_be_updated()).
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+  trx_i_s_cache_t*  cache)  /*!< in/out: cache */
+{
+  if (can_cache_be_updated(cache)) {
+
+    /* We need to read trx_sys and record/table lock queues */
+    fetch_data_into_cache(cache);
+
+    /* restart the idle-time measurement from now */
+    cache->last_read = my_interval_timer();
+
+    return(0);
+  }
+
+  return(1);
+}
+
+/*******************************************************************//**
+Reports the value of the cache's is_truncated flag, i.e. whether the
+data in the cache is truncated due to the memory limit posed by
+TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+  trx_i_s_cache_t*  cache)  /*!< in: cache */
+{
+  const bool truncated = cache->is_truncated;
+
+  return truncated;
+}
+
+/*******************************************************************//**
+Set up the INFORMATION SCHEMA trx related cache: its rw-lock, the three
+per-table row caches, the locks hash table and the string storage. */
+void
+trx_i_s_cache_init(
+/*===============*/
+  trx_i_s_cache_t*  cache)  /*!< out: cache to init */
+{
+  /* Latching order used with this cache:
+  1. acquire trx_i_s_cache_t::rw_lock, exclusive (wr_lock)
+  2. acquire exclusive lock_sys.latch
+  3. release exclusive lock_sys.latch
+  4. release trx_i_s_cache_t::rw_lock
+  5. acquire trx_i_s_cache_t::rw_lock, shared (rd_lock)
+  6. release trx_i_s_cache_t::rw_lock */
+
+  cache->rw_lock.SRW_LOCK_INIT(trx_i_s_cache_lock_key);
+
+  /* one row cache per INFORMATION SCHEMA table */
+  table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+  table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+  table_cache_init(&cache->innodb_lock_waits,
+                   sizeof(i_s_lock_waits_row_t));
+
+  cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
+
+  cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+                                     CACHE_STORAGE_HASH_CELLS);
+
+  /* plain bookkeeping fields */
+  cache->last_read = 0;
+  cache->mem_allocd = 0;
+  cache->is_truncated = false;
+}
+
+/*******************************************************************//**
+Release what trx_i_s_cache_init() set up: the rw-lock, the locks hash
+table, the string storage and the three per-table row caches. */
+void
+trx_i_s_cache_free(
+/*===============*/
+  trx_i_s_cache_t*  cache)  /*!< in, own: cache to free */
+{
+  cache->rw_lock.destroy();
+
+  /* lookup structures */
+  cache->locks_hash.free();
+  ha_storage_free(cache->storage);
+
+  /* row caches of the individual INFORMATION SCHEMA tables */
+  table_cache_free(&cache->innodb_trx);
+  table_cache_free(&cache->innodb_locks);
+  table_cache_free(&cache->innodb_lock_waits);
+}
+
+/*******************************************************************//**
+Acquire the cache's rw-lock in shared (read) mode.
+Pair with trx_i_s_cache_end_read(). */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+  trx_i_s_cache_t*  cache)  /*!< in: cache */
+{
+  auto& cache_latch = cache->rw_lock;
+
+  cache_latch.rd_lock(SRW_LOCK_CALL);
+}
+
+/*******************************************************************//**
+Record the time of this read in cache->last_read and then release the
+shared (read) lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+  trx_i_s_cache_t*  cache)  /*!< in: cache */
+{
+  /* The timestamp is stored while the shared lock is still held. */
+  const auto now = my_interval_timer();
+  cache->last_read = now;
+
+  cache->rw_lock.rd_unlock();
+}
+
+/*******************************************************************//**
+Acquire the cache's rw-lock in exclusive (write) mode.
+Pair with trx_i_s_cache_end_write(). */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+  trx_i_s_cache_t*  cache)  /*!< in: cache */
+{
+  auto& cache_latch = cache->rw_lock;
+
+  cache_latch.wr_lock(SRW_LOCK_CALL);
+}
+
+/*******************************************************************//**
+Release the exclusive (write) lock that was acquired on the tables cache
+by trx_i_s_cache_start_write(). */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+  trx_i_s_cache_t*  cache)  /*!< in: cache */
+{
+  auto& cache_latch = cache->rw_lock;
+
+  cache_latch.wr_unlock();
+}
+
+/*******************************************************************//**
+Maps an INFORMATION SCHEMA table identifier to the corresponding table
+cache inside the whole cache. An identifier that matches none of the
+three known tables ends in ut_error.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+  trx_i_s_cache_t*  cache,  /*!< in: whole cache */
+  enum i_s_table    table)  /*!< in: which table */
+{
+  if (table == I_S_INNODB_TRX) {
+    return &cache->innodb_trx;
+  }
+
+  if (table == I_S_INNODB_LOCKS) {
+    return &cache->innodb_locks;
+  }
+
+  if (table == I_S_INNODB_LOCK_WAITS) {
+    return &cache->innodb_lock_waits;
+  }
+
+  /* unknown table identifier */
+  ut_error;
+  return NULL;
+}
+
+/*******************************************************************//**
+Looks up the table cache of the given INFORMATION SCHEMA table and
+returns how many of its rows are in use.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+  trx_i_s_cache_t*  cache,  /*!< in: cache */
+  enum i_s_table    table)  /*!< in: which table */
+{
+  return(cache_select_table(cache, table)->rows_used);
+}
+
+/*******************************************************************//**
+Finds the nth row (zero-based) of the given INFORMATION SCHEMA table in
+the cache. The row must be in use (n < rows_used), and a memory chunk
+covering it must exist; both conditions are asserted.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+  trx_i_s_cache_t*  cache,  /*!< in: cache */
+  enum i_s_table    table,  /*!< in: which table */
+  ulint             n)      /*!< in: row number */
+{
+  i_s_table_cache_t* const table_cache = cache_select_table(cache, table);
+
+  ut_a(n < table_cache->rows_used);
+
+  void* row = NULL;
+
+  /* Walk the chunks in order and stop at the first one whose
+  end (offset + rows_allocd) lies beyond row n. */
+  for (ulint chunk_no = 0; chunk_no < MEM_CHUNKS_IN_TABLE_CACHE;
+       chunk_no++) {
+
+    const auto& chunk = table_cache->chunks[chunk_no];
+
+    if (chunk.offset + chunk.rows_allocd <= n) {
+      continue;
+    }
+
+    /* index of the row within this chunk, scaled to bytes */
+    row = (char*) chunk.base
+      + (n - chunk.offset) * table_cache->row_size;
+    break;
+  }
+
+  ut_a(row != NULL);
+
+  return(row);
+}
+
+/*******************************************************************//**
+Formats the lock id of an innodb_locks row into the caller's buffer and
+returns that buffer. A row with a non-NULL lock_index is formatted as a
+record lock (trx id, space, page number, record); otherwise as a table
+lock (trx id, table id). The function asserts that the formatted string
+fits into lock_id; provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes
+to be sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+  const i_s_locks_row_t*  row,     /*!< in: innodb_locks row */
+  char*                   lock_id, /*!< out: resulting lock_id */
+  ulint                   lock_id_size)/*!< in: size of the lock id
+                                   buffer */
+{
+  /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+  const int res_len = row->lock_index
+    /* record lock */
+    ? snprintf(lock_id, lock_id_size,
+               TRX_ID_FMT
+               ":%u:%u:%u",
+               row->lock_trx_id, row->lock_page.space(),
+               row->lock_page.page_no(), row->lock_rec)
+    /* table lock */
+    : snprintf(lock_id, lock_id_size,
+               TRX_ID_FMT":" UINT64PF,
+               row->lock_trx_id,
+               row->lock_table_id);
+
+  /* res_len is asserted to be non-negative first, which makes the
+  cast to ulint in the size check below safe */
+  ut_a(res_len >= 0);
+  ut_a((ulint) res_len < lock_id_size);
+
+  return(lock_id);
+}
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000..1f31ceda
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1480 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "dict0load.h"
+#include <mysql/service_thd_mdl.h>
+#include <mysql/service_wsrep.h>
+
+/** Maximum allowable purge history length. The variable is unsigned,
+so in practice only 0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+purge_sys_t purge_sys;
+
+#ifdef UNIV_DEBUG
+/** Debug-build-only flag.
+NOTE(review): presumably makes purge only update its view without
+purging anything; confirm against the code that reads it. */
+my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/** Sentinel value: a default-constructed TrxUndoRsegs that
+TrxUndoRsegsIterator assigns to m_rsegs when it is constructed and when
+set_next() finds the purge queue empty. */
+static const TrxUndoRsegs NullElement;
+
+/** Default constructor.
+Starts with m_rsegs being a copy of the NullElement sentinel and m_iter
+positioned at the beginning of m_rsegs. */
+TrxUndoRsegsIterator::TrxUndoRsegsIterator()
+ : m_rsegs(NullElement), m_iter(m_rsegs.begin())
+{
+}
+
+/** Sets the next rseg to purge in purge_sys.
+Executed in the purge coordinator thread.
+The next rollback segment is taken from the element that is currently
+being iterated (m_rsegs) or, once that is exhausted, from the top of
+purge_sys.purge_queue. On success purge_sys.rseg, purge_sys.hdr_page_no,
+purge_sys.hdr_offset and purge_sys.tail.trx_no are loaded from it.
+When the queue is empty, purge_sys.rseg is set to NULL and the iterator
+is reset to the NullElement sentinel.
+@retval false when nothing is to be purged
+@retval true when purge_sys.rseg->latch was locked */
+inline bool TrxUndoRsegsIterator::set_next()
+{
+ ut_ad(!purge_sys.next_stored);
+ /* pq_mutex protects purge_sys.purge_queue while we consume from it */
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+
+ /* Only purge consumes events from the priority queue, user
+ threads only produce the events. */
+
+ /* Check if there are more rsegs to process in the
+ current element. */
+ if (m_iter != m_rsegs.end()) {
+ /* We are still processing rollback segment from
+ the same transaction and so expected transaction
+ number shouldn't increase. Undo the increment of
+ expected commit done by caller assuming rollback
+ segments from given transaction are done. */
+ purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
+ } else if (!purge_sys.purge_queue.empty()) {
+ /* The current element is exhausted: continue with a copy
+ of the top element of the queue, which is removed from the
+ queue. */
+ m_rsegs = purge_sys.purge_queue.top();
+ purge_sys.purge_queue.pop();
+ ut_ad(purge_sys.purge_queue.empty()
+ || purge_sys.purge_queue.top() != m_rsegs);
+ m_iter = m_rsegs.begin();
+ } else {
+ /* Queue is empty, reset iterator. */
+ purge_sys.rseg = NULL;
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ m_rsegs = NullElement;
+ m_iter = m_rsegs.begin();
+ return false;
+ }
+
+ /* Select the rollback segment and advance the iterator while
+ pq_mutex is still held. */
+ purge_sys.rseg = *m_iter++;
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+
+ /* We assume in purge of externally stored fields that space
+ id is in the range of UNDO tablespace space ids */
+ ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(purge_sys.rseg->space->id));
+
+ /* This latch is not released in this function; it is still held
+ when true is returned (see @retval true). */
+ purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ trx_id_t last_trx_no = purge_sys.rseg->last_trx_no();
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ /* Only the purge_coordinator_task will access this object
+ purge_sys.rseg_iter, or any of purge_sys.hdr_page_no,
+ purge_sys.tail.
+ The field purge_sys.head and purge_sys.view are modified by
+ purge_sys_t::clone_end_view()
+ in the purge_coordinator_task
+ while holding exclusive purge_sys.latch.
+ The purge_sys.view may also be modified by
+ purge_sys_t::wake_if_not_active() while holding exclusive
+ purge_sys.latch.
+ The purge_sys.head may be read by
+ purge_truncation_callback(). */
+ ut_ad(last_trx_no == m_rsegs.trx_no);
+ ut_a(purge_sys.hdr_page_no != FIL_NULL);
+ /* purge_sys.tail.trx_no must never move backwards */
+ ut_a(purge_sys.tail.trx_no <= last_trx_no);
+ purge_sys.tail.trx_no = last_trx_no;
+
+ return(true);
+}
+
+/** Build a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+The graph is a query fork that owns a dedicated transaction (put into
+TRX_STATE_ACTIVE without an id) and innodb_purge_threads_MAX query
+threads, each of which has a purge_node_t as its child. All of the graph
+is allocated from one memory heap.
+@return own: the query graph */
+static
+que_t*
+purge_graph_build()
+{
+  ut_a(srv_n_purge_threads > 0);
+
+  /* the transaction that the purge graph runs in */
+  trx_t* const purge_trx = trx_create();
+  ut_ad(!purge_trx->id);
+  purge_trx->start_time = time(NULL);
+  purge_trx->start_time_micro = microsecond_interval_timer();
+  purge_trx->state = TRX_STATE_ACTIVE;
+  purge_trx->op_info = "purge trx";
+
+  mem_heap_t* const graph_heap = mem_heap_create(512);
+  que_fork_t* const graph = que_fork_create(graph_heap);
+  graph->trx = purge_trx;
+
+  /* one query thread with its own purge node per possible
+  purge thread */
+  auto remaining = innodb_purge_threads_MAX;
+
+  while (remaining) {
+    que_thr_t* const thr = que_thr_create(graph, graph_heap, NULL);
+    void* const node_mem = mem_heap_alloc(graph_heap,
+                                          sizeof(purge_node_t));
+    thr->child = new(node_mem) purge_node_t(thr);
+    remaining--;
+  }
+
+  return(graph);
+}
+
+/** Initialise the purge system.
+Builds the purge query graph, resets the purge position fields and
+creates the latches and the priority queue mutex. Must only be invoked
+on the global purge_sys while it is neither initialized nor enabled. */
+void purge_sys_t::create()
+{
+  ut_ad(this == &purge_sys);
+  ut_ad(!m_initialized);
+  ut_ad(!enabled());
+
+  m_paused= 0;
+  query= purge_graph_build();
+
+  /* nothing has been fetched for purging yet */
+  next_stored= false;
+  rseg= nullptr;
+  page_no= 0;
+  offset= 0;
+  hdr_page_no= 0;
+  hdr_offset= 0;
+
+  /* synchronization objects */
+  latch.SRW_LOCK_INIT(trx_purge_latch_key);
+  end_latch.init();
+  mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr);
+
+  /* no undo tablespace truncation is in progress */
+  truncate.current= nullptr;
+  truncate.last= nullptr;
+
+  m_initialized= true;
+}
+
+/** Close the purge subsystem on shutdown.
+Does nothing if create() has not been called. Otherwise frees the purge
+query graph and its transaction and destroys the latches and the
+priority queue mutex. */
+void purge_sys_t::close()
+{
+  ut_ad(this == &purge_sys);
+
+  if (m_initialized)
+  {
+    ut_ad(!enabled());
+
+    /* Remember the transaction before the graph that refers to it
+    is freed. */
+    trx_t *const purge_trx= query->trx;
+    que_graph_free(query);
+    ut_ad(!purge_trx->id);
+    ut_ad(purge_trx->state == TRX_STATE_ACTIVE);
+    purge_trx->state= TRX_STATE_NOT_STARTED;
+    purge_trx->free();
+
+    latch.destroy();
+    end_latch.destroy();
+    mysql_mutex_destroy(&pq_mutex);
+
+    m_initialized= false;
+  }
+}
+
+/** Determine if the history of a transaction is purgeable.
+The answer is view.changes_visible(trx_id), evaluated either inside a
+memory transaction (when lock elision is compiled in) or while holding
+a shared purge_sys.latch.
+@param trx_id transaction identifier
+@return whether the history is purgeable */
+TRANSACTIONAL_TARGET bool purge_sys_t::is_purgeable(trx_id_t trx_id) const
+{
+ bool purgeable;
+#if !defined SUX_LOCK_GENERIC && !defined NO_ELISION
+ /* Lock elision path: try to read the view without acquiring the latch. */
+ purgeable= false;
+ if (xbegin())
+ {
+ if (!latch.is_write_locked())
+ {
+ purgeable= view.changes_visible(trx_id);
+ xend();
+ }
+ else
+ /* A writer holds the latch.
+ NOTE(review): presumably xabort() rolls the memory transaction back so
+ that execution resumes as if xbegin() had returned false and the locking
+ path below is taken; confirm against the xbegin()/xabort() definitions. */
+ xabort();
+ }
+ else
+#endif
+ /* Locking path. With elision compiled in, this block is the else branch
+ of the if (xbegin()) above; otherwise it always runs. */
+ {
+ latch.rd_lock(SRW_LOCK_CALL);
+ purgeable= view.changes_visible(trx_id);
+ latch.rd_unlock();
+ }
+ return purgeable;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+
+An undo log that occupies a single page with enough free space is kept in
+rseg->undo_cached for reuse; otherwise its rseg slot is cleared, the
+in-memory undo object is freed, and the segment is left for purge
+(TRX_UNDO_TO_PURGE). In both cases the caller's undo pointer is reset to
+nullptr. This function is invoked during transaction commit and cannot
+return an error; it asserts (in all builds) when the rollback segment
+header or the undo header page cannot be accessed.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
+{
+ DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
+ trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+ ut_ad(undo == trx->rsegs.m_redo.undo);
+ trx_rseg_t *rseg= trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+ buf_block_t *rseg_header= rseg->get(mtr, nullptr);
+ /* We are in transaction commit; we cannot return an error. If the
+ database is corrupted, it is better to crash it than to
+ intentionally violate ACID by committing something that is known to
+ be corrupted. The check must be active in all builds (ut_a, like the
+ one for undo_page below): rseg_header is dereferenced further down, and
+ a debug-only assertion would leave a null pointer dereference in
+ non-debug builds. */
+ ut_a(rseg_header);
+ buf_block_t *undo_page=
+ buf_page_get(page_id_t(rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ /* This function is invoked during transaction commit, which is not
+ allowed to fail. If we get a corrupted undo header, we will crash here. */
+ ut_a(undo_page);
+ trx_ulogf_t *undo_header= undo_page->page.frame + undo->hdr_offset;
+
+ ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
+ ut_ad(rseg->needs_purge > trx->id);
+ ut_ad(rseg->last_page_no != FIL_NULL);
+
+ rseg->history_size++;
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_header->page.frame)))
+ /* This database must have been upgraded from before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+
+ uint16_t undo_state;
+
+ /* A single-page undo log whose page still has enough free space
+ (below TRX_UNDO_PAGE_REUSE_LIMIT) is cached for reuse. */
+ if (undo->size == 1 &&
+ TRX_UNDO_PAGE_REUSE_LIMIT >
+ mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_page->page.frame))
+ {
+ undo->state= undo_state= TRX_UNDO_CACHED;
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ }
+ else
+ {
+ ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST +
+ undo_page->page.frame));
+ /* The undo log segment will not be reused */
+ static_assert(FIL_NULL == 0xffffffff, "");
+ /* Writing 4 bytes of 0xff stores FIL_NULL into the rseg slot. */
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+ uint32_t hist_size= mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG +
+ rseg_header->page.frame);
+ mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rseg_header->page.frame, hist_size + undo->size);
+ mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ rseg_header->page.frame, trx_sys.get_max_trx_id());
+ ut_free(undo);
+ undo_state= TRX_UNDO_TO_PURGE;
+ }
+
+ /* The caller's pointer is cleared in both cases: the object is now
+ either owned by rseg->undo_cached or has been freed. */
+ undo= nullptr;
+
+ /*
+ Before any transaction-generating background threads or the purge
+ have been started, we can start transactions in
+ row_merge_drop_temp_indexes(), and roll back recovered transactions.
+
+ Arbitrary user transactions may be executed when all the undo log
+ related background processes (including purge) are disabled due to
+ innodb_force_recovery=2 or innodb_force_recovery=3. DROP TABLE may
+ be executed at any innodb_force_recovery level.
+
+ During fast shutdown, we may also continue to execute user
+ transactions. */
+ ut_ad(srv_undo_sources || srv_fast_shutdown ||
+ (!purge_sys.enabled() &&
+ (srv_is_being_started ||
+ srv_force_recovery >= SRV_FORCE_NO_BACKGROUND)));
+
+#ifdef WITH_WSREP
+ if (wsrep_is_wsrep_xid(&trx->xid))
+ trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr);
+#endif
+
+ if (trx->mysql_log_file_name && *trx->mysql_log_file_name)
+ /* Update the latest binlog name and offset if log_bin=ON or this
+ is a replica. */
+ trx_rseg_update_binlog_offset(rseg_header, trx->mysql_log_file_name,
+ trx->mysql_log_offset, mtr);
+
+ /* Add the log as the first in the history list */
+
+ /* We are in transaction commit; we cannot return an error
+ when detecting corruption. It is better to crash the server
+ than to intentionally violate ACID by committing something
+ that is known to be corrupted. */
+ ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ uint16_t(page_offset(undo_header) +
+ TRX_UNDO_HISTORY_NODE), mtr) == DB_SUCCESS);
+
+ /* Finally record the segment state, the transaction's serialisation
+ number and the needs-purge flag in the undo page. */
+ mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE +
+ undo_page->page.frame, undo_state);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, undo_header + TRX_UNDO_TRX_NO,
+ trx->rw_trx_hash_element->no);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header +
+ TRX_UNDO_NEEDS_PURGE, 1U);
+}
+
+/** Free an undo log segment.
+All pages other than the segment header page are freed first, with the
+mini-transaction being committed and restarted between the steps; the
+rest of the segment is then freed within the mini-transaction that is
+active when this function returns.
+@param block undo segment header page (the page whose TRX_UNDO_SEG_HDR
+ holds the file segment header; the caller in this file passes the
+ undo log header page taken from the history list)
+@param mtr mini-transaction; may be committed and restarted here */
+static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr)
+{
+ /* Each iteration is entered after a freeing step reported that the
+ segment is not finished yet. */
+ while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame, &mtr))
+ {
+ /* Take an extra buffer-fix on the block before committing.
+ NOTE(review): presumably this keeps the block in the buffer pool while
+ mtr.commit() releases the latch; confirm against buf_page_t::fix(). */
+ block->fix();
+ ut_d(const page_id_t id{block->page.id()});
+ mtr.commit();
+ /* NOTE: If the server is killed after the log that was produced
+ up to this point was written, and before the log from the mtr.commit()
+ in our caller is written, then the pages belonging to the
+ undo log will become unaccessible garbage.
+
+ This does not matter when using multiple innodb_undo_tablespaces;
+ innodb_undo_log_truncate=ON will be able to reclaim the space. */
+ mtr.start();
+ /* Re-acquire the exclusive page latch and register the block in the
+ new mini-transaction. */
+ block->page.lock.x_lock();
+ ut_ad(block->page.id() == id);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+ }
+
+ /* Free the remainder of the segment; the loop body is intentionally
+ empty and repeats until fseg_free_step() reports completion. */
+ while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame, &mtr));
+}
+
+/** Remove unnecessary history data from a rollback segment.
+The history list is walked from its last node towards the first. Undo logs
+whose transaction number is below limit.trx_no are removed from the list
+(only if all is set), and their undo segments are freed when possible.
+The walk stops at the first undo log at or above the limit; if its
+transaction number equals limit.trx_no, trx_undo_truncate_start() is
+invoked on it with limit.undo_no.
+@param rseg rollback segment
+@param limit truncate anything before this
+@param all whether everything can be truncated
+@return error code */
+static dberr_t
+trx_purge_truncate_rseg_history(trx_rseg_t &rseg,
+ const purge_sys_t::iterator &limit, bool all)
+{
+ fil_addr_t hdr_addr;
+ mtr_t mtr;
+
+ mtr.start();
+
+ dberr_t err;
+ buf_block_t *rseg_hdr= rseg.get(&mtr, &err);
+ if (!rseg_hdr)
+ {
+ /* Common exit: commit the active mini-transaction and return err. */
+func_exit:
+ mtr.commit();
+ return err;
+ }
+
+ /* The list stores the address of the history node inside the undo log
+ header; subtracting TRX_UNDO_HISTORY_NODE yields the offset of the undo
+ log header itself. */
+ hdr_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY + rseg_hdr->page.frame);
+ hdr_addr.boffset= static_cast<uint16_t>(hdr_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+
+loop:
+ /* FIL_NULL marks the end of the history list */
+ if (hdr_addr.page == FIL_NULL)
+ goto func_exit;
+
+ buf_block_t *b=
+ buf_page_get_gen(page_id_t(rseg.space->id, hdr_addr.page),
+ 0, RW_X_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr, &err);
+ if (!b)
+ goto func_exit;
+
+ const trx_id_t undo_trx_no=
+ mach_read_from_8(b->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit.trx_no)
+ {
+ /* This undo log is at or beyond the limit: at most truncate its start,
+ and stop. */
+ if (undo_trx_no == limit.trx_no)
+ err = trx_undo_truncate_start(&rseg, hdr_addr.page,
+ hdr_addr.boffset, limit.undo_no);
+ goto func_exit;
+ }
+
+ if (!all)
+ goto func_exit;
+
+ /* Remember the predecessor before this node is unlinked. */
+ fil_addr_t prev_hdr_addr=
+ flst_get_prev_addr(b->page.frame + hdr_addr.boffset +
+ TRX_UNDO_HISTORY_NODE);
+ prev_hdr_addr.boffset= static_cast<uint16_t>(prev_hdr_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+
+ err= flst_remove(rseg_hdr, TRX_RSEG + TRX_RSEG_HISTORY, b,
+ uint16_t(hdr_addr.boffset + TRX_UNDO_HISTORY_NODE), &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+
+ /* Extra buffer-fix on the rseg header; it is latched and registered
+ again after the mini-transaction restart at the end of this iteration. */
+ rseg_hdr->fix();
+
+ if (mach_read_from_2(b->page.frame + hdr_addr.boffset + TRX_UNDO_NEXT_LOG))
+ /* We cannot free the entire undo log segment. */;
+ else
+ {
+ const uint32_t seg_size=
+ flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + b->page.frame);
+ /* Segment states other than the two handled below are left untouched. */
+ switch (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE +
+ b->page.frame)) {
+ case TRX_UNDO_TO_PURGE:
+ {
+ /* Deduct the segment's pages from the history size that is stored in
+ the rseg header. */
+ byte *hist= TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->page.frame;
+ ut_ad(mach_read_from_4(hist) >= seg_size);
+ mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
+ }
+ free_segment:
+ ut_ad(rseg.curr_size >= seg_size);
+ rseg.curr_size-= seg_size;
+ trx_purge_free_segment(b, mtr);
+ break;
+ case TRX_UNDO_CACHED:
+ /* rseg.undo_cached must point to this page */
+ trx_undo_t *undo= UT_LIST_GET_FIRST(rseg.undo_cached);
+ for (; undo; undo= UT_LIST_GET_NEXT(undo_list, undo))
+ if (undo->hdr_page_no == hdr_addr.page)
+ goto found_cached;
+ ut_ad("inconsistent undo logs" == 0);
+ /* NOTE(review): when no cached undo object matches, undo is null here.
+ Debug builds stop at the assertion above; non-debug builds continue to
+ the undo->id access below. The if (false) only exists so that
+ UT_LIST_REMOVE is reached exclusively via the found_cached label. */
+ if (false)
+ found_cached:
+ UT_LIST_REMOVE(rseg.undo_cached, undo);
+ static_assert(FIL_NULL == 0xffffffff, "");
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_hdr->page.frame)))
+ trx_rseg_format_upgrade(rseg_hdr, &mtr);
+ /* Writing 4 bytes of 0xff stores FIL_NULL into the rseg slot. */
+ mtr.memset(rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+ ut_free(undo);
+ mtr.write<8,mtr_t::MAYBE_NOP>(*rseg_hdr, TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ rseg_hdr->page.frame,
+ trx_sys.get_max_trx_id() - 1);
+ goto free_segment;
+ }
+ }
+
+ /* Continue with the previous node of the history list. */
+ hdr_addr= prev_hdr_addr;
+
+ /* Commit the work of this iteration and start a new mini-transaction,
+ re-latching the rseg header that was buffer-fixed above. */
+ mtr.commit();
+ ut_ad(rseg.history_size > 0);
+ rseg.history_size--;
+ mtr.start();
+ rseg_hdr->page.lock.x_lock();
+ ut_ad(rseg_hdr->page.id() == rseg.page_id());
+ mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_MODIFY);
+
+ goto loop;
+}
+
+/** Cleanse the purge queue: remove the rollback segments that reside in
+the undo tablespace that is about to be truncated.
+The whole queue is drained while holding purge_sys.pq_mutex; from each
+element the first rollback segment that belongs to the tablespace is
+erased, and every element that is still non-empty is pushed back.
+@param[in] space undo tablespace being truncated */
+static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+{
+  std::vector<TrxUndoRsegs> drained;
+
+  mysql_mutex_lock(&purge_sys.pq_mutex);
+
+  /* Take every element out of the purge queue before the
+  corresponding undo tablespace is truncated. */
+  for (; !purge_sys.purge_queue.empty(); purge_sys.purge_queue.pop()) {
+    drained.push_back(purge_sys.purge_queue.top());
+  }
+
+  for (TrxUndoRsegs& elem : drained) {
+
+    /* drop the (first) rseg that lives in the tablespace */
+    for (TrxUndoRsegs::iterator rseg_it = elem.begin();
+         rseg_it != elem.end();
+         ++rseg_it) {
+      if ((*rseg_it)->space == &space) {
+        elem.erase(rseg_it);
+        break;
+      }
+    }
+
+    /* elements that became empty are not put back */
+    if (!elem.empty()) {
+      purge_sys.purge_queue.push(elem);
+    }
+  }
+
+  mysql_mutex_unlock(&purge_sys.pq_mutex);
+}
+
+/** Remove history up to this iterator position from every rollback
+segment in trx_sys.rseg_array that has a tablespace attached.
+Each rollback segment is processed while holding its latch exclusively.
+@return DB_SUCCESS, or the first error reported by
+trx_purge_truncate_rseg_history() */
+dberr_t purge_sys_t::iterator::free_history() const
+{
+  for (auto &rseg : trx_sys.rseg_array)
+  {
+    if (!rseg.space)
+      continue;
+
+    ut_ad(rseg.is_persistent());
+    log_free_check();
+    rseg.latch.wr_lock(SRW_LOCK_CALL);
+    /* Everything may be truncated only if the rollback segment is
+    not referenced and purge_sys sees rseg.needs_purge. */
+    const bool all= !rseg.is_referenced() &&
+      purge_sys.sees(rseg.needs_purge);
+    const dberr_t err= trx_purge_truncate_rseg_history(rseg, *this, all);
+    rseg.latch.wr_unlock();
+
+    if (err)
+      return err;
+  }
+
+  return DB_SUCCESS;
+}
+
+#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__
+# if defined __arm__ || defined __aarch64__
+/* Work around an internal compiler error in GCC 4.8.5 */
+__attribute__((optimize(0)))
+# endif
+#endif
+/**
+Remove unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller
+(purge_coordinator_callback or purge_truncation_callback)
+must not have any latches on undo log pages!
+
+After the history has been freed, and if innodb_undo_log_truncate is set
+and at least two undo tablespaces are active, undo tablespaces that
+exceed srv_max_undo_log_size are truncated one at a time: the rollback
+segments of the tablespace are excluded from allocation, and once none of
+them is in use any more, the tablespace is re-initialized to
+SRV_UNDO_TABLESPACE_SIZE_IN_PAGES within a single mini-transaction.
+*/
+TRANSACTIONAL_TARGET void trx_purge_truncate_history()
+{
+ ut_ad(purge_sys.head <= purge_sys.tail);
+ purge_sys_t::iterator &head= purge_sys.head.trx_no
+ ? purge_sys.head : purge_sys.tail;
+
+ if (head.trx_no >= purge_sys.low_limit_no())
+ {
+ /* This is sometimes necessary. TODO: find out why. */
+ head.trx_no= purge_sys.low_limit_no();
+ head.undo_no= 0;
+ }
+
+ if (head.free_history() != DB_SUCCESS || srv_undo_tablespaces_active < 2)
+ return;
+
+ while (srv_undo_log_truncate)
+ {
+ if (!purge_sys.truncate.current)
+ {
+ /* Choose the next undo tablespace to truncate, starting the round-robin
+ scan at the tablespace that was truncated last. */
+ const ulint threshold=
+ ulint(srv_max_undo_log_size >> srv_page_size_shift);
+ for (uint32_t i= purge_sys.truncate.last
+ ? purge_sys.truncate.last->id - srv_undo_space_id_start : 0,
+ j= i;; )
+ {
+ const uint32_t space_id= srv_undo_space_id_start + i;
+ ut_ad(srv_is_undo_tablespace(space_id));
+ fil_space_t *space= fil_space_get(space_id);
+ /* Only inspect the file chain if the tablespace exists; the condition
+ below also tolerates a null space, so it must not be dereferenced
+ unconditionally here. */
+ ut_a(!space || UT_LIST_GET_LEN(space->chain) == 1);
+
+ if (space && space->get_size() > threshold)
+ {
+ purge_sys.truncate.current= space;
+ break;
+ }
+
+ ++i;
+ i %= srv_undo_tablespaces_active;
+ /* A full round without a candidate: nothing to truncate. */
+ if (i == j)
+ return;
+ }
+ }
+
+ fil_space_t &space= *purge_sys.truncate.current;
+ /* Undo tablespace always are a single file. */
+ fil_node_t *file= UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+
+ DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space == &space)
+ /* Once set, this rseg will not be allocated to subsequent
+ transactions, but we will wait for existing active
+ transactions to finish. */
+ rseg.set_skip_allocation();
+
+ /* Check that every rollback segment of the tablespace is unused; if any
+ is not, give up for now (purge_sys.truncate.current stays set). */
+ for (auto &rseg : trx_sys.rseg_array)
+ {
+ if (rseg.space != &space)
+ continue;
+
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+ ut_ad(rseg.skip_allocation());
+ if (rseg.is_referenced() || !purge_sys.sees(rseg.needs_purge))
+ {
+not_free:
+ rseg.latch.rd_unlock();
+ return;
+ }
+
+ ut_ad(UT_LIST_GET_LEN(rseg.undo_list) == 0);
+ /* Check if all segments are cached and safe to remove. */
+ ulint cached= 0;
+
+ for (const trx_undo_t *undo= UT_LIST_GET_FIRST(rseg.undo_cached); undo;
+ undo= UT_LIST_GET_NEXT(undo_list, undo))
+ {
+ if (head.trx_no && head.trx_no < undo->trx_id)
+ goto not_free;
+ else
+ cached+= undo->size;
+ }
+
+ ut_ad(rseg.curr_size > cached);
+ if (rseg.curr_size > cached + 1 &&
+ (rseg.history_size || srv_fast_shutdown || srv_undo_sources))
+ goto not_free;
+
+ rseg.latch.rd_unlock();
+ }
+
+ ib::info() << "Truncating " << file->name;
+ trx_purge_cleanse_purge_queue(space);
+
+ log_free_check();
+
+ mtr_t mtr;
+ mtr.start();
+ mtr.x_lock_space(&space);
+ const auto space_id= space.id;
+
+ /* Lock all modified pages of the tablespace.
+
+ During truncation, we do not want any writes to the file.
+
+ If a log checkpoint was completed at LSN earlier than our
+ mini-transaction commit and the server was killed, then
+ discarding the to-be-trimmed pages without flushing would
+ break crash recovery. */
+ rescan:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_ad(bpage->oldest_modification());
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ if (bpage->oldest_modification() > 2 && bpage->id().space() == space_id)
+ {
+ ut_ad(bpage->frame);
+ bpage->fix();
+ {
+ /* Try to acquire an exclusive latch while the cache line is
+ fresh after fix(). */
+ const bool got_lock{bpage->lock.x_lock_try()};
+ /* Publish prev as the hazard pointer before flush_list_mutex is
+ released, so that a change to the list position can be detected below. */
+ buf_pool.flush_hp.set(prev);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (!got_lock)
+ bpage->lock.x_lock();
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* There is no AHI on undo tablespaces. */
+ ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+#endif
+ ut_ad(!bpage->is_io_fixed());
+ ut_ad(bpage->id().space() == space_id);
+
+ /* Re-check under the page latch: the page may have been written out
+ while flush_list_mutex was not held. */
+ if (bpage->oldest_modification() > 2)
+ {
+ mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage),
+ MTR_MEMO_PAGE_X_FIX);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ ut_ad(bpage->oldest_modification() > 2);
+ bpage->reset_oldest_modification();
+ }
+ else
+ {
+ bpage->unfix();
+ bpage->lock.x_unlock();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
+
+ /* The hazard pointer was changed while the mutex was released: prev
+ can no longer be trusted, so restart the scan. */
+ if (prev != buf_pool.flush_hp.get())
+ {
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ goto rescan;
+ }
+ }
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+
+ /* Adjust the tablespace metadata. */
+ mysql_mutex_lock(&fil_system.mutex);
+ space.set_stopping();
+ space.is_being_truncated= true;
+ if (space.crypt_data)
+ {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_crypt_close_tablespace(&space);
+ space.release();
+ }
+ else
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ /* Wait for the references to the tablespace to be released, polling
+ every 10 milliseconds for at most 6000 rounds. */
+ for (auto i= 6000; space.referenced();
+ std::this_thread::sleep_for(std::chrono::milliseconds(10)))
+ {
+ if (!--i)
+ {
+ mtr.commit();
+ ib::error() << "Failed to freeze UNDO tablespace " << file->name;
+ return;
+ }
+ }
+
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit_shrink(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(&space);
+ mtr.trim_pages(page_id_t(space.id, size));
+ ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS);
+ mysql_mutex_lock(&fil_system.mutex);
+ space.size= file->size= size;
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ /* Re-create the header page of every rollback segment that resides in
+ the tablespace. */
+ for (auto &rseg : trx_sys.rseg_array)
+ {
+ if (rseg.space != &space)
+ continue;
+
+ ut_ad(!rseg.is_referenced());
+ /* We may actually have rseg.needs_purge > head.trx_no here
+ if trx_t::commit_empty() had been executed in the past,
+ possibly before this server had been started up. */
+
+ dberr_t err;
+ buf_block_t *rblock= trx_rseg_header_create(&space,
+ &rseg - trx_sys.rseg_array,
+ trx_sys.get_max_trx_id(),
+ &mtr, &err);
+ ut_a(rblock);
+ /* These were written by trx_rseg_header_create(). */
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rblock->page.frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rblock->page.frame));
+ rseg.reinit(rblock->page.id().page_no());
+ }
+
+ mtr.commit_shrink(space);
+
+ /* No mutex; this is only updated by the purge coordinator. */
+ export_vars.innodb_undo_truncations++;
+
+ if (purge_sys.rseg && purge_sys.rseg->last_page_no == FIL_NULL)
+ {
+ /* If purge_sys.rseg is pointing to rseg that was recently
+ truncated then move to next rseg element.
+
+ Note: Ideally purge_sys.rseg should be NULL because purge should
+ complete processing of all the records but srv_purge_batch_size
+ can force the purge loop to exit before all the records are purged. */
+ purge_sys.rseg= nullptr;
+ purge_sys.next_stored= false;
+ }
+
+ DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc";
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ ib::info() << "Truncated " << file->name;
+ /* Remember where the next round-robin scan starts, and allow another
+ tablespace to be chosen in the next loop iteration. */
+ purge_sys.truncate.last= purge_sys.truncate.current;
+ ut_ad(&space == purge_sys.truncate.current);
+ purge_sys.truncate.current= nullptr;
+ }
+}
+
+/** Look up an undo log page in the per-batch page map, fetching and
+buffer-fixing it on first use.
+@param id   page identifier
+@return the undo page (buffer-fixed until batch_cleanup() unfixes it)
+@retval nullptr if buf_page_get_gen() did not return a page */
+buf_block_t *purge_sys_t::get_page(page_id_t id)
+{
+  buf_block_t *&slot= pages[id];
+
+  if (!slot)
+  {
+    mtr_t mtr;
+    mtr.start();
+    slot= buf_page_get_gen(id, 0, RW_S_LATCH, nullptr,
+                           BUF_GET_POSSIBLY_FREED, &mtr);
+
+    if (UNIV_LIKELY(slot != nullptr))
+    {
+      /* Take our own buffer-fix before the mini-transaction is committed. */
+      slot->fix();
+      mtr.commit();
+    }
+    else
+    {
+      mtr.commit();
+      /* Do not leave an empty entry behind; slot is invalid after this. */
+      pages.erase(id);
+      return nullptr;
+    }
+  }
+
+  return slot;
+}
+
+/** Step purge_sys.rseg to the previous undo log header in its history list.
+Sets tail to {rseg->last_trx_no() + 1, 0} and clears next_stored.
+If there is no previous log, rseg->last_page_no is set to FIL_NULL.
+If a previous log header with a nonzero transaction number can be read,
+rseg is repositioned on it and pushed to purge_queue.
+The caller must hold rseg->latch in write mode; this function releases it. */
+void purge_sys_t::rseg_get_next_history_log()
+{
+ fil_addr_t prev_log_addr;
+
+ ut_ad(rseg->latch.is_write_locked());
+ ut_a(rseg->last_page_no != FIL_NULL);
+
+ tail.trx_no= rseg->last_trx_no() + 1;
+ tail.undo_no= 0;
+ next_stored= false;
+
+ if (buf_block_t *undo_page=
+ get_page(page_id_t(rseg->space->id, rseg->last_page_no)))
+ {
+ const byte *log_hdr= undo_page->page.frame + rseg->last_offset();
+ prev_log_addr= flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ /* The list node is embedded at TRX_UNDO_HISTORY_NODE within the log
+ header; convert the node address to the address of the log header. */
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ }
+ else
+ /* The page could not be obtained: treat it like an empty history. */
+ prev_log_addr.page= FIL_NULL;
+
+ if (prev_log_addr.page == FIL_NULL)
+ rseg->last_page_no= FIL_NULL;
+ else
+ {
+ /* Read the previous log header. */
+ trx_id_t trx_no= 0;
+ if (const buf_block_t* undo_page=
+ get_page(page_id_t(rseg->space->id,
+ prev_log_addr.page)))
+ {
+ const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset;
+ trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+ }
+
+ /* trx_no stays 0 if the page could not be obtained; in that case
+ rseg is neither repositioned nor re-queued. */
+ if (UNIV_LIKELY(trx_no != 0))
+ {
+ rseg->last_page_no= prev_log_addr.page;
+ rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+
+ /* Purge can also produce events, however these are already
+ ordered in the rollback segment and any user generated event
+ will be greater than the events that Purge produces. ie. Purge
+ can never produce events from an empty rollback segment. */
+
+ mysql_mutex_lock(&pq_mutex);
+ purge_queue.push(*rseg);
+ mysql_mutex_unlock(&pq_mutex);
+ }
+ }
+
+ rseg->latch.wr_unlock();
+}
+
+/** Position the purge sys "iterator" on the undo record to use for purging.
+
+On success, hdr_page_no and hdr_offset identify the undo log header, and
+page_no, offset and tail.undo_no identify the first undo record of that log.
+If the log does not need purging, or no first record can be located,
+offset is set to 0 (the "dummy record"), page_no to hdr_page_no and
+tail.undo_no to 0, so that the caller skips the whole undo log.
+
+@retval false when nothing is to be purged
+@retval true when purge_sys.rseg->latch was locked */
+bool purge_sys_t::choose_next_log()
+{
+  if (!rseg_iter.set_next())
+    return false;
+
+  hdr_offset= rseg->last_offset();
+  hdr_page_no= rseg->last_page_no;
+
+  if (!rseg->needs_purge)
+  {
+  purge_nothing:
+    page_no= hdr_page_no;
+    offset= 0;
+    tail.undo_no= 0;
+  }
+  else
+  {
+    page_id_t id{rseg->space->id, hdr_page_no};
+    buf_block_t *b= get_page(id);
+    if (!b)
+      goto purge_nothing;
+    const trx_undo_rec_t *undo_rec=
+      trx_undo_page_get_first_rec(b, hdr_page_no, hdr_offset);
+    if (!undo_rec)
+    {
+      /* The header page holds no record of this log. If another log
+      follows on the same page, this log cannot continue elsewhere. */
+      if (mach_read_from_2(b->page.frame + hdr_offset + TRX_UNDO_NEXT_LOG))
+        goto purge_nothing;
+      const uint32_t next=
+        mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+                         FLST_NEXT + FIL_ADDR_PAGE + b->page.frame);
+      if (next == FIL_NULL)
+        goto purge_nothing;
+      id.set_page_no(next);
+      b= get_page(id);
+      if (!b)
+        goto purge_nothing;
+      /* Pass the undo log header page number, exactly like get_next_rec()
+      does. The member page_no has not been assigned for this log yet;
+      it still holds whatever the previously processed log left there. */
+      undo_rec=
+        trx_undo_page_get_first_rec(b, hdr_page_no, hdr_offset);
+      if (!undo_rec)
+        goto purge_nothing;
+    }
+
+    offset= page_offset(undo_rec);
+    tail.undo_no= trx_undo_rec_get_undo_no(undo_rec);
+    page_no= id.page_no();
+  }
+
+  next_stored= true;
+  return true;
+}
+
+/**
+Get the next record to purge and update the info in the purge system.
+The caller must hold rseg->latch in write mode; on every return path the
+latch that is held at that point (if any) is released.
+@param roll_ptr undo log pointer to the record
+@return buffer-fixed reference to undo log record
+@retval {nullptr,1} if the whole undo log can be skipped in purge
+@retval {nullptr,0} if nothing is left, or on corruption */
+inline trx_purge_rec_t purge_sys_t::get_next_rec(roll_ptr_t roll_ptr)
+{
+ ut_ad(next_stored);
+ ut_ad(tail.trx_no < low_limit_no());
+ ut_ad(rseg->latch.is_write_locked());
+
+ if (!offset)
+ {
+ /* It is the dummy undo log record, which means that there is no
+ need to purge this undo log */
+ rseg_get_next_history_log();
+
+ /* Look for the next undo log and record to purge */
+ if (choose_next_log())
+ rseg->latch.wr_unlock();
+ return {nullptr, 1};
+ }
+
+ ut_ad(offset == uint16_t(roll_ptr));
+
+ page_id_t page_id{rseg->space->id, page_no};
+ /* Whether rseg->latch must be released before returning. */
+ bool locked= true;
+ buf_block_t *b= get_page(page_id);
+ if (UNIV_UNLIKELY(!b))
+ {
+ /* locked is still true here; nothing has changed it yet. */
+ if (locked)
+ rseg->latch.wr_unlock();
+ return {nullptr, 0};
+ }
+
+ /* Advance the iterator (page_no, offset, tail.undo_no) past the record
+ that is being returned, so that the next call starts at its successor. */
+ if (const trx_undo_rec_t *rec2=
+ trx_undo_page_get_next_rec(b, offset, hdr_page_no, hdr_offset))
+ {
+ got_rec:
+ ut_ad(page_no == page_id.page_no());
+ offset= page_offset(rec2);
+ tail.undo_no= trx_undo_rec_get_undo_no(rec2);
+ }
+ else if (hdr_page_no != page_no ||
+ !mach_read_from_2(b->page.frame + hdr_offset + TRX_UNDO_NEXT_LOG))
+ {
+ /* No further record on this page: try the first record of the next
+ page in the undo page list. */
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE + b->page.frame);
+ if (next != FIL_NULL)
+ {
+ page_id.set_page_no(next);
+ if (buf_block_t *next_page= get_page(page_id))
+ {
+ rec2= trx_undo_page_get_first_rec(next_page, hdr_page_no, hdr_offset);
+ if (rec2)
+ {
+ page_no= next;
+ goto got_rec;
+ }
+ }
+ }
+ goto got_no_rec;
+ }
+ else
+ {
+ got_no_rec:
+ /* This undo log is exhausted. The call below releases rseg->latch;
+ choose_next_log() reports whether a latch is held again. */
+ rseg_get_next_history_log();
+ /* Look for the next undo log and record to purge */
+ locked= choose_next_log();
+ }
+
+ if (locked)
+ rseg->latch.wr_unlock();
+
+ /* b is still the page that contains the record addressed by roll_ptr;
+ the low 16 bits of roll_ptr are the byte offset within that page. */
+ return {b->page.frame + uint16_t(roll_ptr), roll_ptr};
+}
+
+/** Fetch the next undo log record to purge and advance the purge iterator.
+If no position is stored yet (next_stored is false), choose_next_log()
+picks one first.
+@return the result of get_next_rec() for the current position
+@retval {nullptr,0} if nothing can be purged: no log was chosen, or
+tail.trx_no has reached low_limit_no() */
+inline trx_purge_rec_t purge_sys_t::fetch_next_rec()
+{
+  if (next_stored)
+  {
+    if (tail.trx_no >= low_limit_no())
+      return {nullptr, 0};
+
+    const roll_ptr_t stored_ptr=
+      trx_undo_build_roll_ptr(false, trx_sys.rseg_id(rseg, true),
+                              page_no, offset);
+    /* get_next_rec() expects the latch to be held by the caller. */
+    rseg->latch.wr_lock(SRW_LOCK_CALL);
+    /* The following will advance the purge iterator. */
+    return get_next_rec(stored_ptr);
+  }
+
+  /* On success, choose_next_log() leaves rseg->latch locked. */
+  const bool have_latch= choose_next_log();
+  ut_ad(have_latch == next_stored);
+
+  if (!have_latch)
+    return {nullptr, 0};
+
+  if (tail.trx_no >= low_limit_no())
+  {
+    rseg->latch.wr_unlock();
+    return {nullptr, 0};
+  }
+
+  /* row_purge_record_func() will later set ROLL_PTR_INSERT_FLAG for
+  TRX_UNDO_INSERT_REC */
+  const roll_ptr_t chosen_ptr=
+    trx_undo_build_roll_ptr(false, trx_sys.rseg_id(rseg, true),
+                            page_no, offset);
+
+  /* The following will advance the purge iterator. */
+  return get_next_rec(chosen_ptr);
+}
+
+/** Close all tables that were opened in a purge batch for a worker.
+Each closed entry is replaced by the marker value -1, so that it is not
+closed twice; entries that are nullptr or already -1 are left alone.
+@param node   purge task context
+@param thd    purge coordinator thread handle */
+static void trx_purge_close_tables(purge_node_t *node, THD *thd)
+{
+  dict_table_t *const closed_marker= reinterpret_cast<dict_table_t*>(-1);
+
+  for (auto &entry : node->tables)
+  {
+    dict_table_t *&handle= entry.second.first;
+
+    if (!handle || handle == closed_marker)
+      continue;
+
+    dict_table_close(handle, false, thd, entry.second.second);
+    handle= closed_marker;
+  }
+}
+
+/** Wait until purge is no longer paused.
+The pause flags are sampled while holding latch in write mode; after each
+sample the thread sleeps for 10 ms, also when the flags were already clear.
+@param also_sys   whether to wait for m_SYS_paused as well as m_FTS_paused */
+void purge_sys_t::wait_FTS(bool also_sys)
+{
+  for (;;)
+  {
+    latch.wr_lock(SRW_LOCK_CALL);
+    const bool still_paused= m_FTS_paused || (also_sys && m_SYS_paused);
+    latch.wr_unlock();
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+    if (!still_paused)
+      return;
+  }
+}
+
+__attribute__((nonnull))
+/** Acquire a metadata lock on a table.
+The caller passes a table reference that it acquired; the reference is
+released here whenever the table itself is not returned.
+@param table         table handle
+@param mdl_context   metadata lock acquisition context
+@param mdl           metadata lock
+@return table handle
+@retval nullptr if the table is not found or accessible
+@retval -1 if the purge of history must be suspended due to DDL */
+static dict_table_t *trx_purge_table_acquire(dict_table_t *table,
+                                             MDL_context *mdl_context,
+                                             MDL_ticket **mdl)
+{
+  ut_ad(dict_sys.frozen_not_locked());
+  *mdl= nullptr;
+
+  if (!table->is_readable() || table->corrupted)
+  {
+    table->release();
+    return nullptr;
+  }
+
+  size_t db_len= dict_get_db_name_len(table->name.m_name);
+  if (!db_len)
+    /* InnoDB system tables are not covered by MDL */
+    return table;
+
+  if (!purge_sys.must_wait_FTS())
+  {
+    char db_buf[NAME_LEN + 1];
+    char tbl_buf[NAME_LEN + 1];
+    size_t tbl_len;
+
+    if (!table->parse_name<true>(db_buf, tbl_buf, &db_len, &tbl_len))
+      /* The name of an intermediate table starts with #sql */
+      return table;
+
+    MDL_request request;
+    MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED,
+                     MDL_EXPLICIT);
+
+    if (!mdl_context->try_acquire_lock(&request))
+    {
+      *mdl= request.ticket;
+      if (*mdl)
+        return table;
+    }
+  }
+
+  /* Purge is paused, or the metadata lock was not granted: give up the
+  table reference and tell the caller to wait. */
+  table->release();
+  return reinterpret_cast<dict_table_t*>(-1);
+}
+
+/** Open a table handle for the purge of committed transaction history
+@param table_id      InnoDB table identifier
+@param mdl_context   metadata lock acquisition context
+@param mdl           metadata lock
+@return table handle
+@retval nullptr if the table is not found or accessible
+@retval -1 if the purge of history must be suspended due to DDL */
+static dict_table_t *trx_purge_table_open(table_id_t table_id,
+                                          MDL_context *mdl_context,
+                                          MDL_ticket **mdl)
+{
+  dict_sys.freeze(SRW_LOCK_CALL);
+
+  dict_table_t *table= dict_sys.find_table(table_id);
+
+  if (!table)
+  {
+    /* dict_sys.find_table() found nothing: switch from freeze() to lock()
+    and try dict_load_table_on_id(). */
+    dict_sys.unfreeze();
+    dict_sys.lock(SRW_LOCK_CALL);
+    table= dict_load_table_on_id(table_id, DICT_ERR_IGNORE_FK_NOKEY);
+    if (table)
+      table->acquire();
+    dict_sys.unlock();
+
+    if (!table)
+      return nullptr;
+
+    /* trx_purge_table_acquire() asserts dict_sys.frozen_not_locked(). */
+    dict_sys.freeze(SRW_LOCK_CALL);
+  }
+  else
+    table->acquire();
+
+  dict_table_t *const result=
+    trx_purge_table_acquire(table, mdl_context, mdl);
+  dict_sys.unfreeze();
+  return result;
+}
+
+/** Close every table that the purge tasks have open, wait until purge is
+no longer paused, and then open the tables again.
+The whole sequence is repeated until no open attempt returns -1.
+@param id    identifier of the table that the caller failed to open
+@param thd   thread handle, used for closing tables and for its MDL context
+@param mdl   metadata lock ticket of the returned table
+@return table handle for id
+@retval nullptr if trx_purge_table_open() returned nullptr for id */
+ATTRIBUTE_COLD
+dict_table_t *purge_sys_t::close_and_reopen(table_id_t id, THD *thd,
+ MDL_ticket **mdl)
+{
+ MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+ ut_ad(mdl_context);
+ retry:
+ ut_ad(m_active);
+
+ /* Close the tables of all purge nodes; trx_purge_close_tables()
+ replaces each closed handle by the marker value -1. */
+ for (que_thr_t *thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr;
+ thr= UT_LIST_GET_NEXT(thrs, thr))
+ {
+ purge_node_t *node= static_cast<purge_node_t*>(thr->child);
+ trx_purge_close_tables(node, thd);
+ }
+
+ /* m_active is cleared for the duration of the wait. */
+ m_active= false;
+ wait_FTS(false);
+ m_active= true;
+
+ dict_table_t *table= trx_purge_table_open(id, mdl_context, mdl);
+ if (table == reinterpret_cast<dict_table_t*>(-1))
+ goto retry;
+
+ /* Reopen every entry that is not nullptr; this includes the entries
+ that carry the -1 marker. */
+ for (que_thr_t *thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr;
+ thr= UT_LIST_GET_NEXT(thrs, thr))
+ {
+ purge_node_t *node= static_cast<purge_node_t*>(thr->child);
+ for (auto &t : node->tables)
+ {
+ if (t.second.first)
+ {
+ t.second.first= trx_purge_table_open(t.first, mdl_context,
+ &t.second.second);
+ if (t.second.first == reinterpret_cast<dict_table_t*>(-1))
+ {
+ /* Start over; the table for id is reopened after the retry. */
+ if (table)
+ dict_table_close(table, false, thd, *mdl);
+ goto retry;
+ }
+ }
+ }
+ }
+
+ return table;
+}
+
+/** Fetch undo log records for a purge batch and distribute them to the
+purge nodes. All records of one table are assigned to the same node.
+@param n_purge_threads number of purge threads
+@param thd thread handle; its MDL context is used for opening tables
+@return new purge_sys.head */
+static purge_sys_t::iterator
+trx_purge_attach_undo_recs(ulint n_purge_threads, THD *thd)
+{
+ que_thr_t* thr;
+ ulint i;
+
+ ut_a(n_purge_threads > 0);
+ ut_a(UT_LIST_GET_LEN(purge_sys.query->thrs) >= n_purge_threads);
+
+ purge_sys_t::iterator head = purge_sys.tail;
+
+#ifdef UNIV_DEBUG
+ i = 0;
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_ad(node->undo_recs.empty());
+ ut_ad(!node->in_progress);
+ ut_d(node->in_progress = true);
+ }
+
+ /* There should never be fewer nodes than threads, the inverse
+ however is allowed because we only use purge threads as needed. */
+ ut_ad(i == n_purge_threads);
+#endif
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+
+ ut_ad(head <= purge_sys.tail);
+
+ i = 0;
+
+ /* Maps a table identifier to the purge node that handles the table
+ in this batch. */
+ std::unordered_map<table_id_t, purge_node_t*>
+ table_id_map(TRX_PURGE_TABLE_BUCKETS);
+ purge_sys.m_active = true;
+
+ MDL_context* const mdl_context
+ = static_cast<MDL_context*>(thd_mdl_context(thd));
+ ut_ad(mdl_context);
+
+ /* The batch ends when this many undo pages have been handled. */
+ const size_t max_pages = std::min(buf_pool.curr_size * 3 / 4,
+ size_t{srv_purge_batch_size});
+
+ while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (head <= purge_sys.tail) {
+ head = purge_sys.tail;
+ }
+
+ /* Fetch the next record, and advance the purge_sys.tail. */
+ trx_purge_rec_t purge_rec = purge_sys.fetch_next_rec();
+
+ if (!purge_rec.undo_rec) {
+ /* roll_ptr 0: nothing is left; roll_ptr 1: an undo log
+ was skipped and the next one can be tried. */
+ if (!purge_rec.roll_ptr) {
+ break;
+ }
+ ut_ad(purge_rec.roll_ptr == 1);
+ continue;
+ }
+
+ table_id_t table_id = trx_undo_rec_get_table_id(
+ purge_rec.undo_rec);
+
+ purge_node_t*& table_node = table_id_map[table_id];
+
+ if (!table_node) {
+ /* First record for this table in the batch: open the
+ table and assign it to a purge node. */
+ std::pair<dict_table_t*,MDL_ticket*> p;
+ p.first = trx_purge_table_open(table_id, mdl_context,
+ &p.second);
+ if (p.first == reinterpret_cast<dict_table_t*>(-1)) {
+ p.first = purge_sys.close_and_reopen(
+ table_id, thd, &p.second);
+ }
+
+ /* Advance to the next node, wrapping around to the
+ first one after every n_purge_threads tables. */
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(
+ purge_sys.query->thrs);
+ }
+
+ table_node = static_cast<purge_node_t*>(thr->child);
+ ut_a(que_node_get_type(table_node) == QUE_NODE_PURGE);
+ /* In debug builds the next two lines form one statement:
+ auto i= table_node->tables.emplace(...); this i shadows
+ the outer counter until the end of the block. */
+ ut_d(auto i=)
+ table_node->tables.emplace(table_id, p);
+ ut_ad(i.second);
+ if (p.first) {
+ goto enqueue;
+ }
+ /* The table could not be opened: the record is dropped. */
+ } else if (table_node->tables[table_id].first) {
+enqueue:
+ table_node->undo_recs.push(purge_rec);
+ }
+
+ if (purge_sys.n_pages_handled() >= max_pages) {
+ break;
+ }
+ }
+
+ purge_sys.m_active = false;
+
+ ut_ad(head <= purge_sys.tail);
+
+ return head;
+}
+
+extern tpool::waitable_task purge_worker_task;
+
+/** Wait for pending purge jobs to complete.
+If the worker task is running when this is called, the wait is bracketed
+by tpool::tpool_wait_begin() and tpool::tpool_wait_end(). */
+static void trx_purge_wait_for_workers_to_complete()
+{
+  if (purge_worker_task.is_running())
+  {
+    tpool::tpool_wait_begin();
+    purge_worker_task.wait();
+    tpool::tpool_wait_end();
+  }
+  else
+    purge_worker_task.wait();
+
+  /* There should be no outstanding tasks as long
+  as the worker threads are active. */
+  ut_ad(srv_get_task_queue_length() == 0);
+}
+
+/** Clean up after a purge batch: unfix and forget the undo pages that
+get_page() collected, and publish the new head and end_view.
+@param head   the new purge_sys.head */
+TRANSACTIONAL_INLINE
+void purge_sys_t::batch_cleanup(const purge_sys_t::iterator &head)
+{
+ /* Release the undo pages. */
+ for (auto p : pages)
+ p.second->unfix();
+ pages.clear();
+ pages.reserve(srv_purge_batch_size);
+
+ /* This is invoked only by the purge coordinator,
+ which is the only thread that can modify our inputs head, tail, view.
+ Therefore, we only need to protect end_view from concurrent reads. */
+
+ /* Limit the end_view similar to what trx_purge_truncate_history() does. */
+ const trx_id_t trx_no= head.trx_no ? head.trx_no : tail.trx_no;
+ /* With SUX_LOCK_GENERIC, end_latch is locked and unlocked explicitly;
+ otherwise a scoped guard holds it until the end of the function. */
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_lock();
+#else
+ transactional_lock_guard<srw_spin_lock_low> g(end_latch);
+#endif
+ this->head= head;
+ end_view= view;
+ end_view.clamp_low_limit_id(trx_no);
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_unlock();
+#endif
+}
+
+/**
+Run a purge batch.
+@param n_tasks       number of purge tasks to submit to the queue
+@param history_size  trx_sys.history_size()
+@return number of undo log pages handled in the batch */
+TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, ulint history_size)
+{
+  ut_ad(n_tasks > 0);
+
+  purge_sys.clone_oldest_view();
+
+#ifdef UNIV_DEBUG
+  if (srv_purge_view_update_only_debug)
+    return 0;
+#endif /* UNIV_DEBUG */
+
+  THD *const thd= current_thd;
+
+  /* Fetch the UNDO recs that need to be purged. */
+  const purge_sys_t::iterator head= trx_purge_attach_undo_recs(n_tasks, thd);
+  const size_t n_pages= purge_sys.n_pages_handled();
+
+  {
+    /* Compute the delay to publish in srv_dml_needed_delay. It stays 0
+    unless pages were handled, srv_max_purge_lag is nonzero and smaller
+    than history_size, and srv_max_purge_lag_delay is nonzero. */
+    ulint dml_delay= 0;
+
+    if (const ulint max_lag= n_pages ? srv_max_purge_lag : 0)
+    {
+      if (max_lag < history_size)
+      {
+        if (const ulint max_delay= srv_max_purge_lag_delay)
+        {
+          dml_delay= std::min(max_delay,
+                              10000 * history_size / max_lag - 5000);
+        }
+      }
+    }
+
+    srv_dml_needed_delay= dml_delay;
+  }
+
+  que_thr_t *thr= nullptr;
+
+  /* Submit tasks to workers queue if using multi-threaded purge.
+  One task fewer than n_tasks is submitted; the last one runs below
+  in this thread. */
+  for (ulint remaining= n_tasks - 1; remaining; remaining--)
+  {
+    thr= que_fork_scheduler_round_robin(purge_sys.query, thr);
+    ut_a(thr);
+    srv_que_task_enqueue_low(thr);
+    srv_thread_pool->submit_task(&purge_worker_task);
+  }
+
+  thr= que_fork_scheduler_round_robin(purge_sys.query, thr);
+
+  que_run_threads(thr);
+
+  trx_purge_wait_for_workers_to_complete();
+
+  /* Close and forget the tables of every purge node. */
+  for (que_thr_t *t= UT_LIST_GET_FIRST(purge_sys.query->thrs); t;
+       t= UT_LIST_GET_NEXT(thrs, t))
+  {
+    purge_node_t *node= static_cast<purge_node_t*>(t->child);
+    trx_purge_close_tables(node, thd);
+    node->tables.clear();
+  }
+
+  purge_sys.batch_cleanup(head);
+
+  MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+  MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages);
+
+  return n_pages;
+}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000..b381c9de
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,2448 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+#include "row0mysql.h"
+#include "row0ins.h"
+#include "mariadb_stats.h"
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
+const dtuple_t trx_undo_metadata = {
+ /* This also works for REC_INFO_METADATA_ALTER, because the
+ delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */
+ /* NOTE(review): the initializers are positional, so they must follow
+ the member order of dtuple_t; confirm whenever that struct changes. */
+ REC_INFO_METADATA_ADD, 0, 0,
+ NULL, 0, NULL
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+};
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/** Calculate the free space left for extending an undo log record.
+@param undo_block   undo log page
+@param ptr          current end of the undo page
+@return bytes left; 0 if the computed amount would be negative */
+static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
+{
+  const byte *const frame= undo_block->page.frame;
+
+  ut_ad(ptr >= &frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+
+  /* The 10 is supposed to be an extra safety margin (and needed for
+  compatibility with older versions) */
+  const lint left= srv_page_size - (ptr - frame) -
+    (10 + FIL_PAGE_DATA_END);
+  ut_ad(left >= 0);
+
+  if (left < 0)
+    return 0;
+
+  return static_cast<ulint>(left);
+}
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+The record starts at the current TRX_UNDO_PAGE_FREE offset: its first
+2 bytes receive the offset of the end of the record ("next"), and the
+2 bytes at ptr receive the offset of the start of the record ("previous").
+@return offset of the inserted entry on the page if succeeded, 0 if fail */
+static
+uint16_t
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ buf_block_t* undo_block, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(ptr) == undo_block->page.frame);
+
+ /* The trailing "previous" pointer needs 2 bytes. */
+ if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+ return 0;
+
+ byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_FREE +
+ undo_block->page.frame);
+
+ /* Offset at which this record starts. */
+ const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ /* This must happen before TRX_UNDO_PAGE_FREE is overwritten below. */
+ memcpy(ptr, ptr_to_first_free, 2);
+ ptr += 2;
+
+ const uint16_t end_of_rec= static_cast<uint16_t>
+ (ptr - undo_block->page.frame);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+ /* Write offset of the next undo log record */
+ memcpy(undo_block->page.frame + first_free, ptr_to_first_free, 2);
+ /* Pass to mtr->undo_append() the bytes between the leading 2-byte
+ "next" field and the trailing 2-byte "previous" field. */
+ const byte *start= undo_block->page.frame + first_free + 2;
+
+ mtr->undo_append(*undo_block, start, ptr - start - 2);
+ return first_free;
+}
+
+/** Virtual column undo log version. To distinguish it from a length value
+in 5.7.8 undo log, it starts with 0xF1.
+trx_undo_log_v_idx() writes this byte before the index information of the
+first virtual column; trx_undo_read_v_idx() checks for it. */
+static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1;
+
+/** Write virtual column index info (index id and column position in index)
+to the undo log.
+Layout written: an optional 1-byte version marker (first virtual column
+only), a 2-byte length of what follows including the length field itself,
+the compressed number of indexes, and per index the compressed index id
+and field position.
+@param[in,out]	undo_block	undo log page
+@param[in]	table		the table
+@param[in]	pos		the virtual column position
+@param[in]	ptr		undo log record being written
+@param[in]	first_v_col	whether this is the first virtual column
+				which could start with a version marker
+@return new undo log pointer
+@retval NULL if there is not enough space on the page */
+static
+byte*
+trx_undo_log_v_idx(
+  buf_block_t*		undo_block,
+  const dict_table_t*	table,
+  ulint			pos,
+  byte*			ptr,
+  bool			first_v_col)
+{
+  ut_ad(pos < table->n_v_def);
+  dict_v_col_t *vcol= dict_table_get_nth_v_col(table, pos);
+  ut_ad(!vcol->v_indexes.empty());
+
+  const ulint avail= trx_undo_left(undo_block, ptr);
+
+  /* 2 bytes for the length field, plus 1 for the version marker. */
+  ulint size= 2;
+  if (first_v_col)
+    size+= 1;
+
+  /* The mach_write_compressed(ptr, flen) in
+  trx_undo_page_report_modify() will consume additional 1 to 5 bytes. */
+  if (avail < size + 5)
+    return NULL;
+
+  /* First pass: count the indexes and compute the encoded size. */
+  ulint n_idx= 0;
+  for (const auto &v_index : vcol->v_indexes)
+  {
+    n_idx++;
+    /* FIXME: index->id is 64 bits! */
+    size+= mach_get_compressed_size(uint32_t(v_index.index->id));
+    size+= mach_get_compressed_size(v_index.nth_field);
+  }
+
+  size+= mach_get_compressed_size(n_idx);
+
+  if (avail < size + 5)
+    return NULL;
+
+  ut_d(const byte *orig_ptr= ptr);
+
+  if (first_v_col)
+  {
+    /* write the version marker */
+    mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1);
+    ptr+= 1;
+  }
+
+  /* The length is filled in once everything else has been written. */
+  byte *const len_ptr= ptr;
+  ptr+= 2;
+
+  /* Second pass: write the index count and the per-index info. */
+  ptr+= mach_write_compressed(ptr, n_idx);
+
+  for (const auto &v_index : vcol->v_indexes)
+  {
+    /* FIXME: index->id is 64 bits! */
+    ptr+= mach_write_compressed(ptr, uint32_t(v_index.index->id));
+    ptr+= mach_write_compressed(ptr, v_index.nth_field);
+  }
+
+  ut_ad(orig_ptr + size == ptr);
+
+  mach_write_to_2(len_ptr, ulint(ptr - len_ptr));
+
+  return ptr;
+}
+
+/** Read virtual column index from undo log, and verify the column is still
+indexed, and return its position.
+The first secondary index whose id matches one of the logged index ids
+determines the result.
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[out]	col_pos		the column number or FIL_NULL
+				if the column is not indexed any more
+@return remaining part of undo log record after reading these values */
+static
+const byte*
+trx_undo_read_v_idx_low(
+  const dict_table_t*	table,
+  const byte*		ptr,
+  uint32_t*		col_pos)
+{
+  /* The leading 2-byte length covers the whole index info, so the end
+  is known before anything else is parsed. */
+  const byte *const end= ptr + mach_read_from_2(ptr);
+
+  *col_pos= FIL_NULL;
+
+  ptr+= 2;
+
+  const ulint num_idx= mach_read_next_compressed(&ptr);
+
+  ut_ad(num_idx > 0);
+
+  dict_index_t *clust_index= dict_table_get_first_index(table);
+
+  for (ulint remaining= num_idx; remaining; remaining--)
+  {
+    const index_id_t id= mach_read_next_compressed(&ptr);
+    const ulint pos= mach_read_next_compressed(&ptr);
+
+    /* Return if we find a matching index.
+    TODO: in the future, it might be worth to add
+    checks on other indexes */
+    for (dict_index_t *index= dict_table_get_next_index(clust_index);
+         index != NULL; index= dict_table_get_next_index(index))
+    {
+      if (index->id != id)
+        continue;
+
+      const dict_col_t *col= dict_index_get_nth_col(index, pos);
+      ut_ad(col->is_virtual());
+      *col_pos= reinterpret_cast<const dict_v_col_t*>(col)->v_pos;
+      return end;
+    }
+  }
+
+  return end;
+}
+
+/** Read virtual column index from undo log or online log if the log
+contains such info, and in the undo log case, verify the column is
+still indexed, and output its position
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in]	first_v_col	if this is the first virtual column, which
+				has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both undo log,
+				and online log for virtual columns. So
+				check to see if this is undo log. When
+				first_v_col is true, is_undo_log is output,
+				when first_v_col is false, is_undo_log is input
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+  const dict_table_t*	table,
+  const byte*		ptr,
+  bool			first_v_col,
+  bool*			is_undo_log,
+  uint32_t*		field_no)
+{
+  /* Version marker only put on the first virtual column */
+  if (first_v_col)
+  {
+    /* Undo log has the virtual undo log marker */
+    const bool has_marker=
+      mach_read_from_1(ptr) == VIRTUAL_COL_UNDO_FORMAT_1;
+    *is_undo_log= has_marker;
+
+    if (has_marker)
+      ptr+= 1;
+  }
+
+  if (!*is_undo_log)
+  {
+    /* No index info is stored; only adjust the caller's field number. */
+    *field_no-= REC_MAX_N_FIELDS;
+    return ptr;
+  }
+
+  return trx_undo_read_v_idx_low(table, ptr, field_no);
+}
+
+/** Reports in the undo log of an insert of virtual columns.
+Writes a 2-byte total length followed by, for each virtual column with
+ord_part set: the compressed column number offset by REC_MAX_N_FIELDS,
+the index info written by trx_undo_log_v_idx(), the compressed field
+length and, for non-empty non-NULL fields, the field data.
+@param[in] undo_block undo log page
+@param[in] table the table
+@param[in] row dtuple contains the virtual columns
+@param[in,out] ptr log ptr
+@return true if write goes well, false if out of space */
+static
+bool
+trx_undo_report_insert_virtual(
+ buf_block_t* undo_block,
+ dict_table_t* table,
+ const dtuple_t* row,
+ byte** ptr)
+{
+ byte* start = *ptr;
+ bool first_v_col = true;
+
+ if (trx_undo_left(undo_block, *ptr) < 2) {
+ return(false);
+ }
+
+ /* Reserve 2 bytes to write the number
+ of bytes the stored fields take in this
+ undo record */
+ *ptr += 2;
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+
+ /* make sure there is enough space to write the length */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ /* Virtual column numbers are stored offset by
+ REC_MAX_N_FIELDS. */
+ ulint pos = col_no;
+ pos += REC_MAX_N_FIELDS;
+ *ptr += mach_write_compressed(*ptr, pos);
+
+ *ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, *ptr, first_v_col);
+ first_v_col = false;
+
+ /* trx_undo_log_v_idx() returns NULL when out of space. */
+ if (*ptr == NULL) {
+ return(false);
+ }
+
+ const dfield_t* vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ switch (ulint flen = vfield->len) {
+ case 0: case UNIV_SQL_NULL:
+ /* Only the length is written; there is no data. */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ *ptr += mach_write_compressed(*ptr, flen);
+ break;
+ default:
+ /* Store at most max_len bytes of the value. */
+ ulint max_len
+ = dict_max_v_field_len_store_undo(
+ table, col_no);
+
+ if (flen > max_len) {
+ flen = max_len;
+ }
+
+ if (trx_undo_left(undo_block, *ptr)
+ < flen + 5) {
+ return(false);
+ }
+ *ptr += mach_write_compressed(*ptr, flen);
+
+ memcpy(*ptr, vfield->data, flen);
+ *ptr += flen;
+ }
+ }
+ }
+
+ /* Always mark the end of the log with 2 bytes length field */
+ mach_write_to_2(start, ulint(*ptr - start));
+
+ return(true);
+}
+
+/** Reports in the undo log of an insert of a clustered index record.
+The record consists of a 2-byte next pointer, a type byte, the
+transaction's undo number and the table id, followed (for ordinary
+inserts) by the unique fields of the clustered index entry and, if the
+table has virtual columns, the virtual column information.
+@param undo_block undo log page
+@param trx transaction
+@param index clustered index
+@param clust_entry index entry which will be inserted to the
+ clustered index
+@param mtr mini-transaction
+@param write_empty write empty table undo log record
+@return offset of the inserted entry on the page if succeed, 0 if fail */
+static
+uint16_t
+trx_undo_page_report_insert(
+ buf_block_t* undo_block,
+ trx_t* trx,
+ dict_index_t* index,
+ const dtuple_t* clust_entry,
+ mtr_t* mtr,
+ bool write_empty)
+{
+ ut_ad(index->is_primary());
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->page.frame) <= 2);
+
+ uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame));
+ byte* ptr = undo_block->page.frame + first_free;
+
+ /* 2 bytes for the next pointer and 1 for the type; presumably 11 is
+ the maximum size of each of the two much-compressed 64-bit values
+ written below (TODO confirm). */
+ if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
+ /* Not enough space for writing the general parameters */
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+
+ if (write_empty) {
+ /* Table is in bulk operation */
+ /* Replace the type byte; no key fields are logged. */
+ undo_block->page.frame[first_free + 2] = TRX_UNDO_EMPTY;
+ goto done;
+ }
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
+ ut_ad(clust_entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(undo_block->page.frame[first_free + 2]
+ == TRX_UNDO_INSERT_REC);
+ /* Replace the type byte; no key fields are logged. */
+ undo_block->page.frame[first_free + 2]
+ = TRX_UNDO_INSERT_METADATA;
+ goto done;
+ }
+
+ for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ /* Room for the compressed length. */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ /* Only the length is written; there is no data. */
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr) < flen) {
+
+ return(0);
+ }
+
+ memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ if (index->table->n_v_cols) {
+ if (!trx_undo_report_insert_virtual(
+ undo_block, index->table, clust_entry, &ptr)) {
+ return(0);
+ }
+ }
+
+done:
+ /* Link the record into the page and log it in the mini-transaction. */
+ return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr));
+}
+
+/** Reads from an undo log record the general parameters.
+The byte at offset 2 of the record combines the record type, the
+compiler info and the TRX_UNDO_UPD_EXTERN flag; the undo number and the
+table id follow in much-compressed form.
+@param[in]	undo_rec	undo log record
+@param[out]	type		undo record type: TRX_UNDO_INSERT_REC, ...
+@param[out]	cmpl_info	compiler info, relevant only for update
+				type records
+@param[out]	updated_extern	true if we updated an externally stored field
+@param[out]	undo_no		undo log record number
+@param[out]	table_id	table id
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+  const trx_undo_rec_t*	undo_rec,
+  byte*			type,
+  byte*			cmpl_info,
+  bool*			updated_extern,
+  undo_no_t*		undo_no,
+  table_id_t*		table_id)
+{
+  const ulint raw= undo_rec[2];
+  /* The same byte with the "updated extern" flag masked out. */
+  const ulint type_cmpl= raw & ~TRX_UNDO_UPD_EXTERN;
+
+  *updated_extern= !!(raw & TRX_UNDO_UPD_EXTERN);
+  *type= type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+  ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+  ut_ad(*type <= TRX_UNDO_EMPTY);
+  *cmpl_info= byte(type_cmpl / TRX_UNDO_CMPL_INFO_MULT);
+
+  const byte *ptr= undo_rec + 3;
+  *undo_no= mach_read_next_much_compressed(&ptr);
+  *table_id= mach_read_next_much_compressed(&ptr);
+  ut_ad(*table_id);
+
+  return ptr;
+}
+
+/** Read a non-virtual column value from an undo log record.
+The value starts with a compressed length. The special length
+UNIV_EXTERN_STORAGE_FIELD is a marker that is followed by the original
+length of the locally stored part and by the real stored length.
+@param ptr       pointer to the remaining part of the undo record
+@param field     out: pointer to the stored field, or NULL for SQL NULL
+@param len       out: length of the field, or UNIV_SQL_NULL; for an
+                 externally stored column, UNIV_EXTERN_STORAGE_FIELD is
+                 added to the length
+@param orig_len  out: original length of the locally stored part
+                 of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+                                     uint32_t *len, uint32_t *orig_len)
+{
+  *len = mach_read_next_compressed(&ptr);
+  *orig_len = 0;
+
+  if (*len == UNIV_SQL_NULL) {
+    *field = NULL;
+    return ptr;
+  }
+
+  if (*len == UNIV_EXTERN_STORAGE_FIELD) {
+    /* Marker for a longer prefix: original length, then real length */
+    *orig_len = mach_read_next_compressed(&ptr);
+    *len = mach_read_next_compressed(&ptr);
+    *field = ptr;
+    /* The spatial status bits are encoded in the length; they do
+    not count as stored bytes. */
+    ptr += *len & ~SPATIAL_STATUS_MASK;
+
+    ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+    ut_ad(*len > *orig_len);
+    /* @see dtuple_convert_big_rec() */
+    ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+    /* index->table is not accessible here, so the following
+    cannot be asserted:
+    ut_ad(dict_table_has_atomic_blobs(index->table)
+          || *len >= col->max_prefix
+          + BTR_EXTERN_FIELD_REF_SIZE);
+    */
+
+    /* Report the column as externally stored to the caller. */
+    *len += UNIV_EXTERN_STORAGE_FIELD;
+    return ptr;
+  }
+
+  *field = ptr;
+
+  if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+    ptr += (*len - UNIV_EXTERN_STORAGE_FIELD) & ~SPATIAL_STATUS_MASK;
+  } else {
+    ptr += *len;
+  }
+
+  return ptr;
+}
+
+/** Build a row reference from an undo log record.
+The field data is NOT copied: the tuple points into the undo log record,
+so the copy of the record must be preserved as long as the row reference
+is in use.
+@param ptr    remaining part of a copy of an undo log record, at the start
+              of the row reference
+@param index  clustered index
+@param ref    out, own: row reference
+@param heap   memory heap from which the needed memory is allocated
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+  const byte* ptr,
+  dict_index_t* index,
+  const dtuple_t**ref,
+  mem_heap_t* heap)
+{
+  ut_ad(index->is_primary());
+
+  const ulint n_unique = dict_index_get_n_unique(index);
+  dtuple_t* const row_ref = dtuple_create(heap, n_unique);
+
+  *ref = row_ref;
+  dict_index_copy_types(row_ref, index, n_unique);
+
+  for (ulint pos = 0; pos != n_unique; ++pos) {
+    const byte* data;
+    uint32_t data_len, unused_orig_len;
+
+    ptr = trx_undo_rec_get_col_val(ptr, &data, &data_len,
+                                   &unused_orig_len);
+    dfield_set_data(dtuple_get_nth_field(row_ref, pos), data, data_len);
+  }
+
+  return ptr;
+}
+
+/** Advance past the row reference of an undo log record without
+building a tuple for it.
+@param ptr    part of an update undo log record
+@param index  clustered index
+@return pointer to remaining part of undo record */
+static const byte *trx_undo_rec_skip_row_ref(const byte *ptr,
+                                             const dict_index_t *index)
+{
+  ut_ad(index->is_primary());
+
+  /* One stored value per unique field; the parsed values are discarded. */
+  for (ulint remaining = dict_index_get_n_unique(index); remaining;
+       --remaining) {
+    const byte* ignored_field;
+    uint32_t ignored_len, ignored_orig_len;
+
+    ptr = trx_undo_rec_get_col_val(ptr, &ignored_field, &ignored_len,
+                                   &ignored_orig_len);
+  }
+
+  return ptr;
+}
+
+/** Fetch a prefix of an externally stored column and append the BLOB
+pointer to it, for writing to the undo log of an update or delete marking
+of a clustered index record.
+@param[out]	ext_buf		buffer to hold the prefix data and BLOB pointer
+@param[in]	prefix_len	prefix size to store in the undo log
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	field		an externally stored column
+@param[in,out]	len		input: length of field; output: used length of
+ext_buf
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+  byte* ext_buf,
+  ulint prefix_len,
+  ulint zip_size,
+  const byte* field,
+  ulint* len)
+{
+  const ulint local_len = *len;
+  /* The BLOB pointer is the last BTR_EXTERN_FIELD_REF_SIZE bytes
+  of the locally stored part. */
+  const byte* const blob_ref = field + local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+  const ulint copied = btr_copy_externally_stored_field_prefix(
+    ext_buf, prefix_len, zip_size, field, local_len);
+
+  /* A BLOB is never expected to be empty. */
+  ut_a(copied);
+
+  memcpy(ext_buf + copied, blob_ref, BTR_EXTERN_FIELD_REF_SIZE);
+  *len = copied + BTR_EXTERN_FIELD_REF_SIZE;
+
+  return ext_buf;
+}
+
+/** Write to the undo log the length header of an externally stored column,
+optionally fetching a longer prefix of the column into ext_buf.
+@param[in,out]	ptr		undo log position to write to; at least 15
+				bytes must be available
+@param[out]	ext_buf		a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT()
+				size, or NULL when a longer prefix should
+				not be fetched
+@param[in]	prefix_len	prefix size to store in the undo log
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	field		the locally stored part of the externally
+stored column; redirected to ext_buf when a prefix is fetched
+@param[in,out]	len		length of field, in bytes
+@param[in]	spatial_status	whether the column is used by spatial index or
+				regular index
+@return undo log position after the written header */
+static
+byte*
+trx_undo_page_report_modify_ext(
+  byte* ptr,
+  byte* ext_buf,
+  ulint prefix_len,
+  ulint zip_size,
+  const byte** field,
+  ulint* len,
+  spatial_status_t spatial_status)
+{
+  /* Encode the spatial status into the length. For spatially indexed
+  columns the length also accounts for the MBR. */
+  ulint spatial_len = ulint(spatial_status) << SPATIAL_STATUS_SHIFT;
+
+  if (spatial_status == SPATIAL_MIXED || spatial_status == SPATIAL_ONLY) {
+    spatial_len |= DATA_MBR_LEN;
+  }
+
+  if (spatial_status == SPATIAL_ONLY) {
+    /* The column is only used by a spatial index:
+    logging its MBR is enough. */
+    ptr += mach_write_compressed(ptr,
+                                 UNIV_EXTERN_STORAGE_FIELD + spatial_len);
+    return ptr;
+  }
+
+  if (!ext_buf) {
+    /* No longer prefix is wanted: a single combined length. */
+    ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+                                 + *len + spatial_len);
+    return ptr;
+  }
+
+  ut_a(prefix_len > 0);
+
+  /* An externally stored ordering column needs a longer prefix of the
+  field. Write a marker followed by the original length and the real
+  length of the field. */
+  ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+  ptr += mach_write_compressed(ptr, *len);
+
+  *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, zip_size,
+                                   *field, len);
+
+  ptr += mach_write_compressed(ptr, *len + spatial_len);
+
+  return ptr;
+}
+
+/** Compute the MBR of an externally stored geometry column.
+The whole column is copied into a temporary heap, which is freed before
+returning. If the column holds no data beyond the GEO_DATA_HEADER_SIZE
+header, every dimension is set to the range [DBL_MAX, -DBL_MAX].
+@param[out]	mbr		MBR to fill (SPDIMS * 2 values)
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	field		field containing the geometry data
+@param[in]	len		pointer to the length of field, in bytes
+*/
+static
+void
+trx_undo_get_mbr_from_ext(
+  double* mbr,
+  ulint zip_size,
+  const byte* field,
+  ulint* len)
+{
+  mem_heap_t* const tmp_heap = mem_heap_create(100);
+  ulint geo_len;
+  uchar* const geo = btr_copy_externally_stored_field(
+    &geo_len, field, zip_size, *len, tmp_heap);
+
+  if (geo_len > GEO_DATA_HEADER_SIZE) {
+    rtree_mbr_from_wkb(geo + GEO_DATA_HEADER_SIZE,
+                       static_cast<uint>(geo_len - GEO_DATA_HEADER_SIZE),
+                       SPDIMS, mbr);
+  } else {
+    /* Nothing but (at most) the header is stored. */
+    for (double* bound = mbr; bound != mbr + SPDIMS * 2; bound += 2) {
+      bound[0] = DBL_MAX;
+      bound[1] = -DBL_MAX;
+    }
+  }
+
+  mem_heap_free(tmp_heap);
+}
+
+/**********************************************************************//**
+Writes to an undo log page a record that describes an update or a delete
+marking of a clustered index record. The record consists of: the
+type/compiler info byte, the undo number and the table id; the info bits,
+DB_TRX_ID and DB_ROLL_PTR of the record being modified; the unique key
+fields; for an update, the old values of the updated columns; and, for a
+delete marking or when UPD_NODE_NO_ORD_CHANGE is not set in cmpl_info, the
+values of all columns that are ordering fields in some index.
+@return byte offset of the inserted undo log entry on the page on
+success, 0 on failure (for example, when the record does not fit on
+the page) */
+static
+uint16_t
+trx_undo_page_report_modify(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const dtuple_t* row, /*!< in: clustered index row that contains
+ the virtual column info */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->page.frame) <= 2);
+
+ byte* ptr_to_first_free = my_assume_aligned<2>(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame);
+
+ /* The new record is written starting at the first free byte. */
+ const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+ byte *ptr = undo_block->page.frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 50) {
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+ return 0;
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ dict_table_t* table = index->table;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+ bool first_v_col = true;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ /* Remember where the type byte is stored, so that
+ TRX_UNDO_UPD_EXTERN can be set in it later. */
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an over-write. */
+ if (trx_id == trx->id) {
+ ignore_prefix = false;
+ }
+ ptr += mach_u64_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+ ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN));
+
+ ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ /* The ordering columns must not be instant added columns. */
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ulint n_updated = upd_get_n_fields(update);
+
+ /* If this is an online update while an inplace alter table
+ is in progress and the table has virtual columns, we will
+ need to double check if there are any non-indexed columns
+ being registered in the update vector in case they will be
+ indexed in the new table */
+ if (dict_index_is_online_ddl(index) && table->n_v_cols > 0) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ upd_field_t* fld = upd_get_nth_field(
+ update, i);
+ ulint pos = fld->field_no;
+
+ /* These columns must not have an index
+ on them */
+ if (upd_fld_is_virtual_col(fld)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ n_updated--;
+ }
+ }
+ }
+
+ i = 0;
+
+ if (UNIV_UNLIKELY(update->is_alter_metadata())) {
+ ut_ad(update->n_fields >= 1);
+ ut_ad(!upd_fld_is_virtual_col(&update->fields[0]));
+ ut_ad(update->fields[0].field_no
+ == index->first_user_field());
+ ut_ad(!dfield_is_ext(&update->fields[0].new_val));
+ ut_ad(!dfield_is_null(&update->fields[0].new_val));
+ /* The instant ADD COLUMN metadata record does not
+ contain the BLOB. Do not write anything for it:
+ i becomes 1 when rec is not an ALTER metadata record,
+ so that update->fields[0] is skipped and not counted. */
+ i = !rec_is_alter_metadata(rec, *index);
+ n_updated -= i;
+ }
+
+ ptr += mach_write_compressed(ptr, n_updated);
+
+ for (; i < upd_get_n_fields(update); i++) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return 0;
+ }
+
+ upd_field_t* fld = upd_get_nth_field(update, i);
+
+ bool is_virtual = upd_fld_is_virtual_col(fld);
+ ulint max_v_log_len = 0;
+
+ ulint pos = fld->field_no;
+ const dict_col_t* col = NULL;
+
+ if (is_virtual) {
+ /* Skip the non-indexed column, during
+ an online alter table */
+ if (dict_index_is_online_ddl(index)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ continue;
+ }
+
+ /* add REC_MAX_N_FIELDS to mark this
+ is a virtual col */
+ ptr += mach_write_compressed(
+ ptr, pos + REC_MAX_N_FIELDS);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ ut_ad(fld->field_no < table->n_v_def);
+
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ fld->field_no, ptr,
+ first_v_col);
+ if (ptr == NULL) {
+ return(0);
+ }
+ first_v_col = false;
+
+ max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, fld->field_no);
+
+ field = static_cast<byte*>(
+ fld->old_v_val->data);
+ flen = fld->old_v_val->len;
+
+ /* Only log sufficient bytes for index
+ record update */
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ /* The field number was already written
+ above; continue with the length and data. */
+ goto store_len;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(pos >= index->first_user_field());
+ ut_ad(rec_is_metadata(rec, *index));
+
+ if (rec_is_alter_metadata(rec, *index)) {
+ ut_ad(update->is_alter_metadata());
+
+ field = rec_offs_n_fields(offsets)
+ > pos
+ && !rec_offs_nth_default(
+ offsets, pos)
+ ? rec_get_nth_field(
+ rec, offsets,
+ pos, &flen)
+ : index->instant_field_value(
+ pos - 1, &flen);
+
+ if (pos == index->first_user_field()) {
+ ut_ad(rec_offs_nth_extern(
+ offsets, pos));
+ ut_ad(flen == FIELD_REF_SIZE);
+ goto write_field;
+ }
+ col = dict_index_get_nth_col(index,
+ pos - 1);
+ } else if (!update->is_alter_metadata()) {
+ goto get_field;
+ } else {
+ /* We are converting an ADD COLUMN
+ metadata record to an ALTER TABLE
+ metadata record, with BLOB. Subtract
+ the missing metadata BLOB field. */
+ ut_ad(pos > index->first_user_field());
+ --pos;
+ goto get_field;
+ }
+ } else {
+get_field:
+ col = dict_index_get_nth_col(index, pos);
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+ }
+write_field:
+ /* Write field number to undo log */
+ ptr += mach_write_compressed(ptr, pos);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ if (rec_offs_n_fields(offsets) > pos
+ && rec_offs_nth_extern(offsets, pos)) {
+ ut_ad(col || pos == index->first_user_field());
+ ut_ad(col || update->is_alter_metadata());
+ ut_ad(col
+ || rec_is_alter_metadata(rec, *index));
+ ulint prefix_len = col
+ ? dict_max_field_len_store_undo(
+ table, col)
+ : 0;
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ /* A longer prefix is fetched into ext_buf only
+ for ordering columns, and only if the prefix
+ is not to be ignored. */
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col
+ && col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ table->space->zip_size(),
+ &field, &flen, SPATIAL_UNKNOWN);
+
+ /* Flag in the type byte that an externally
+ stored field was updated. */
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+store_len:
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ /* Also record the new value for virtual column */
+ if (is_virtual) {
+ field = static_cast<byte*>(fld->new_val.data);
+ flen = fld->new_val.len;
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+ }
+
+ /* Reset the first_v_col, so to put the virtual column undo
+ version marker again, when we log all the indexed columns */
+ first_v_col = true;
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+ in the purge of old versions where we use it to build and search the
+ delete marked index records, to look if we can remove them from the
+ index tree. Note that starting from 4.0.14 also externally stored
+ fields can be ordering in some index. Starting from 5.2, we no longer
+ store REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
+ but we can construct the column prefix fields in the index by
+ fetching the first page of the BLOB that is pointed to by the
+ clustered index. This works also in crash recovery, because all pages
+ (including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+ double mbr[SPDIMS * 2];
+ mem_heap_t* row_heap = NULL;
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (!col->ord_part) {
+ continue;
+ }
+
+ const ulint pos = dict_index_get_nth_col_pos(
+ index, col_no, NULL);
+ /* All non-virtual columns must be present in
+ the clustered index. */
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ const bool is_ext = rec_offs_nth_extern(offsets, pos);
+ const spatial_status_t spatial_status = is_ext
+ ? dict_col_get_spatial_status(col)
+ : SPATIAL_NONE;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ /* Externally stored spatially indexed
+ columns will be (redundantly) logged
+ again, because we did not write the
+ MBR yet, that is, the previous call to
+ trx_undo_page_report_modify_ext()
+ was with SPATIAL_UNKNOWN. */
+ break;
+ case SPATIAL_NONE:
+ if (!update) {
+ /* This is a DELETE operation. */
+ break;
+ }
+ /* Avoid redundantly logging indexed
+ columns that were updated. */
+
+ for (i = 0; i < update->n_fields; i++) {
+ const ulint field_no
+ = upd_get_nth_field(update, i)
+ ->field_no;
+ if (field_no >= index->n_fields
+ || dict_index_get_nth_field(
+ index, field_no)->col
+ == col) {
+ goto already_logged;
+ }
+ }
+ }
+
+ if (true) {
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_block, ptr) < 5 + 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+
+ if (is_ext) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+ const ulint zip_size
+ = table->space->zip_size();
+
+ /* If there is a spatial index on it,
+ log its MBR */
+ if (spatial_status != SPATIAL_NONE) {
+ ut_ad(DATA_GEOMETRY_MTYPE(
+ col->mtype));
+
+ trx_undo_get_mbr_from_ext(
+ mbr, zip_size,
+ field, &flen);
+ }
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ zip_size,
+ &field, &flen,
+ spatial_status);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ /* For SPATIAL_ONLY columns, only the MBR
+ is written, not the column data. */
+ if (flen != UNIV_SQL_NULL
+ && spatial_status != SPATIAL_ONLY) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ if (spatial_status != SPATIAL_NONE) {
+ if (trx_undo_left(undo_block, ptr)
+ < DATA_MBR_LEN) {
+ return(0);
+ }
+
+ for (int i = 0; i < SPDIMS * 2;
+ i++) {
+ mach_double_write(
+ ptr, mbr[i]);
+ ptr += sizeof(double);
+ }
+ }
+ }
+
+already_logged:
+ continue;
+ }
+
+ /* Log the indexed virtual columns. */
+ for (col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ ulint pos = col_no;
+ ulint max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, pos);
+
+ /* Write field number to undo log.
+ Make sure there is enough space in log */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ pos += REC_MAX_N_FIELDS;
+ ptr += mach_write_compressed(ptr, pos);
+
+ ut_ad(col_no < table->n_v_def);
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, ptr,
+ first_v_col);
+ first_v_col = false;
+
+ if (!ptr) {
+ return(0);
+ }
+
+ const dfield_t* vfield = NULL;
+
+ if (update) {
+ ut_ad(!row);
+ if (update->old_vrow == NULL) {
+ flen = UNIV_SQL_NULL;
+ } else {
+ vfield = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col->v_pos);
+ }
+ } else if (row) {
+ vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ } else {
+ /* NOTE(review): neither update nor
+ row is available. If ut_ad() is
+ compiled out, flen and field would
+ presumably keep whatever values they
+ had before this point - confirm. */
+ ut_ad(0);
+ }
+
+ if (vfield) {
+ field = static_cast<byte*>(vfield->data);
+ flen = vfield->len;
+ } else {
+ ut_ad(flen == UNIV_SQL_NULL);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ /* Fill in the 2 bytes reserved at old_ptr with the size of
+ this section, which includes the 2 bytes themselves. */
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ /* NOTE(review): row_heap is never assigned in this function,
+ so this branch is not taken. */
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_block, ptr) < 2) {
+ return(0);
+ }
+
+ /* The last 2 bytes of the record hold the offset of its start. */
+ mach_write_to_2(ptr, first_free);
+ const uint16_t new_free = static_cast<uint16_t>(
+ ptr + 2 - undo_block->page.frame);
+ /* The first 2 bytes of the record hold the offset of the next
+ record, which is the new first free byte of the page. */
+ mach_write_to_2(undo_block->page.frame + first_free, new_free);
+
+ mach_write_to_2(ptr_to_first_free, new_free);
+
+ /* Pass to the mini-transaction the bytes between the two pointers. */
+ const byte* start = &undo_block->page.frame[first_free + 2];
+ mtr->undo_append(*undo_block, start, ptr - start);
+ return(first_free);
+}
+
+/** Read from an update undo log record the info bits and the system
+column values (DB_TRX_ID and DB_ROLL_PTR) of the old version.
+@param ptr        remaining part of the undo log record after reading
+                  the general parameters
+@param trx_id     out: trx id
+@param roll_ptr   out: roll ptr
+@param info_bits  out: info bits state
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+  const byte* ptr,
+  trx_id_t* trx_id,
+  roll_ptr_t* roll_ptr,
+  byte* info_bits)
+{
+  const byte* cursor = ptr;
+
+  /* One byte of info bits comes first ... */
+  *info_bits = *cursor;
+  ++cursor;
+
+  /* ... followed by the two compressed system column values. */
+  *trx_id = mach_u64_read_next_compressed(&cursor);
+  *roll_ptr = mach_u64_read_next_compressed(&cursor);
+
+  return const_cast<byte*>(cursor);
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+The vector gets one entry per stored column value, followed by two entries
+for DB_TRX_ID and DB_ROLL_PTR. Entries that no longer apply to the index
+are removed from the vector before returning.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector; set to NULL
+ if an error is detected */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+ ulint n_skip_field = 0;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* A delete-mark record stores no updated columns. */
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ n_fields = mach_read_next_compressed(&ptr);
+ } else {
+ n_fields = 0;
+ }
+
+ /* Two extra entries are for DB_TRX_ID and DB_ROLL_PTR. */
+ *upd = update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ mach_write_to_6(buf, trx_id);
+
+ upd_field_set_field_no(upd_field, index->db_trx_id(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(upd_field, index->db_roll_ptr(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ upd_field = upd_get_nth_field(update, i);
+ uint32_t field_no = mach_read_next_compressed(&ptr);
+
+ /* Field numbers of virtual columns are stored with
+ REC_MAX_N_FIELDS added to them. */
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ /* If new version, we need to check index list to figure
+ out the correct virtual column position */
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ /* This column could be dropped or no longer indexed */
+ if (field_no >= index->n_fields) {
+ /* Mark this field as no longer needed */
+ upd_field->field_no = REC_MAX_N_FIELDS;
+
+ /* Read and discard both values that are
+ stored for a virtual column. */
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ n_skip_field++;
+ continue;
+ }
+
+ upd_field_set_v_field_no(
+ upd_field, static_cast<uint16_t>(field_no),
+ index);
+ } else if (UNIV_UNLIKELY((update->info_bits
+ & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
+ const uint32_t uf = index->first_user_field();
+ ut_ad(field_no >= uf);
+
+ if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
+ /* Generic instant ALTER TABLE */
+ if (field_no == uf) {
+ upd_field->new_val.type
+ .metadata_blob_init();
+ } else if (field_no >= index->n_fields) {
+ /* This is reachable during
+ purge if the table was emptied
+ and converted to the canonical
+ format on a later ALTER TABLE.
+ In this case,
+ row_purge_upd_exist_or_extern()
+ would only be interested in
+ freeing any BLOBs that were
+ updated, that is, the metadata
+ BLOB above. Other BLOBs in
+ the metadata record are never
+ updated; they are for the
+ initial DEFAULT values of the
+ instantly added columns, and
+ they will never change.
+
+ Note: if the table becomes
+ empty during ROLLBACK or is
+ empty during subsequent ALTER
+ TABLE, and btr_page_empty() is
+ called to re-create the root
+ page without the metadata
+ record, in that case we should
+ only free the latest version
+ of BLOBs in the record,
+ which purge would never touch. */
+ field_no = REC_MAX_N_FIELDS;
+ n_skip_field++;
+ } else {
+ dict_col_copy_type(
+ dict_index_get_nth_col(
+ index, field_no - 1),
+ &upd_field->new_val.type);
+ }
+ } else {
+ /* Instant ADD COLUMN...LAST */
+ dict_col_copy_type(
+ dict_index_get_nth_col(index,
+ field_no),
+ &upd_field->new_val.type);
+ }
+ upd_field->field_no = field_no
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (field_no < index->n_fields) {
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ index);
+ } else {
+ /* The field number does not exist in the index:
+ report the problem and fail. */
+ ib::error() << "Trying to access update undo rec"
+ " field " << field_no
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " but index has only "
+ << dict_index_get_n_fields(index)
+ << " fields " << BUG_REPORT_MSG
+ << ". Run also CHECK TABLE "
+ << index->table->name << "."
+ " n_fields = " << n_fields << ", i = " << i;
+
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = static_cast<uint16_t>(orig_len);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ /* A length of at least UNIV_EXTERN_STORAGE_FIELD
+ denotes an externally stored column. */
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+
+ ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)
+ || field_no != index->first_user_field()
+ || (upd_field->new_val.ext
+ && upd_field->new_val.len == FIELD_REF_SIZE));
+
+ if (is_virtual) {
+ /* A second value is stored for a virtual column;
+ it is kept in old_v_val. */
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ heap, sizeof *upd_field->old_v_val));
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(upd_field->old_v_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(
+ upd_field->old_v_val, field, len);
+ } else {
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* We may have to skip dropped indexed virtual columns.
+ Also, we may have to trim the update vector of a metadata record
+ if dict_index_t::clear_instant_alter() was invoked on the table
+ later, and the number of fields no longer matches. */
+
+ if (n_skip_field) {
+ /* Compact the vector in place, dropping the entries whose
+ field_no was set to REC_MAX_N_FIELDS above. */
+ upd_field_t* d = upd_get_nth_field(update, 0);
+ const upd_field_t* const end = d + n_fields + 2;
+
+ for (const upd_field_t* s = d; s != end; s++) {
+ if (s->field_no != REC_MAX_N_FIELDS) {
+ *d++ = *s;
+ }
+ }
+
+ ut_ad(d + n_skip_field == end);
+ update->n_fields = d - upd_get_nth_field(update, 0);
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Report a RENAME TABLE operation.
+Appends a TRX_UNDO_RENAME_TABLE record at the first free offset of the
+undo page and advances the TRX_UNDO_PAGE_FREE field of the page past it.
+The record written here consists of: a 2-byte copy of the new first-free
+offset, the type byte, trx->undo_no and table->id in "much compressed"
+form, the table name without a terminating NUL, and finally the 2-byte
+page offset at which this record starts.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@param[in,out] block undo page
+@param[in,out] mtr mini-transaction
+@return byte offset of the undo log record
+@retval 0 in case of failure (trx_undo_left() reports too little space) */
+static
+uint16_t
+trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
+ buf_block_t* block, mtr_t* mtr)
+{
+ byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + block->page.frame);
+ const uint16_t first_free = mach_read_from_2(ptr_first_free);
+ ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+ byte* const start = block->page.frame + first_free;
+ size_t len = strlen(table->name.m_name);
+ const size_t fixed = 2 + 1 + 11 + 11 + 2; /* two 2-byte offsets,
+ the type byte, and 11 bytes for each of the two compressed values.
+ NOTE(review): 11 is presumably the maximum size written by
+ mach_u64_write_much_compressed() - confirm */
+ ut_ad(len <= NAME_CHAR_LEN * 5 * 2 + 1);
+ /* The -10 is used in trx_undo_left() */
+ compile_time_assert(NAME_CHAR_LEN * 5 * 2 + fixed
+ + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+ < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);
+
+ if (trx_undo_left(block, start) < fixed + len) {
+ ut_ad(first_free > TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_HDR_SIZE);
+ return 0;
+ }
+
+ byte* ptr = start + 2; /* the first 2 bytes of the record are
+ filled in last, by the memcpy() below */
+ *ptr++ = TRX_UNDO_RENAME_TABLE;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+ memcpy(ptr, table->name.m_name, len);
+ ptr += len;
+ mach_write_to_2(ptr, first_free); /* trailer: start of this record */
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->page.frame); /* new
+ first-free offset, right after the trailer */
+ memcpy(start, ptr_first_free, 2); /* record header: copy of the new
+ first-free offset */
+ mtr->undo_append(*block, start + 2, ptr - start - 2);
+ return first_free;
+}
+
+/** Report a RENAME TABLE operation.
+Assigns an undo log to the transaction and tries to append the rename
+record to its last page. If the record does not fit, the
+mini-transaction is committed and restarted, one page is added to the
+undo log, and the write is retried. On success the top_page_no,
+top_offset, top_undo_no and guess_block fields of the undo log are
+updated and trx->undo_no is incremented.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+{
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ ut_ad(!table->is_temporary());
+
+ mtr_t mtr;
+ dberr_t err; /* NOTE(review): not initialized here; this relies on
+ trx_undo_assign() setting it whenever no block is returned -
+ confirm */
+ mtr.start();
+ if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) {
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ ut_ad(err == DB_SUCCESS);
+ ut_ad(undo);
+ for (ut_d(int loop_count = 0);;) {
+ ut_ad(loop_count++ < 2); /* debug builds: at most two
+ attempts are expected */
+ ut_ad(undo->last_page_no
+ == block->page.id().page_no());
+
+ if (uint16_t offset = trx_undo_page_report_rename(
+ trx, table, block, &mtr)) {
+ undo->top_page_no = undo->last_page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = block;
+ ut_ad(!undo->empty());
+
+ err = DB_SUCCESS;
+ break;
+ } else {
+ /* The record did not fit: extend the undo log
+ in a fresh mini-transaction and retry. */
+ mtr.commit();
+ mtr.start();
+ block = trx_undo_add_page(undo, &mtr, &err);
+ if (!block) {
+ break;
+ }
+ }
+ }
+ }
+
+ mtr.commit();
+ return err;
+}
+
+TRANSACTIONAL_TARGET ATTRIBUTE_NOINLINE
+/** Check for an exclusive table lock held by a transaction.
+The counter table.n_lock_x_or_s is read first, either inside an
+xbegin()/xend() region (which is aborted if the table lock mutex is
+found locked) or while holding the table lock mutex. Only when the
+counter is nonzero are the table locks of the transaction scanned for
+a lock whose type_mode is exactly LOCK_X | LOCK_TABLE.
+@param trx transaction that is executed by the current thread
+@param table persistent (not temporary) table
+@return whether the transaction holds an exclusive lock on a table */
+static bool trx_has_lock_x(const trx_t &trx, dict_table_t& table)
+{
+ ut_ad(!table.is_temporary());
+
+ uint32_t n;
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table.lock_mutex_is_locked())
+ xabort();
+ n= table.n_lock_x_or_s;
+ xend();
+ }
+ else
+#endif
+ {
+ table.lock_mutex_lock();
+ n= table.n_lock_x_or_s;
+ table.lock_mutex_unlock();
+ }
+
+ /* This thread is executing trx. No other thread can modify our table locks
+ (only record locks might be created, in an implicit-to-explicit conversion).
+ Hence, no mutex is needed here. */
+ if (n)
+ for (const lock_t *lock : trx.lock.table_locks)
+ if (lock && lock->type_mode == (LOCK_X | LOCK_TABLE))
+ return true;
+
+ return false;
+}
+
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+If the record does not fit on the current undo page, the unused tail of
+that page is cleared and the undo log is extended by one page before the
+write is retried. If the record does not even fit on an empty undo page,
+that page is freed again. *roll_ptr is only assigned on success, and only
+when the operation is not handled as a bulk insert.
+@return DB_SUCCESS or error code
+@retval DB_UNDO_RECORD_TOO_BIG if the record did not fit on an empty undo
+page and that page was freed successfully */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+{
+ trx_t* trx;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!update || rec);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ /* This function must not be invoked during rollback
+ (of a TRX_STATE_PREPARE transaction or otherwise). */
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->in_rollback);
+
+ /* We must determine if this is the first time when this
+ transaction modifies this table. */
+ auto m = trx->mod_tables.emplace(index->table, trx->undo_no);
+ ut_ad(m.first->second.valid(trx->undo_no));
+
+ if (m.second && index->table->is_active_ddl()) {
+ trx->apply_online_log= true;
+ }
+
+ bool bulk = !rec; /* only an insert (rec == NULL) can be a bulk insert */
+
+ if (!bulk) {
+ /* An UPDATE or DELETE must not be covered by an
+ earlier start_bulk_insert(). */
+ ut_ad(!m.first->second.is_bulk_insert());
+ } else if (m.first->second.is_bulk_insert()) {
+ /* Above, the emplace() tried to insert an object with
+ !is_bulk_insert(). Only an explicit start_bulk_insert()
+ (below) can set the flag. */
+ ut_ad(!m.second);
+ /* We already wrote a TRX_UNDO_EMPTY record. */
+ ut_ad(thr->run_node);
+ ut_ad(que_node_get_type(thr->run_node) == QUE_NODE_INSERT);
+ ut_ad(trx->bulk_insert);
+ return DB_SUCCESS;
+ } else if (!m.second || !trx->bulk_insert) {
+ bulk = false;
+ } else if (index->table->is_temporary()) { /* bulk remains true here */
+ } else if (trx_has_lock_x(*trx, *index->table)
+ && index->table->bulk_trx_id == trx->id) {
+ m.first->second.start_bulk_insert(index->table);
+
+ if (dberr_t err = m.first->second.bulk_insert_buffered(
+ *clust_entry, *index, trx)) {
+ return err;
+ }
+ } else {
+ bulk = false;
+ }
+
+ mtr_t mtr;
+ dberr_t err;
+ mtr.start();
+ trx_undo_t** pundo;
+ trx_rseg_t* rseg;
+ const bool is_temp = index->table->is_temporary();
+ buf_block_t* undo_block;
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ rseg = trx->get_temp_rseg();
+ pundo = &trx->rsegs.m_noredo.undo;
+ undo_block = trx_undo_assign_low<true>(trx, rseg, pundo,
+ &mtr, &err);
+ } else {
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ pundo = &trx->rsegs.m_redo.undo;
+ rseg = trx->rsegs.m_redo.rseg;
+ undo_block = trx_undo_assign_low<false>(trx, rseg, pundo,
+ &mtr, &err);
+ }
+
+ trx_undo_t* undo = *pundo;
+ ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
+ if (UNIV_UNLIKELY(undo_block == NULL)) {
+err_exit:
+ mtr.commit();
+ return err;
+ }
+
+ ut_ad(undo != NULL);
+
+ do {
+ uint16_t offset = !rec
+ ? trx_undo_page_report_insert(
+ undo_block, trx, index, clust_entry, &mtr,
+ bulk)
+ : trx_undo_page_report_modify(
+ undo_block, trx, index, rec, offsets, update,
+ cmpl_info, clust_entry, &mtr);
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ const uint16_t first_free = mach_read_from_2(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame);
+ memset(undo_block->page.frame + first_free, 0,
+ (srv_page_size - FIL_PAGE_DATA_END)
+ - first_free);
+
+ if (first_free
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP_PAGE. */
+
+ mtr.commit();
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ err = trx_undo_free_last_page(undo, &mtr);
+ rseg->latch.wr_unlock();
+
+ if (m.second) {
+ /* We are not going to modify
+ this table after all. */
+ trx->mod_tables.erase(m.first);
+ }
+
+ if (err == DB_SUCCESS) {
+ err = DB_UNDO_RECORD_TOO_BIG;
+ }
+ goto err_exit;
+ } else {
+ /* Write log for clearing the unused
+ tail of the undo page. It might
+ contain some garbage from a previously
+ written record, and mtr_t::write()
+ will optimize away writes of unchanged
+ bytes. Failure to write this caused a
+ recovery failure when we avoided
+ reading the undo log page from the
+ data file and initialized it based on
+ redo log records (which included the
+ write of the previous garbage). */
+ mtr.memset(*undo_block, first_free,
+ srv_page_size - first_free
+ - FIL_PAGE_DATA_END, 0);
+ }
+
+ mtr.commit();
+ } else {
+ /* Success */
+ undo->top_page_no = undo_block->page.id().page_no();
+ mtr.commit();
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = undo_block;
+ ut_ad(!undo->empty());
+
+ if (!is_temp) {
+ trx_mod_table_time_t& time = m.first->second;
+ ut_ad(time.valid(undo->top_undo_no));
+
+ if (!time.is_versioned()
+ && index->table->versioned_by_id()
+ && (!rec /* INSERT */
+ || (update
+ && update->affects_versioned()))) {
+ time.set_versioned(undo->top_undo_no);
+ }
+ }
+
+ if (!bulk) {
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, trx_sys.rseg_id(rseg, !is_temp),
+ undo->top_page_no, offset);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2); /* debug builds: only one extension
+ per call is expected */
+ mtr.start();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ undo_block = trx_undo_add_page(undo, &mtr, &err);
+
+ DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
+ undo_block = NULL;);
+ } while (UNIV_LIKELY(undo_block != NULL));
+
+ /* The undo log could not be extended. Only running out of file
+ space is reported to the client connection below. */
+ if (err != DB_OUT_OF_FILE_SPACE) {
+ goto err_exit;
+ }
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ DB_OUT_OF_FILE_SPACE,
+ //ER_INNODB_UNDO_LOG_FULL,
+ "No more space left over in %s tablespace for allocating UNDO"
+ " log pages. Please add new data file to the tablespace or"
+ " check if filesystem is full or enable auto-extension for"
+ " the tablespace",
+ undo->rseg->space == fil_system.sys_space
+ ? "system" : is_temp ? "temporary" : "undo");
+
+ goto err_exit;
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/** Copy an undo record to heap.
+The roll pointer is decoded into a rollback segment, page number and
+page offset; the undo page is fetched with RW_S_LATCH inside a local
+mini-transaction. The first 2 bytes of the record on the page are read
+as the page offset of the end of the record. In the returned copy,
+these 2 bytes are overwritten with the length of the record.
+@param[in] roll_ptr roll pointer to a record that exists
+@param[in,out] heap memory heap where copied
+@return copy of the undo log record, allocated from heap
+@retval nullptr if buf_page_get() returned no page, or if the end offset
+stored in the record is not within (offset, page end) */
+static
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ bool is_insert;
+ mtr_t mtr;
+
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ trx_undo_rec_t *undo_rec= nullptr;
+ if (const buf_block_t* undo_page=
+ buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr))
+ {
+ undo_rec= undo_page->page.frame + offset;
+ const size_t end= mach_read_from_2(undo_rec);
+ if (UNIV_UNLIKELY(end <= offset ||
+ end >= srv_page_size - FIL_PAGE_DATA_END))
+ undo_rec= nullptr; /* the stored end offset is not plausible */
+ else
+ {
+ size_t len{end - offset};
+ undo_rec=
+ static_cast<trx_undo_rec_t*>(mem_heap_dup(heap, undo_rec, len));
+ mach_write_to_2(undo_rec, len); /* the copy starts with its length */
+ }
+ }
+
+ mtr.commit();
+ return undo_rec;
+}
+
+/** Fetch a heap copy of an undo log record for checking whether a
+secondary index record can be safely purged. The copy is only made when
+the changes of trx_id are not visible in the view that is provided by
+purge_sys_t::view_guard; the guard is held while the record is copied.
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name (not used by this function)
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the version is visible to purge_sys.view, or if
+trx_undo_get_undo_rec_low() could not copy the record */
+static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id,
+                                                     const table_name_t &name,
+                                                     roll_ptr_t roll_ptr,
+                                                     mem_heap_t* heap)
+{
+  purge_sys_t::view_guard guard;
+
+  if (guard.view().changes_visible(trx_id))
+    return nullptr;
+
+  /* The guard goes out of scope only after the record has been copied. */
+  return trx_undo_get_undo_rec_low(roll_ptr, heap);
+}
+
+/** Fetch a heap copy of an undo log record. The copy is only made when
+the changes of trx_id are not visible in the view that is provided by
+purge_sys_t::end_view_guard; the guard is held while the record is
+copied.
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name (not used by this function)
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the undo log is not available */
+static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id,
+                                             const table_name_t &name,
+                                             roll_ptr_t roll_ptr,
+                                             mem_heap_t *heap)
+{
+  purge_sys_t::end_view_guard guard;
+
+  return guard.view().changes_visible(trx_id)
+    ? nullptr
+    : trx_undo_get_undo_rec_low(roll_ptr, heap);
+}
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ differs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status that determines whether this function
+ is invoked by purge (TRX_UNDO_PREV_IN_PURGE),
+ whether we read the "after image" of the undo
+ log (TRX_UNDO_GET_OLD_V_VALUE), and, when equal
+ to TRX_UNDO_CHECK_PURGEABILITY, that the undo
+ record is fetched with
+ trx_undo_get_rec_if_purgeable()
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing
+@retval DB_CORRUPTION if row_upd_index_replace_new_col_vals() fails and
+TRX_UNDO_PREV_IN_PURGE is not set in v_status */
+TRANSACTIONAL_TARGET
+dberr_t
+trx_undo_prev_version_build(
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
+ ulint v_status)
+{
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte type;
+ byte info_bits;
+ byte cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return DB_SUCCESS;
+ }
+
+ mariadb_increment_undo_records_read();
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY
+ ? trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name,
+ roll_ptr, heap)
+ : trx_undo_get_undo_rec(rec_trx_id, index->table->name,
+ roll_ptr, heap);
+ if (!undo_rec) {
+ return DB_MISSING_HISTORY;
+ }
+
+ const byte *ptr =
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return DB_SUCCESS;
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys.view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys.view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ ut_a(ptr);
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ if (update->info_bits & REC_INFO_DELETED_FLAG
+ && purge_sys.is_purgeable(trx_id)) {
+ return DB_SUCCESS;
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ if (!row_upd_index_replace_new_col_vals(entry, *index, update,
+ heap)) {
+ return (v_status & TRX_UNDO_PREV_IN_PURGE)
+ ? DB_MISSING_HISTORY : DB_CORRUPTION;
+ }
+
+ /* Get number of externally stored columns in updated record */
+ const ulint n_ext = index->is_primary()
+ ? dtuple_get_n_ext(entry) : 0;
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ /* No field changes its size or external storage: copy rec
+ and overwrite the info bits and the updated fields in place. */
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, true, offsets);
+ rec_set_bit_field_1(*old_vers, update->info_bits,
+ rec_offs_comp(offsets)
+ ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf)) {
+ /* There are no virtual columns in
+ a clustered index record. */
+ continue;
+ }
+ const ulint n = uf->field_no;
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+ ut_ad(!index->table->not_redundant());
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+ - l;
+ *b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ continue;
+ }
+
+ ulint len;
+ memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+ uf->new_val.data, uf->new_val.len);
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ ut_ad(uf->new_val.len
+ == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ *(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+ &= byte(~REC_1BYTE_SQL_NULL_MASK);
+ }
+ }
+ }
+
+ /* Set the old value (which is the after image of an update) in the
+ update vector to dtuple vrow */
+ if (v_status & TRX_UNDO_GET_OLD_V_VALUE) {
+ row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update,
+ false, nullptr, nullptr);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_dbg);
+ ut_a(!rec_offs_any_null_extern(
+ *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap)));
+#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+
+ if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(*vrow)) {
+ *vrow = dtuple_create_with_vcol(
+ v_heap ? v_heap : heap,
+ dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+ dtuple_init_v_fld(*vrow);
+ }
+
+ ut_ad(index->table->n_v_cols);
+ trx_undo_read_v_cols(index->table, ptr, *vrow,
+ v_status & TRX_UNDO_PREV_IN_PURGE);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Read virtual column value from undo log
+The first 2 bytes at ptr hold the length of the section that is parsed,
+counted from ptr (that is, including these 2 bytes). Every entry in the
+section is parsed so that the pointer advances, but only the values of
+virtual columns are stored in row.
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge)
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+ while (ptr < end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(
+ const_cast<const byte**>(&ptr));
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+ /* The virtual column is no longer indexed or does not exist.
+ This check needs to be placed after trx_undo_rec_get_col_val()
+ so that the undo ptr advances */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ /* In purge, a field is only filled in if its type is
+ still DATA_MISSING; otherwise it is always overwritten. */
+ if (!in_purge
+ || dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+ }
+ }
+ }
+
+ ut_ad(ptr == end_ptr);
+}
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000..f21ba422
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,933 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#include <my_service_manager.h>
+#include <mysql/service_wsrep.h>
+
+#include "fsp0fsp.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t trx_rollback_clean_thread_key; /* only defined when
+UNIV_PFS_THREAD is set. NOTE(review): presumably the performance schema
+key of the thread that rolls back recovered transactions - confirm */
+#endif
+
+/** true if trx_rollback_all_recovered() thread is active */
+bool trx_rollback_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+const trx_t* trx_roll_crash_recv_trx;
+
+/** Finish transaction rollback.
+If error_state is DB_SUCCESS, the transaction is committed with commit().
+Otherwise error_state must be DB_INTERRUPTED: the undo log objects of
+the transaction are removed from the undo_list of their rollback segment
+and freed, and the transaction is finished with commit_low() and
+commit_cleanup(). In both cases apply_online_log is cleared first.
+@return whether the rollback was completed normally
+@retval false if the rollback was aborted by shutdown */
+inline bool trx_t::rollback_finish()
+{
+ apply_online_log= false;
+ if (UNIV_LIKELY(error_state == DB_SUCCESS))
+ {
+ commit();
+ return true;
+ }
+
+ ut_a(error_state == DB_INTERRUPTED);
+ ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+ ut_a(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+ ut_d(in_rollback= false);
+ if (trx_undo_t *&undo= rsegs.m_redo.undo) /* reference: the member is
+ reset to nullptr below */
+ {
+ UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ commit_low();
+ commit_cleanup();
+ return false;
+}
+
+/** Roll back an active transaction.
+If the transaction has_logged(), a rollback query graph is built around
+a roll_node_t and executed. A complete rollback (savept == nullptr) is
+then finished with rollback_finish(). For a partial rollback, the
+entries of mod_tables for which rollback(savept->least_undo_no) holds
+are erased, and apply_online_log is recomputed from the remaining ones.
+The outcome is reported in error_state.
+@param savept savepoint to roll back to, or nullptr to roll back
+the whole transaction */
+inline void trx_t::rollback_low(trx_savept_t *savept)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ roll_node_t *roll_node= roll_node_create(heap);
+ roll_node->savept= savept;
+
+ ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+ {
+ const auto s= state;
+ ut_ad(s == TRX_STATE_ACTIVE ||
+ s == TRX_STATE_PREPARED ||
+ s == TRX_STATE_PREPARED_RECOVERED);
+ if (savept)
+ {
+ ut_ad(s == TRX_STATE_ACTIVE);
+ ut_ad(mysql_thd);
+ ut_ad(!is_recovered);
+ }
+ }
+#endif
+
+ error_state = DB_SUCCESS;
+
+ if (has_logged())
+ {
+ ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+ que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+ nullptr);
+ ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
+ que_run_threads(thr);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+ }
+
+ if (!savept)
+ {
+ rollback_finish();
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ }
+ else
+ {
+ /* There must not be partial rollback if transaction was chosen as deadlock
+ victim. Galera transaction abort can be invoked during partial rollback. */
+ ut_ad(!(lock.was_chosen_as_deadlock_victim & 1));
+ ut_a(error_state == DB_SUCCESS);
+ const undo_no_t limit= savept->least_undo_no;
+ apply_online_log= false;
+ for (trx_mod_tables_t::iterator i= mod_tables.begin();
+ i != mod_tables.end(); )
+ {
+ trx_mod_tables_t::iterator j= i++; /* advance before a possible erase(j) */
+ ut_ad(j->second.valid());
+ if (j->second.rollback(limit))
+ {
+ j->second.clear_bulk_buffer();
+ mod_tables.erase(j);
+ }
+ else if (!apply_online_log)
+ apply_online_log= j->first->is_active_ddl();
+ }
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ mem_heap_free(heap);
+}
+
+/** Initiate rollback.
+A transaction in TRX_STATE_NOT_STARTED has nothing to roll back:
+error_state is reset and DB_SUCCESS is returned. Otherwise the
+transaction is expected to be in TRX_STATE_ACTIVE and the work is
+delegated to rollback_low().
+@param savept savepoint, or nullptr (rollback_low() then rolls back
+the whole transaction)
+@return error code or DB_SUCCESS */
+dberr_t trx_t::rollback(trx_savept_t *savept)
+{
+ ut_ad(!mutex_is_owner());
+ if (state == TRX_STATE_NOT_STARTED)
+ {
+ error_state= DB_SUCCESS;
+ return DB_SUCCESS;
+ }
+ ut_ad(state == TRX_STATE_ACTIVE);
+#ifdef WITH_WSREP
+ /* Only a complete rollback (no savepoint) is passed on to
+ wsrep_handle_SR_rollback(). */
+ if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+ wsrep_handle_SR_rollback(nullptr, mysql_thd);
+#endif /* WITH_WSREP */
+ rollback_low(savept);
+ return error_state;
+}
+
+/** Roll back a transaction used in MySQL.
+trx->op_info is set to "rollback" while trx_t::rollback_low() runs and
+is reset to the empty string afterwards.
+@param[in,out]	trx	transaction
+@return error code or DB_SUCCESS (the value of trx->error_state) */
+static dberr_t trx_rollback_for_mysql_low(trx_t* trx)
+{
+	trx->op_info = "rollback";
+	trx->rollback_low();
+	trx->op_info = "";
+
+	const dberr_t outcome = trx->error_state;
+	return outcome;
+}
+
+/** Rollback a transaction used in MySQL
+The action depends on trx->state: nothing is rolled back for
+TRX_STATE_NOT_STARTED; an ACTIVE transaction is rolled back directly;
+for a PREPARED transaction that has a persistent undo log, the undo log
+state is first changed in a separate mini-transaction (see below).
+TRX_STATE_COMMITTED_IN_MEMORY is not allowed and ends in ut_error.
+@param[in, out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t trx_rollback_for_mysql(trx_t* trx)
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx->will_lock = false;
+ ut_ad(trx->mysql_thd);
+ /* Galera transaction abort can be invoked from MDL acquisition
+ code, so trx->lock.was_chosen_as_deadlock_victim can be set
+ even if trx->state is TRX_STATE_NOT_STARTED. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
+#ifdef WITH_WSREP
+ trx->wsrep= false;
+ trx->lock.was_chosen_as_deadlock_victim= false;
+#endif
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ ut_ad(!trx->is_autocommit_non_locking());
+ if (trx->rsegs.m_redo.undo) {
+ /* The XA ROLLBACK of a XA PREPARE transaction
+ will consist of multiple mini-transactions.
+
+ As the very first step of XA ROLLBACK, we must
+ change the undo log state back from
+ TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order
+ to ensure that recovery will complete the
+ rollback.
+
+ Failure to perform this step could cause a
+ situation where we would roll back part of
+ a XA PREPARE transaction, the server would be
+ killed, and finally, the transaction would be
+ recovered in XA PREPARE state, with some of
+ the actions already having been rolled back. */
+ ut_ad(trx->rsegs.m_redo.undo->rseg
+ == trx->rsegs.m_redo.rseg);
+ mtr_t mtr;
+ mtr.start();
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ trx_undo_set_state_at_prepare(trx, undo, true,
+ &mtr);
+ }
+ /* Write the redo log for the XA ROLLBACK
+ state change to the global buffer. It is
+ not necessary to flush the redo log. If
+ a durable log write of a later mini-transaction
+ takes place for whatever reason, then this state
+ change will be durable as well. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ }
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+For an ACTIVE transaction, this rolls back to trx->last_sql_stat_start,
+rolls back and refreshes the last-statement FTS savepoint if
+trx->fts_trx is set, moves last_sql_stat_start to the current
+trx->undo_no and calls trx->end_bulk_insert(). Nothing is done for
+TRX_STATE_NOT_STARTED; any other state ends in ut_error.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx->rollback(&trx->last_sql_stat_start);
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_rollback_last_stmt(trx);
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ /* The next statement starts from the current undo number. */
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ trx->end_bulk_insert();
+
+ trx->op_info = "";
+
+ return(err);
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/** Look up a savepoint of a transaction by name.
+@param[in]	trx	transaction
+@param[in]	name	savepoint name
+@return the first savepoint in trx->trx_savepoints whose name equals name
+@retval NULL if no savepoint has that name */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+	trx_t*		trx,
+	const char*	name)
+{
+	trx_named_savept_t*	cur = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+	/* Advance until the list ends or the names match. */
+	while (cur != NULL && strcmp(cur->name, name) != 0) {
+		cur = UT_LIST_GET_NEXT(trx_savepoints, cur);
+	}
+
+	return(cur);
+}
+
+/** Unlink one savepoint from a transaction and release its memory.
+@param[in,out]	trx	transaction whose trx_savepoints list holds savep
+@param[in,out]	savep	savepoint to unlink and free */
+static
+void
+trx_roll_savepoint_free(
+	trx_t*			trx,
+	trx_named_savept_t*	savep)
+{
+	auto	name = savep->name;
+
+	UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+	/* The name string is a separate allocation from the struct. */
+	ut_free(name);
+	ut_free(savep);
+}
+
+/** Free a savepoint and every savepoint that follows it in the list.
+@param savept  first savepoint to discard; nothing happens if it is null */
+void trx_t::savepoints_discard(trx_named_savept_t *savept)
+{
+  for (trx_named_savept_t *next; savept; savept= next)
+  {
+    /* Fetch the successor before the node is unlinked and freed. */
+    next= UT_LIST_GET_NEXT(trx_savepoints, savept);
+    trx_roll_savepoint_free(this, savept);
+  }
+}
+
+/** Roll an ACTIVE transaction back to an already located savepoint.
+Savepoints created after savep are freed; savep itself is kept. The
+statement start marker is refreshed afterwards via
+trx_mark_sql_stat_end().
+@param[in,out]	trx			transaction
+@param[in,out]	savep			savepoint to roll back to
+@param[out]	mysql_binlog_cache_pos	binlog cache position that was
+					stored in the savepoint
+@return the result of trx_t::rollback() */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+	trx_t*			trx,
+	trx_named_savept_t*	savep,
+	int64_t*		mysql_binlog_cache_pos)
+{
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+	ut_ad(trx->mysql_thd);
+
+	/* Everything strictly after savep in the list goes away. */
+	trx_named_savept_t*	later
+		= UT_LIST_GET_NEXT(trx_savepoints, savep);
+	trx->savepoints_discard(later);
+
+	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+	trx->op_info = "rollback to a savepoint";
+	const dberr_t	result = trx->rollback(&savep->savept);
+
+	/* Remember the current undo_no, so that a rollback of the
+	next SQL statement knows where to stop. */
+	trx_mark_sql_stat_end(trx);
+
+	trx->op_info = "";
+	return(result);
+}
+
+/** Roll a transaction back to the savepoint with the given name.
+trx->state is read without holding trx->mutex, because savepoint
+rollback is invoked for a running transaction that is associated with
+the current thread.
+@param[in,out]	trx			transaction handle
+@param[in]	savepoint_name		savepoint name
+@param[out]	mysql_binlog_cache_pos	binlog cache position stored in
+					the savepoint; only written when
+					the rollback is attempted
+@retval DB_NO_SAVEPOINT	if no savepoint has that name
+@retval DB_ERROR	if a savepoint exists although the transaction
+			is not started
+@return otherwise the result of the rollback */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+	trx_t*		trx,
+	const char*	savepoint_name,
+	int64_t*	mysql_binlog_cache_pos)
+{
+	ut_ad(trx->mysql_thd);
+
+	trx_named_savept_t* const	found
+		= trx_savepoint_find(trx, savepoint_name);
+
+	if (!found) {
+		return(DB_NO_SAVEPOINT);
+	}
+
+	switch (trx->state) {
+	case TRX_STATE_ACTIVE:
+		return(trx_rollback_to_savepoint_for_mysql_low(
+			       trx, found, mysql_binlog_cache_pos));
+
+	case TRX_STATE_NOT_STARTED:
+		ib::error() << "Transaction has a savepoint "
+			<< found->name
+			<< " though it is not started";
+		return(DB_ERROR);
+
+	case TRX_STATE_PREPARED:
+	case TRX_STATE_PREPARED_RECOVERED:
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+		/* Savepoint rollback is only defined for an ACTIVE
+		transaction; fall through to the fatal error. */
+		break;
+	}
+
+	ut_error;
+	return(DB_CORRUPTION);
+}
+
+/** Create a named savepoint, starting the transaction if it is not
+yet started. An existing savepoint of the same name is freed and
+replaced; the new savepoint is appended as the last element of
+trx->trx_savepoints. Savepoints are deleted in a transaction commit or
+rollback.
+@param[in,out]	trx			transaction handle
+@param[in]	savepoint_name		savepoint name
+@param[in]	binlog_cache_pos	MySQL binlog cache position of
+					this connection at the time of
+					the savepoint
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+	trx_t*		trx,
+	const char*	savepoint_name,
+	int64_t		binlog_cache_pos)
+{
+	trx_named_savept_t*	savep;
+
+	trx_start_if_not_started_xa(trx, false);
+
+	savep = trx_savepoint_find(trx, savepoint_name);
+
+	if (savep) {
+		/* Same name again: drop the old savepoint through the
+		shared helper instead of repeating the unlink-and-free
+		sequence here. */
+		trx_roll_savepoint_free(trx, savep);
+	}
+
+	/* Create a new savepoint and add it as the last in the list */
+
+	savep = static_cast<trx_named_savept_t*>(
+		ut_malloc_nokey(sizeof(*savep)));
+
+	savep->name = mem_strdup(savepoint_name);
+
+	/* Both the savepoint and the statement start marker point at
+	the current undo number. */
+	savep->savept.least_undo_no = trx->undo_no;
+	trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+	savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+	UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+
+	trx->end_bulk_insert();
+
+	return(DB_SUCCESS);
+}
+
+/** Release (free) only the named savepoint; later savepoints are
+left as they are.
+@param[in,out]	trx		transaction handle
+@param[in]	savepoint_name	savepoint name
+@retval DB_SUCCESS	if the savepoint was found and freed, or if it
+			was not found while
+			trx->last_sql_stat_start.least_undo_no is 0
+@retval DB_NO_SAVEPOINT	otherwise */
+dberr_t
+trx_release_savepoint_for_mysql(
+	trx_t*		trx,
+	const char*	savepoint_name)
+{
+	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true)
+	      || trx_state_eq(trx, TRX_STATE_PREPARED, true));
+	ut_ad(trx->mysql_thd);
+
+	if (trx_named_savept_t* found
+	    = trx_savepoint_find(trx, savepoint_name)) {
+		trx_roll_savepoint_free(trx, found);
+		return DB_SUCCESS;
+	}
+
+	/* Bulk insert could have discarded savepoints */
+	return trx->last_sql_stat_start.least_undo_no == 0
+		? DB_SUCCESS
+		: DB_NO_SAVEPOINT;
+}
+
+/*******************************************************************//**
+Roll back an active transaction by building and running a rollback
+query graph for it. The data dictionary is locked around the rollback
+when trx->dict_operation is set. When trx->rollback_finish() returns
+true, the id of the rolled back transaction is logged. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ /* Copied up front; it is used for the log message after
+ rollback_finish() has been called. */
+ const trx_id_t trx_id = trx->id;
+
+ ut_ad(trx_id);
+
+ /* Build the graph fork -> thr -> roll_node in a private heap. */
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ /* Set for the duration of this rollback; reset to NULL at the end. */
+ trx_roll_crash_recv_trx = trx;
+
+ const bool dictionary_locked = trx->dict_operation;
+
+ if (dictionary_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ /* The first run must leave an undo thread in the roll node;
+ that thread is then run as well. */
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the graph that owns the undo thread. */
+ que_graph_free(
+ static_cast<que_t*>(roll_node->undo_thr->common.parent));
+
+ if (UNIV_UNLIKELY(!trx->rollback_finish())) {
+ ut_ad(!dictionary_locked);
+ } else {
+ ib::info() << "Rolled back recovered transaction " << trx_id;
+ }
+
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+
+/** Accumulator filled in by trx_roll_count_callback(). */
+struct trx_roll_count_callback_arg
+{
+  /** number of recovered ACTIVE transactions counted so far */
+  uint32_t n_trx= 0;
+  /** sum of trx_t::undo_no over the counted transactions */
+  uint64_t n_rows= 0;
+};
+
+
+/** rw_trx_hash iteration callback: count one recovered ACTIVE
+transaction and add its undo_no to the row total.
+@param[in,out] element  hash element; its mutex is held while it is examined
+@param[in,out] arg      accumulator
+@return 0 always */
+static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
+                                       trx_roll_count_callback_arg *arg)
+{
+  element->mutex.wr_lock();
+  trx_t *trx= element->trx;
+  if (trx && trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
+  {
+    ++arg->n_trx;
+    arg->n_rows+= trx->undo_no;
+  }
+  element->mutex.wr_unlock();
+  return 0;
+}
+
+/** Report progress when rolling back a row of a recovered transaction.
+Nothing is reported unless recv_sys.report() says a report is due. */
+void trx_roll_report_progress()
+{
+	const time_t	now = time(NULL);
+
+	mysql_mutex_lock(&recv_sys.mutex);
+	const bool	due = recv_sys.report(now);
+	mysql_mutex_unlock(&recv_sys.mutex);
+
+	if (!due) {
+		return;
+	}
+
+	/* Get number of recovered active transactions and number of
+	rows they modified. Numbers must be accurate, because only this
+	thread is allowed to touch recovered transactions. */
+	trx_roll_count_callback_arg	totals;
+	trx_sys.rw_trx_hash.iterate_no_dups(
+		trx_roll_count_callback, &totals);
+
+	/* The service manager timeout is only extended while there
+	are rows left; the log message is written either way. */
+	if (totals.n_rows > 0) {
+		service_manager_extend_timeout(
+			INNODB_EXTEND_TIMEOUT_INTERVAL,
+			"To roll back: " UINT32PF " transactions, "
+			UINT64PF " rows", totals.n_trx, totals.n_rows);
+	}
+
+	ib::info() << "To roll back: " << totals.n_trx
+		   << " transactions, " << totals.n_rows << " rows";
+}
+
+
+/** rw_trx_hash iteration callback: collect recovered ACTIVE
+transactions.
+@param[in,out] element   hash element; its mutex is held while it is examined
+@param[in,out] trx_list  receives each matching transaction
+@return 0 always */
+static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
+                                               std::vector<trx_t*> *trx_list)
+{
+  element->mutex.wr_lock();
+  trx_t *const trx= element->trx;
+  if (trx)
+  {
+    /* The state is evaluated and the list is appended to while
+    trx->mutex is held. */
+    trx->mutex_lock();
+    const bool to_roll_back=
+      trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered;
+    if (to_roll_back)
+      trx_list->push_back(trx);
+    trx->mutex_unlock();
+  }
+  element->mutex.wr_unlock();
+  return 0;
+}
+
+/**
+ Rollback any incomplete transactions which were encountered in crash recovery.
+
+ Only transactions that are both recovered and in TRX_STATE_ACTIVE are
+ collected and processed here. Each one is either rolled back and freed,
+ discarded without rollback (fast shutdown in progress), or left alone
+ (when all=false and it is neither a dictionary transaction nor holding
+ a statistics table lock).
+
+ Note: For XA recovered transactions, we rely on MySQL to
+ do rollback. They will be in TRX_STATE_PREPARED state. If the server
+ is shutdown and they are still lingering in trx_sys_t::trx_list
+ then the shutdown will hang.
+
+ @param[in] all true=roll back all recovered active transactions;
+ false=roll back any incomplete dictionary transaction
+*/
+
+void trx_rollback_recovered(bool all)
+{
+ std::vector<trx_t*> trx_list;
+
+ /* The caller must not request a rollback that the
+ innodb_force_recovery level forbids. */
+ ut_a(srv_force_recovery <
+ ulong(all ? SRV_FORCE_NO_TRX_UNDO : SRV_FORCE_NO_DDL_UNDO));
+
+ /*
+ Collect list of recovered ACTIVE transaction ids first. Once collected, no
+ other thread is allowed to modify or remove these transactions from
+ rw_trx_hash.
+ */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
+ &trx_list);
+
+ /* Process the collected transactions from the back of the vector. */
+ while (!trx_list.empty())
+ {
+ trx_t *trx= trx_list.back();
+ trx_list.pop_back();
+
+ ut_ad(trx);
+ ut_d(trx->mutex_lock());
+ ut_ad(trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_d(trx->mutex_unlock());
+
+ /* Shutdown is in progress, undo sources are gone and fast shutdown
+ was requested: skip the rollback and jump into the discard path
+ inside the nested block below. */
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
+ srv_fast_shutdown)
+ goto discard;
+
+ if (all || trx->dict_operation || trx->has_stats_table_lock())
+ {
+ trx_rollback_active(trx);
+ if (trx->error_state != DB_SUCCESS)
+ {
+ /* The only expected failure is an interrupted rollback; clear the
+ error and discard the transaction object. */
+ ut_ad(trx->error_state == DB_INTERRUPTED);
+ trx->error_state= DB_SUCCESS;
+ ut_ad(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+discard:
+ /* Note: before kill_server() invoked innobase_end() via
+ unireg_end(), it invoked close_connections(), which should initiate
+ the rollback of any user transactions via THD::cleanup() in the
+ connection threads, and wait for all THD::cleanup() to complete.
+ So, no active user transactions should exist at this point.
+
+ srv_undo_sources=false was cleared early in innobase_end().
+
+ Generally, the server guarantees that all connections using
+ InnoDB must be disconnected by the time we are reaching this code,
+ be it during shutdown or UNINSTALL PLUGIN.
+
+ Because there is no possible race condition with any
+ concurrent user transaction, we do not have to invoke
+ trx->commit_state() or wait for !trx->is_referenced()
+ before trx_sys.deregister_rw(trx). */
+ trx_sys.deregister_rw(trx);
+ trx_free_at_shutdown(trx);
+ }
+ else
+ /* The rollback completed without error: free the transaction
+ object. */
+ trx->free();
+ }
+ }
+}
+
+/** Roll back all recovered ACTIVE transactions and then clear
+trx_rollback_is_active.
+Note: this is done in a background thread.
+The unnamed void* parameter is not used. */
+void trx_rollback_all_recovered(void*)
+{
+	ut_ad(!srv_read_only_mode);
+
+	const bool	have_rw_trx = trx_sys.rw_trx_hash.size() != 0;
+
+	if (have_rw_trx) {
+		ib::info() << "Starting in background the rollback of"
+			" recovered transactions";
+		trx_rollback_recovered(true);
+		ib::info() << "Rollback of non-prepared transactions"
+			" completed";
+	}
+
+	/* Cleared whether or not there was anything to roll back. */
+	trx_rollback_is_active = false;
+}
+
+/** Build an undo 'query' graph for a transaction. The rollback itself
+is performed by executing this graph. The caller must hold trx->mutex.
+@param[in,out]	trx	transaction
+@return own: the query graph, allocated in a heap created here */
+static
+que_t*
+trx_roll_graph_build(
+	trx_t*	trx)
+{
+	ut_ad(trx->mutex_is_owner());
+
+	mem_heap_t* const	heap = mem_heap_create(512);
+	que_fork_t* const	fork = que_fork_create(heap);
+
+	fork->trx = trx;
+
+	/* One query thread whose child is the row undo node. */
+	que_thr_t* const	thr = que_thr_create(fork, heap, NULL);
+
+	thr->child = row_undo_node_create(trx, thr, heap);
+
+	return(fork);
+}
+
+/** Start a rollback: initialize the rollback fields of the transaction
+and create the UNDO graph that will do the actual undo operation.
+The caller must hold trx->mutex, and no rollback may be in progress.
+@param[in,out]	trx		transaction
+@param[in]	roll_limit	rollback to this undo no (for partial
+				undo), 0 if we are rolling back the
+				entire transaction
+@return query graph thread that will perform the UNDO operations */
+static
+que_thr_t*
+trx_rollback_start(
+	trx_t*		trx,
+	undo_no_t	roll_limit)
+{
+	ut_ad(trx->mutex_is_owner());
+	ut_ad(!trx->roll_limit);
+	ut_ad(!trx->in_rollback);
+
+	trx->in_rollback = true;
+	trx->roll_limit = roll_limit;
+
+	/* The limit cannot lie beyond what the transaction has logged. */
+	ut_a(trx->roll_limit <= trx->undo_no);
+
+	trx->pages_undone = 0;
+
+	/* Build a 'query' graph which will perform the undo operations */
+	que_t* const	undo_graph = trx_roll_graph_build(trx);
+
+	trx->graph = undo_graph;
+
+	return(que_fork_start_command(undo_graph));
+}
+
+/** Create a rollback command node.
+@param[in,out]	heap	memory heap in which the node is allocated
+@return own: zero-filled rollback node of type QUE_NODE_ROLLBACK
+in state ROLL_NODE_SEND */
+roll_node_t*
+roll_node_create(
+	mem_heap_t*	heap)
+{
+	roll_node_t* const	node = static_cast<roll_node_t*>(
+		mem_heap_zalloc(heap, sizeof(roll_node_t)));
+
+	node->common.type = QUE_NODE_ROLLBACK;
+	node->state = ROLL_NODE_SEND;
+
+	return(node);
+}
+
+/** Perform an execution step for a rollback command node in a query
+graph. In state ROLL_NODE_SEND the rollback is started and the node
+moves to ROLL_NODE_WAIT; in ROLL_NODE_WAIT control is handed back to
+the parent node.
+@param[in,out]	thr	query thread
+@return thr */
+que_thr_t*
+trx_rollback_step(
+	que_thr_t*	thr)
+{
+	roll_node_t* const	node
+		= static_cast<roll_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+	/* Entered from the parent: begin a fresh rollback. */
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = ROLL_NODE_SEND;
+	}
+
+	if (node->state != ROLL_NODE_SEND) {
+		ut_ad(node->state == ROLL_NODE_WAIT);
+
+		thr->run_node = que_node_get_parent(node);
+		return(thr);
+	}
+
+	trx_t* const	trx = thr_get_trx(thr);
+
+	node->state = ROLL_NODE_WAIT;
+
+	ut_a(node->undo_thr == NULL);
+
+	/* Without a savepoint the whole transaction is rolled back. */
+	const ib_id_t	roll_limit
+		= node->savept ? node->savept->least_undo_no : 0;
+
+	trx->mutex_lock();
+
+	trx_commit_or_rollback_prepare(trx);
+
+	node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+	trx->mutex_unlock();
+
+	return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000..8d1a381c
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,727 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+#include "log.h"
+
+#ifdef WITH_WSREP
+# include <mysql/service_wsrep.h>
+
+/* The replacement lists below are parenthesized so that each macro
+behaves as a single operand. Without the parentheses, an expression
+such as (TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO)
+subtracts only the first term of TRX_RSEG_WSREP_XID_INFO and adds the
+rest, yielding a length that is too large. */
+
+/** The offset to WSREP XID headers, after TRX_RSEG */
+# define TRX_RSEG_WSREP_XID_INFO (TRX_RSEG_MAX_TRX_ID + 16 + 512)
+
+/** WSREP XID format (1 if present and valid, 0 if not present) */
+# define TRX_RSEG_WSREP_XID_FORMAT (TRX_RSEG_WSREP_XID_INFO)
+/** WSREP XID GTRID length */
+# define TRX_RSEG_WSREP_XID_GTRID_LEN (TRX_RSEG_WSREP_XID_INFO + 4)
+/** WSREP XID bqual length */
+# define TRX_RSEG_WSREP_XID_BQUAL_LEN (TRX_RSEG_WSREP_XID_INFO + 8)
+/** WSREP XID data (XIDDATASIZE bytes) */
+# define TRX_RSEG_WSREP_XID_DATA (TRX_RSEG_WSREP_XID_INFO + 12)
+
+# ifdef UNIV_DEBUG
+/** The latest known WSREP XID sequence number */
+static long long wsrep_seqno = -1;
+# endif /* UNIV_DEBUG */
+/** The latest known WSREP XID UUID */
+static unsigned char wsrep_uuid[16];
+
+/** Write the WSREP XID information into rollback segment header.
+The format, GTRID length and bqual length are written as 4-byte
+fields, followed by the XID data. Any bytes of the XIDDATASIZE-byte
+data field that lie beyond the XID are zero-filled if they are not
+already zero.
+@param[in,out]	rseg_header	rollback segment header
+@param[in]	xid		WSREP XID
+@param[in,out]	mtr		mini transaction */
+static void
+trx_rseg_write_wsrep_checkpoint(
+	buf_block_t*	rseg_header,
+	const XID*	xid,
+	mtr_t*		mtr)
+{
+	DBUG_ASSERT(xid->gtrid_length >= 0);
+	DBUG_ASSERT(xid->bqual_length >= 0);
+	DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
+
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+				       + rseg_header->page.frame,
+				       uint32_t(xid->formatID));
+
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+				       + rseg_header->page.frame,
+				       uint32_t(xid->gtrid_length));
+
+	mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+				       TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+				       + rseg_header->page.frame,
+				       uint32_t(xid->bqual_length));
+
+	const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+						    + xid->bqual_length);
+	mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+				      TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+				      + rseg_header->page.frame,
+				      xid->data, xid_length);
+	/* Inspect the tail of the data field, i.e. the bytes that
+	follow the XID just written. Comparing from the start of the
+	field would test the XID itself and could leave stale tail
+	bytes in place. */
+	if (xid_length < XIDDATASIZE
+	    && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length
+		      + rseg_header->page.frame, field_ref_zero,
+		      XIDDATASIZE - xid_length)) {
+		mtr->memset(rseg_header,
+			    TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
+			    XIDDATASIZE - xid_length, 0);
+	}
+}
+
+/** Update the WSREP XID information in rollback segment header.
+In debug builds this also tracks the latest UUID and sequence number,
+asserting that the sequence number grows while the UUID is unchanged.
+@param[in,out]	rseg_header	rollback segment header
+@param[in]	xid		WSREP XID
+@param[in,out]	mtr		mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+	buf_block_t*	rseg_header,
+	const XID*	xid,
+	mtr_t*		mtr)
+{
+	ut_ad(wsrep_is_wsrep_xid(xid));
+
+#ifdef UNIV_DEBUG
+	const long long	seqno = wsrep_xid_seqno(xid);
+	const byte*	uuid = wsrep_xid_uuid(xid);
+	const bool	same_uuid_with_seqno = seqno != -1
+		&& !memcmp(uuid, wsrep_uuid, sizeof wsrep_uuid);
+
+	if (same_uuid_with_seqno) {
+		/* Check that seqno is monotonically increasing */
+		ut_ad(seqno > wsrep_seqno);
+	} else {
+		memcpy(wsrep_uuid, uuid, sizeof wsrep_uuid);
+	}
+
+	wsrep_seqno = seqno;
+#endif /* UNIV_DEBUG */
+	trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
+}
+
+/** Update the WSREP XID in the header of rollback segment 0 and, if
+the UUID of the XID differs from the cached wsrep_uuid, zero the WSREP
+XID area in the headers of all other rollback segments.
+@param[in]	xid	WSREP XID
+@param[in,out]	mtr	mini-transaction
+@return error code reported by the rollback segment header lookups */
+static dberr_t trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
+{
+ dberr_t err;
+ buf_block_t *rseg_header = trx_sys.rseg_array[0].get(mtr, &err);
+
+ if (UNIV_UNLIKELY(!rseg_header))
+ return err;
+
+ /* We must make check against wsrep_uuid here, the
+ trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with xid
+ contents in debug mode and the memcmp() will never give nonzero
+ result. */
+ const bool must_clear_rsegs=
+ memcmp(wsrep_uuid, wsrep_xid_uuid(xid), sizeof wsrep_uuid);
+
+ /* A nonzero TRX_RSEG_FORMAT field triggers a format upgrade of the
+ header before the XID is written. */
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_header->page.frame)))
+ trx_rseg_format_upgrade(rseg_header, mtr);
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs)
+ /* Because the UUID part of the WSREP XID differed from
+ wsrep_uuid, the WSREP group UUID was changed, and we must
+ reset the XID in all rollback segment headers. */
+ /* NOTE(review): each get() call below overwrites err, so the value
+ returned reflects the last rollback segment visited - confirm that
+ this is intended. */
+ for (ulint rseg_id= 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id)
+ if (buf_block_t* block= trx_sys.rseg_array[rseg_id].get(mtr, &err))
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE -
+ TRX_RSEG_WSREP_XID_INFO, 0);
+ return err;
+}
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already
+stored into rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+The update runs in its own mini-transaction, which is committed before
+returning. The error code of the update is not propagated.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid)
+{
+ mtr_t mtr;
+ mtr.start();
+ /* The dberr_t result is deliberately or accidentally discarded here;
+ this function returns void. */
+ trx_rseg_update_wsrep_checkpoint(xid, &mtr);
+ mtr.commit();
+}
+
+/** Read the WSREP XID information in rollback segment header.
+@param[in]	rseg_header	Rollback segment header
+@param[out]	xid		Transaction XID; only written when the
+				stored format ID is nonzero
+@return whether the WSREP XID was present */
+static
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
+{
+	/* Start of the rollback segment header within the page frame. */
+	const byte*	hdr = rseg_header->page.frame + TRX_RSEG;
+
+	const int	format = static_cast<int>(
+		mach_read_from_4(hdr + TRX_RSEG_WSREP_XID_FORMAT));
+
+	if (!format) {
+		/* A format ID of 0 means that no XID is stored. */
+		return false;
+	}
+
+	xid.formatID = format;
+	xid.gtrid_length = static_cast<int>(
+		mach_read_from_4(hdr + TRX_RSEG_WSREP_XID_GTRID_LEN));
+	xid.bqual_length = static_cast<int>(
+		mach_read_from_4(hdr + TRX_RSEG_WSREP_XID_BQUAL_LEN));
+
+	memcpy(xid.data, hdr + TRX_RSEG_WSREP_XID_DATA, XIDDATASIZE);
+
+	return true;
+}
+
+/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
+@param[in]	page	TRX_SYS page
+@param[out]	xid	WSREP XID; only written when the magic number
+			stored in the page matches
+@return whether the WSREP XID is present */
+static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
+{
+	/* Start of the WSREP XID area within the TRX_SYS page. */
+	const page_t*	info = page + TRX_SYS + TRX_SYS_WSREP_XID_INFO;
+
+	if (mach_read_from_4(info + TRX_SYS_WSREP_XID_MAGIC_N_FLD)
+	    != TRX_SYS_WSREP_XID_MAGIC_N) {
+		return false;
+	}
+
+	xid.formatID = static_cast<int>(
+		mach_read_from_4(info + TRX_SYS_WSREP_XID_FORMAT));
+	xid.gtrid_length = static_cast<int>(
+		mach_read_from_4(info + TRX_SYS_WSREP_XID_GTRID_LEN));
+	xid.bqual_length = static_cast<int>(
+		mach_read_from_4(info + TRX_SYS_WSREP_XID_BQUAL_LEN));
+
+	memcpy(xid.data, info + TRX_SYS_WSREP_XID_DATA, XIDDATASIZE);
+
+	return true;
+}
+
+/** Recover the latest WSREP checkpoint XID.
+Every rollback segment header is inspected in its own mini-transaction;
+the XID with the greatest sequence number wins, and its UUID is copied
+to wsrep_uuid.
+@param[out] xid WSREP XID; only written when a checkpoint is found
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid)
+{
+ mtr_t mtr;
+ long long max_xid_seqno = -1;
+ bool found = false;
+
+ /* mtr.commit() is part of the loop increment, so every "continue"
+ below still commits the mini-transaction started in the body. */
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
+ rseg_id++, mtr.commit()) {
+ mtr.start();
+ const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ if (UNIV_UNLIKELY(!sys)) {
+ continue;
+ }
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+
+ /* FIL_NULL: this slot has no rollback segment header page. */
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ const buf_block_t* rseg_header = buf_page_get_gen(
+ page_id_t(trx_sysf_rseg_get_space(sys, rseg_id),
+ page_no),
+ 0, RW_S_LATCH, nullptr, BUF_GET, &mtr);
+
+ if (!rseg_header) {
+ continue;
+ }
+
+ /* Headers with a nonzero TRX_RSEG_FORMAT field are skipped. */
+ if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->page.frame)) {
+ continue;
+ }
+
+ XID tmp_xid;
+ long long tmp_seqno = 0;
+ /* Keep the XID only if one is stored and its sequence number
+ exceeds the greatest one seen so far. */
+ if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
+ && (tmp_seqno = wsrep_xid_seqno(&tmp_xid))
+ > max_xid_seqno) {
+ found = true;
+ max_xid_seqno = tmp_seqno;
+ xid = tmp_xid;
+ memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid),
+ sizeof wsrep_uuid);
+ }
+ }
+
+ return found;
+}
+#endif /* WITH_WSREP */
+
+/** Acquire the rollback segment header page with an exclusive latch.
+@param[in,out] mtr  mini-transaction
+@param[out]    err  error code; may be nullptr
+@return the rollback segment header page
+@retval nullptr if the tablespace is not set (*err is then
+DB_TABLESPACE_NOT_FOUND) or if the page could not be obtained */
+buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const
+{
+  if (space)
+    return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr,
+                            BUF_GET, mtr, err);
+
+  if (err)
+    *err= DB_TABLESPACE_NOT_FOUND;
+  return nullptr;
+}
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+The 4-byte TRX_RSEG_FORMAT field is zeroed, and so is everything from
+8 bytes past TRX_RSEG_MAX_TRX_ID up to the page trailer.
+@param[in,out]	rseg_header	rollback segment header page
+@param[in,out]	mtr		mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
+{
+  /* Page offset of the first byte to be cleared after
+  TRX_RSEG_MAX_TRX_ID. */
+  const ulint tail_start= TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8;
+
+  mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+
+  /* Clear also possible garbage at the end of the page. Old
+  InnoDB versions did not initialize unused parts of pages. */
+  mtr->memset(rseg_header, tail_start,
+              srv_page_size - (FIL_PAGE_DATA_END + tail_start), 0);
+}
+
+/** Create a rollback segment header.
+@param[in,out]	space		system, undo, or temporary tablespace
+@param[in]	rseg_id		rollback segment identifier (not
+				referenced in the function body)
+@param[in]	max_trx_id	new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out]	mtr		mini-transaction
+@param[out]	err		error code
+@return the created rollback segment
+@retval	nullptr	on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+                                    trx_id_t max_trx_id, mtr_t *mtr,
+                                    dberr_t *err)
+{
+  ut_ad(mtr->memo_contains(*space));
+
+  buf_block_t *header=
+    fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr, err);
+  if (!header)
+    return nullptr;
+
+  /* Start of the rollback segment header within the page frame. */
+  byte *const base= header->page.frame + TRX_RSEG;
+
+  /* These fields are expected to be zero on the new page. */
+  ut_ad(0 == mach_read_from_4(base + TRX_RSEG_FORMAT));
+  ut_ad(0 == mach_read_from_4(base + TRX_RSEG_HISTORY_SIZE));
+  ut_ad(0 == mach_read_from_4(base + TRX_RSEG_MAX_TRX_ID));
+
+  /* Initialize the history list */
+  flst_init(header, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+  mtr->write<8,mtr_t::MAYBE_NOP>(*header, base + TRX_RSEG_MAX_TRX_ID,
+                                 max_trx_id);
+
+  /* Reset the undo log slots: every byte of every 4-byte slot
+  becomes 0xff. */
+  mtr->memset(header, TRX_RSEG_UNDO_SLOTS + TRX_RSEG, TRX_RSEG_N_SLOTS * 4,
+              0xff);
+
+  return header;
+}
+
+/** Destroy the latch and free all cached undo log objects.
+The list of active undo logs must be empty. */
+void trx_rseg_t::destroy()
+{
+  latch.destroy();
+
+  /* There can't be any active transactions. */
+  ut_a(!UT_LIST_GET_LEN(undo_list));
+
+  /* Pop and free cached undo objects until the list is empty. */
+  while (trx_undo_t *undo= UT_LIST_GET_FIRST(undo_cached))
+  {
+    UT_LIST_REMOVE(undo_cached, undo);
+    ut_free(undo);
+  }
+}
+
+/** Initialize the rollback segment object.
+@param[in] space  tablespace that contains the rollback segment
+@param[in] page   page number of the rollback segment header */
+void trx_rseg_t::init(fil_space_t *space, uint32_t page)
+{
+  /* If a tablespace is already set, it must differ from the new one. */
+  ut_ad(!this->space || this->space != space);
+
+  latch.SRW_LOCK_INIT(trx_rseg_latch_key);
+
+  this->space= space;
+  page_no= page;
+  curr_size= 1;
+  last_page_no= FIL_NULL;
+
+  /* Both lists are linked through trx_undo_t::undo_list. */
+  UT_LIST_INIT(undo_cached, &trx_undo_t::undo_list);
+  UT_LIST_INIT(undo_list, &trx_undo_t::undo_list);
+}
+
+/** Reset a persistent rollback segment to its initial state, freeing
+all cached undo log objects. There must be no active undo logs and no
+references to the rollback segment.
+@param[in] page  page number of the rollback segment header; must
+                 equal the current page_no */
+void trx_rseg_t::reinit(uint32_t page)
+{
+  ut_ad(is_persistent());
+  ut_ad(page_no == page);
+  ut_a(!UT_LIST_GET_LEN(undo_list));
+  ut_ad(!history_size || UT_LIST_GET_FIRST(undo_cached));
+
+  page_no= page;
+  history_size= 0;
+
+  /* Pop and free cached undo objects until the list is empty. */
+  while (trx_undo_t *undo= UT_LIST_GET_FIRST(undo_cached))
+  {
+    UT_LIST_REMOVE(undo_cached, undo);
+    ut_free(undo);
+  }
+
+  ut_ad(!is_referenced());
+  curr_size= 1;
+  last_page_no= FIL_NULL;
+  last_commit_and_offset= 0;
+  needs_purge= 0;
+  ref.store(0, std::memory_order_release);
+}
+
+/** Read the undo log lists.
+For every used slot of the rollback segment header an undo log object
+is created and its size is added to rseg->curr_size. Finally trx_sys
+is told whether any undo log was found that has a nonzero size and is
+not in TRX_UNDO_CACHED state.
+@param[in,out]	rseg		rollback segment
+@param[in]	rseg_header	rollback segment header
+@return error code
+@retval DB_CORRUPTION if an undo log object could not be created */
+static dberr_t trx_undo_lists_init(trx_rseg_t *rseg,
+                                   const buf_block_t *rseg_header)
+{
+  ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+  bool found_non_empty= false;
+
+  for (ulint slot= 0; slot < TRX_RSEG_N_SLOTS; slot++)
+  {
+    const uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, slot);
+    if (page_no == FIL_NULL)
+      continue; /* unused slot */
+
+    const trx_undo_t *undo=
+      trx_undo_mem_create_at_db_start(rseg, slot, page_no);
+    if (!undo)
+      return DB_CORRUPTION;
+
+    /* Once a non-empty undo log was seen, the flag stays set. */
+    if (!found_non_empty)
+      found_non_empty= undo->size && undo->state != TRX_UNDO_CACHED;
+
+    rseg->curr_size+= undo->size;
+  }
+
+  trx_sys.set_undo_non_empty(found_non_empty);
+  return DB_SUCCESS;
+}
+
+/** Restore the state of a persistent rollback segment.
+Reads the rollback segment header page to pick up the maximum
+transaction id and binlog/WSREP metadata, then (except in
+SRV_OPERATION_RESTORE) rebuilds the undo log lists and the history
+list bookkeeping of the in-memory object.
+@param[in,out] rseg persistent rollback segment
+@param[in,out] mtr mini-transaction
+@return error code
+@retval DB_TABLESPACE_NOT_FOUND if rseg->space is not set */
+static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
+{
+ if (!rseg->space)
+ return DB_TABLESPACE_NOT_FOUND;
+ dberr_t err;
+ const buf_block_t *rseg_hdr=
+ buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr,
+ &err);
+ if (!rseg_hdr)
+ return err;
+
+ /* The fields read in this branch are only consulted when the
+ TRX_RSEG_FORMAT field is zero. */
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->page.frame))
+ {
+ trx_id_t id= mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ rseg_hdr->page.frame);
+
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+
+ const byte *binlog_name=
+ TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame;
+ /* A nonzero first byte means that a binlog name is stored. */
+ if (*binlog_name)
+ {
+ lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + rseg_hdr->page.frame));
+ static_assert(TRX_RSEG_BINLOG_NAME_LEN ==
+ sizeof trx_sys.recovered_binlog_filename, "compatibility");
+ /* Keep the binlog position from the header page that has the
+ greatest page LSN seen so far. */
+ if (lsn > trx_sys.recovered_binlog_lsn)
+ {
+ trx_sys.recovered_binlog_lsn= lsn;
+ trx_sys.recovered_binlog_offset=
+ mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET +
+ rseg_hdr->page.frame);
+ memcpy(trx_sys.recovered_binlog_filename, binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
+
+#ifdef WITH_WSREP
+ /* The return value (whether an XID was present) is not checked. */
+ trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
+#endif
+ }
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rseg_hdr->page.frame) + 1;
+ err= trx_undo_lists_init(rseg, rseg_hdr);
+ /* The empty statement is intentional: on failure the history list
+ processing is skipped and err is returned at the end. */
+ if (err != DB_SUCCESS);
+ else if (auto len= flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame))
+ {
+ rseg->history_size+= len;
+
+ /* Locate the undo log header that contains the last node of the
+ history list: the node is embedded at offset TRX_UNDO_HISTORY_NODE
+ of that header. */
+ fil_addr_t node_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame);
+ node_addr.boffset= static_cast<uint16_t>(node_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ rseg->last_page_no= node_addr.page;
+
+ const buf_block_t* block=
+ buf_page_get_gen(page_id_t(rseg->space->id, node_addr.page),
+ 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err);
+ if (!block)
+ return err;
+
+ /* needs_purge becomes the maximum of its old value and the
+ TRX_UNDO_TRX_ID and TRX_UNDO_TRX_NO fields of that undo log header. */
+ trx_id_t id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_ID);
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+ id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_NO);
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+
+ /* At this point id holds the TRX_UNDO_TRX_NO value. */
+ rseg->set_last_commit(node_addr.boffset, id);
+ ut_ad(mach_read_from_2(block->page.frame + node_addr.boffset +
+ TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (rseg->last_page_no != FIL_NULL)
+ /* There is no need to cover this operation by the purge
+ mutex because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
+
+ /* This runs on the error path of trx_undo_lists_init() as well. */
+ trx_sys.set_undo_non_empty(rseg->history_size > 0);
+ return err;
+}
+
+/** Pick up the binlog position that old versions stored in the TRX_SYS
+page, in case we are upgrading from MySQL or a MariaDB version older than
+10.3.5. With WITH_WSREP, trx_rseg_init_wsrep_xid() is invoked as well.
+@param[in] page  frame of the TRX_SYS page */
+static void trx_rseg_init_binlog_info(const page_t* page)
+{
+  const page_t *const log_info= page + TRX_SYS + TRX_SYS_MYSQL_LOG_INFO;
+  const bool has_binlog_info=
+    mach_read_from_4(log_info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) ==
+    TRX_SYS_MYSQL_LOG_MAGIC_N;
+
+  if (has_binlog_info)
+  {
+    /* The magic number matches: copy the stored name and offset. */
+    memcpy(trx_sys.recovered_binlog_filename,
+           log_info + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+    trx_sys.recovered_binlog_offset=
+      mach_read_from_8(log_info + TRX_SYS_MYSQL_LOG_OFFSET);
+  }
+
+#ifdef WITH_WSREP
+  trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
+#endif
+}
+
+/** Initialize or recover the rollback segments at startup.
+For every slot of the TRX_SYS page that refers to a rollback segment
+header page, the rollback segment is restored with
+trx_rseg_mem_restore(). The greatest transaction identifier found, plus
+one, is passed to trx_sys.init_max_trx_id().
+@return DB_SUCCESS or an error code; on error, the objects in the
+undo_list of every element of trx_sys.rseg_array are freed */
+dberr_t trx_rseg_array_init()
+{
+ trx_id_t max_trx_id = 0;
+
+ *trx_sys.recovered_binlog_filename = '\0';
+ trx_sys.recovered_binlog_offset = 0;
+#ifdef WITH_WSREP
+ trx_sys.recovered_wsrep_xid.null();
+ XID wsrep_sys_xid;
+ wsrep_sys_xid.null();
+ bool wsrep_xid_in_rseg_found = false;
+#endif
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ mtr.start(); /* one mini-transaction per slot */
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (rseg_id == 0) {
+ /* In case this is an upgrade from
+ before MariaDB 10.3.5, fetch the base
+ information from the TRX_SYS page. */
+ max_trx_id = mach_read_from_8(
+ TRX_SYS + TRX_SYS_TRX_ID_STORE
+ + sys->page.frame);
+ trx_rseg_init_binlog_info(sys->page.frame);
+#ifdef WITH_WSREP
+ wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+#endif
+ }
+
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+ if (page_no != FIL_NULL) { /* the slot is in use */
+ trx_rseg_t& rseg = trx_sys.rseg_array[rseg_id];
+ uint32_t space_id=
+ trx_sysf_rseg_get_space(
+ sys, rseg_id);
+
+ fil_space_t *rseg_space =
+ fil_space_get(space_id);
+ if (!rseg_space) {
+ mtr.commit();
+ err = DB_ERROR;
+ sql_print_error(
+ "InnoDB: Failed to open the undo "
+ "tablespace undo%03" PRIu32,
+ (space_id -
+ srv_undo_space_id_start + 1));
+ break;
+ }
+
+ rseg.init(rseg_space, page_no);
+ ut_ad(rseg.is_persistent());
+ err = trx_rseg_mem_restore(&rseg, &mtr);
+ if (rseg.needs_purge > max_trx_id) {
+ max_trx_id = rseg.needs_purge;
+ }
+ if (err != DB_SUCCESS) {
+ mtr.commit(); /* commit before leaving the loop; the
+ commit at the end of the loop body is skipped by break */
+ break;
+ }
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null() &&
+ !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
+ wsrep_xid_in_rseg_found = true;
+ ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
+ wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
+ sizeof wsrep_uuid)
+ || wsrep_xid_seqno(
+ &wsrep_sys_xid)
+ <= wsrep_xid_seqno(
+ &trx_sys.recovered_wsrep_xid));
+ }
+#endif
+ }
+ }
+
+ mtr.commit();
+ }
+
+ if (err != DB_SUCCESS) {
+ for (auto& rseg : trx_sys.rseg_array) {
+ while (auto u = UT_LIST_GET_FIRST(rseg.undo_list)) {
+ UT_LIST_REMOVE(rseg.undo_list, u);
+ ut_free(u);
+ }
+ }
+ return err;
+ }
+
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null()) {
+ /* Upgrade from a version prior to 10.3.5,
+ where WSREP XID was stored in TRX_SYS page.
+ If no rollback segment has a WSREP XID set,
+ we must copy the XID found in TRX_SYS page
+ to rollback segments. */
+ mtr.start();
+
+ if (!wsrep_xid_in_rseg_found) {
+ trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
+ }
+
+ /* Finally, clear WSREP XID in TRX_SYS page. */
+ mtr.memset(trx_sysf_get(&mtr),
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+ TRX_SYS_WSREP_XID_LEN, 0);
+ mtr.commit();
+ }
+#endif
+
+ trx_sys.init_max_trx_id(max_trx_id + 1);
+ return DB_SUCCESS;
+}
+
+/** Create the temporary rollback segments.
+Each element of trx_sys.temp_rsegs gets a header page in
+fil_system.temp_space, created in its own mini-transaction with
+MTR_LOG_NO_REDO.
+@param[in,out] mtr  mini-transaction object; started and committed here
+@return DB_SUCCESS, or the error reported by trx_rseg_header_create() */
+dberr_t trx_temp_rseg_create(mtr_t *mtr)
+{
+  for (ulong slot= 0; slot < array_elements(trx_sys.temp_rsegs); ++slot)
+  {
+    dberr_t err;
+
+    mtr->start();
+    mtr->set_log_mode(MTR_LOG_NO_REDO);
+    mtr->x_lock_space(fil_system.temp_space);
+
+    buf_block_t *header=
+      trx_rseg_header_create(fil_system.temp_space, slot, 0, mtr, &err);
+
+    if (header)
+      trx_sys.temp_rsegs[slot].init(fil_system.temp_space,
+                                    header->page.id().page_no());
+
+    /* The mini-transaction is committed on success and on failure. */
+    mtr->commit();
+
+    if (UNIV_UNLIKELY(!header))
+      return err;
+  }
+
+  return DB_SUCCESS;
+}
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+A file name that needs more than TRX_RSEG_BINLOG_NAME_LEN bytes
+(including the terminating NUL) is ignored and nothing is written.
+@param[in,out] rseg_header rollback segment header
+@param[in] log_file_name binlog file name
+@param[in] log_offset binlog file offset
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header,
+                                   const char *log_file_name,
+                                   ulonglong log_offset,
+                                   mtr_t *mtr)
+{
+  DBUG_PRINT("trx", ("trx_mysql_binlog_offset %llu", log_offset));
+
+  /* Size of the name including the terminating NUL byte */
+  const size_t name_size= strlen(log_file_name) + 1;
+  ut_ad(name_size > 1);
+
+  if (UNIV_UNLIKELY(name_size > TRX_RSEG_BINLOG_NAME_LEN))
+    return;
+
+  byte *const rseg= TRX_RSEG + rseg_header->page.frame;
+
+  mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+                                 rseg + TRX_RSEG_BINLOG_OFFSET,
+                                 log_offset);
+
+  byte *const stored_name= rseg + TRX_RSEG_BINLOG_NAME;
+
+  /* Skip the copy when the stored name is already identical. */
+  if (memcmp(log_file_name, stored_name, name_size) != 0)
+    mtr->memcpy(*rseg_header, stored_name, log_file_name, name_size);
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000..319ba99a
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,370 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+#include "mysqld.h"
+#include "sql_error.h"
+
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** The transaction system. This is the single global instance:
+trx_sys_t::create() asserts that it is invoked on this object. */
+trx_sys_t trx_sys;
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging.
+Only defined in UNIV_DEBUG builds. */
+uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** Log the recovered binlog file name and position at info level.
+Nothing is printed when trx_sys.recovered_binlog_filename is empty. */
+void
+trx_sys_print_mysql_binlog_offset()
+{
+  const bool have_binlog_position = *trx_sys.recovered_binlog_filename;
+
+  if (have_binlog_position) {
+    ib::info() << "Last binlog file '"
+               << trx_sys.recovered_binlog_filename
+               << "', position "
+               << trx_sys.recovered_binlog_offset;
+  }
+}
+
+/** Find an available rollback segment slot.
+The slots are scanned in ascending order and the first one whose page
+number is FIL_NULL is reported.
+@param[in] sys_header  TRX_SYS page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header)
+{
+  ulint slot = 0;
+
+  while (slot < TRX_SYS_N_RSEGS
+         && trx_sysf_rseg_get_page_no(sys_header, slot) != FIL_NULL) {
+    slot++;
+  }
+
+  if (slot < TRX_SYS_N_RSEGS) {
+    return(slot);
+  }
+
+  return(ULINT_UNDEFINED);
+}
+
+/** Count the number of initialized persistent rollback segment slots.
+The result is stored in srv_available_undo_logs; it is 0 when the TRX_SYS
+page cannot be read. */
+static
+void
+trx_sysf_get_n_rseg_slots()
+{
+  mtr_t mtr;
+  mtr.start();
+
+  srv_available_undo_logs = 0;
+
+  const buf_block_t* sys_header = trx_sysf_get(&mtr, false);
+
+  for (ulint slot = 0; sys_header && slot < TRX_SYS_N_RSEGS; slot++) {
+    if (trx_sysf_rseg_get_page_no(sys_header, slot) != FIL_NULL) {
+      srv_available_undo_logs++;
+    }
+  }
+
+  mtr.commit();
+}
+
+/** Initialize the transaction system when creating the database.
+Creates the TRX_SYS page and the first rollback segment header page in
+the system tablespace, then calls trx_lists_init_at_db_start().
+@param[in,out] mtr  mini-transaction object; started and committed here
+@return error code */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
+{
+  dberr_t err;
+
+  mtr->start();
+  mtr->x_lock_space(fil_system.sys_space);
+  static_assert(TRX_SYS_SPACE == 0, "compatibility");
+
+  /* Create the trx sys file block in a new allocated file segment */
+  buf_block_t *const sys_page=
+    fseg_create(fil_system.sys_space, TRX_SYS + TRX_SYS_FSEG_HEADER,
+                mtr, &err);
+  if (UNIV_UNLIKELY(!sys_page))
+  {
+    mtr->commit();
+    return err;
+  }
+  ut_a(sys_page->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+  mtr->write<2>(*sys_page, FIL_PAGE_TYPE + sys_page->page.frame,
+                FIL_PAGE_TYPE_TRX_SYS);
+
+  /* Reset the rollback segment slots. Old versions of InnoDB
+  (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
+  that the whole array is initialized. */
+  static_assert(256 >= TRX_SYS_N_RSEGS, "");
+  static_assert(TRX_SYS + TRX_SYS_RSEGS + 256 * TRX_SYS_RSEG_SLOT_SIZE <=
+                UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END, "");
+  /* Slot 0 refers to the first rollback segment header page ... */
+  mtr->write<4>(*sys_page,
+                TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+                sys_page->page.frame,
+                FSP_FIRST_RSEG_PAGE_NO);
+  /* ... and the remaining 255 slots are filled with 0xff bytes. */
+  mtr->memset(sys_page, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SLOT_SIZE,
+              255 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
+
+  buf_block_t *const rseg_page=
+    trx_rseg_header_create(fil_system.sys_space, 0, 0, mtr, &err);
+  if (UNIV_UNLIKELY(!rseg_page))
+  {
+    mtr->commit();
+    return err;
+  }
+  ut_a(rseg_page->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+  mtr->commit();
+
+  return trx_lists_init_at_db_start();
+}
+
+void trx_sys_t::create() /* set up trx_list and rw_trx_hash and mark the object initialised */
+{
+ ut_ad(this == &trx_sys); /* only the global trx_sys may be created */
+ ut_ad(!is_initialised()); /* must not already be initialised */
+ m_initialised= true;
+ trx_list.create();
+ rw_trx_hash.init();
+}
+
+/** Sum up history_size of all elements of rseg_array.
+Every rseg latch is read-locked before it is read, and all latches are
+released only after the last element has been added.
+@return the sum of rseg.history_size */
+size_t trx_sys_t::history_size()
+{
+  ut_ad(is_initialised());
+
+  size_t total= 0;
+
+  for (size_t i= 0; i < array_elements(rseg_array); i++)
+  {
+    auto &r= rseg_array[i];
+    r.latch.rd_lock(SRW_LOCK_CALL);
+    total+= r.history_size;
+  }
+
+  for (size_t i= 0; i < array_elements(rseg_array); i++)
+    rseg_array[i].latch.rd_unlock();
+
+  return total;
+}
+
+/** Check whether the sum of rseg.history_size exceeds a threshold.
+The rseg latches are read-locked one by one while summing; the scan stops
+as soon as the running sum is greater than the threshold. All acquired
+latches are then released in reverse order.
+@param threshold  limit to compare the sum against
+@return whether the sum is greater than threshold */
+bool trx_sys_t::history_exceeds(size_t threshold)
+{
+  ut_ad(is_initialised());
+
+  size_t total= 0;
+  size_t n_locked= 0;
+  bool over= false;
+
+  while (!over && n_locked < array_elements(rseg_array))
+  {
+    auto &r= rseg_array[n_locked++];
+    r.latch.rd_lock(SRW_LOCK_CALL);
+    total+= r.history_size;
+    over= total > threshold;
+  }
+
+  for (; n_locked; n_locked--)
+    rseg_array[n_locked - 1].latch.rd_unlock();
+
+  return over;
+}
+
+/* Reads rseg.history_size without acquiring rseg.latch.
+Returns whether any element of rseg_array has a nonzero history_size. */
+TPOOL_SUPPRESS_TSAN bool trx_sys_t::history_exists()
+{
+  ut_ad(is_initialised());
+
+  for (size_t i= 0; i < array_elements(rseg_array); i++)
+  {
+    if (rseg_array[i].history_size != 0)
+      return true;
+  }
+
+  return false;
+}
+
+/* Reads rseg.history_size without acquiring rseg.latch.
+Returns the sum of history_size over rseg_array. */
+TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const
+{
+  ut_ad(is_initialised());
+
+  size_t total= 0;
+
+  for (size_t i= 0; i < array_elements(rseg_array); i++)
+    total+= rseg_array[i].history_size;
+
+  return total;
+}
+
+/** Create a persistent rollback segment.
+A free slot is looked up in the TRX_SYS page, a rollback segment header
+page is created in the given tablespace, and the slot is updated with the
+tablespace id and the page number of the header.
+@param space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval nullptr on failure */
+static trx_rseg_t *trx_rseg_create(uint32_t space_id)
+{
+  trx_rseg_t *rseg= nullptr;
+  mtr_t mtr;
+
+  mtr.start();
+
+  fil_space_t *space= mtr.x_lock_space(space_id);
+  ut_ad(!space || space->purpose == FIL_TYPE_TABLESPACE);
+
+  /* Each step below is attempted only if the previous one succeeded. */
+  buf_block_t *sys_header= space ? trx_sysf_get(&mtr) : nullptr;
+  const ulint rseg_id= sys_header
+    ? trx_sys_rseg_find_free(sys_header) : ULINT_UNDEFINED;
+  dberr_t err;
+  buf_block_t *rblock= rseg_id == ULINT_UNDEFINED
+    ? nullptr : trx_rseg_header_create(space, rseg_id, 0, &mtr, &err);
+
+  if (rblock)
+  {
+    byte *const slot= TRX_SYS + TRX_SYS_RSEGS +
+      rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame;
+
+    rseg= &trx_sys.rseg_array[rseg_id];
+    rseg->init(space, rblock->page.id().page_no());
+    ut_ad(rseg->is_persistent());
+    mtr.write<4,mtr_t::MAYBE_NOP>(*sys_header, TRX_SYS_RSEG_SPACE + slot,
+                                  space_id);
+    mtr.write<4,mtr_t::MAYBE_NOP>(*sys_header, TRX_SYS_RSEG_PAGE_NO + slot,
+                                  rseg->page_no);
+  }
+
+  mtr.commit();
+  return rseg;
+}
+
+/** Create the rollback segments.
+Counts the rollback segment slots that are already initialized in the
+TRX_SYS page and creates new rollback segments until TRX_SYS_N_RSEGS of
+them exist. When high_level_read_only is set, nothing is created and
+srv_available_undo_logs is set to 0.
+@return whether the creation succeeded */
+bool trx_sys_create_rsegs()
+{
+ /* srv_available_undo_logs reflects the number of persistent
+ rollback segments that have been initialized in the
+ transaction system header page. */
+ ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
+
+ if (high_level_read_only) {
+ srv_available_undo_logs = 0;
+ return(true);
+ }
+
+ /* This is executed in single-threaded mode therefore it is not
+ necessary to use the same mtr in trx_rseg_create(). n_used cannot
+ change while the function is executing. */
+ trx_sysf_get_n_rseg_slots();
+
+ ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The first persistent rollback segment is always initialized
+ in the system tablespace. */
+ ut_a(srv_available_undo_logs > 0);
+
+ for (uint32_t i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+ i++, srv_available_undo_logs++) {
+ /* Tablespace 0 is the system tablespace.
+ Dedicated undo log tablespaces start from 1. */
+ uint32_t space = srv_undo_tablespaces > 0
+ ? (i % srv_undo_tablespaces)
+ + srv_undo_space_id_start
+ : TRX_SYS_SPACE; /* cycle through the undo tablespaces, if any */
+
+ if (!trx_rseg_create(space)) {
+ ib::error() << "Unable to allocate the"
+ " requested innodb_undo_logs";
+ return(false);
+ }
+
+ /* Increase the number of active undo
+ tablespace in case new rollback segment
+ assigned to new undo tablespace. */
+ if (space > (srv_undo_space_id_start
+ + srv_undo_tablespaces_active - 1)) {
+ srv_undo_tablespaces_active++;
+ }
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ ib::info info; /* one message, assembled below */
+ info << srv_available_undo_logs;
+ if (srv_undo_tablespaces_active) {
+ info << " rollback segments in " << srv_undo_tablespaces_active
+ << " undo tablespaces are active.";
+ } else {
+ info << " rollback segments are active.";
+ }
+
+ return(true);
+}
+
+/** Close the transaction system on shutdown.
+Does nothing if the object is not initialised. An error is logged if any
+read views are still open. */
+void
+trx_sys_t::close()
+{
+  ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+  if (!is_initialised()) {
+    return;
+  }
+
+  const size_t n_views = view_count();
+  if (n_views != 0) {
+    ib::error() << "All read views were not closed before"
+      " shutdown: " << n_views << " read views open";
+  }
+
+  rw_trx_hash.destroy();
+
+  /* There can't be any active transactions. */
+
+  for (auto& rseg : temp_rsegs) {
+    rseg.destroy();
+  }
+  for (auto& rseg : rseg_array) {
+    rseg.destroy();
+  }
+
+  ut_a(trx_list.empty());
+  trx_list.close();
+  m_initialised = false;
+}
+
+/** Count the transactions in trx_sys.trx_list by state.
+Transactions in TRX_STATE_ACTIVE with a nonzero id and transactions in
+TRX_STATE_COMMITTED_IN_MEMORY count as active; the two PREPARED states
+are counted separately.
+@param[out] prepared  number of prepared transactions; may be nullptr
+@return total number of active (non-prepared) transactions */
+size_t trx_sys_t::any_active_transactions(size_t *prepared)
+{
+  size_t n_active= 0;
+  size_t n_prepared= 0;
+
+  trx_sys.trx_list.for_each([&](const trx_t &trx) {
+    switch (trx.state) {
+    case TRX_STATE_PREPARED:
+    case TRX_STATE_PREPARED_RECOVERED:
+      n_prepared++;
+      break;
+    case TRX_STATE_COMMITTED_IN_MEMORY:
+      n_active++;
+      break;
+    case TRX_STATE_ACTIVE:
+      /* An active transaction without an id is not counted. */
+      if (!trx.id)
+        break;
+      n_active++;
+      break;
+    case TRX_STATE_NOT_STARTED:
+      break;
+    }
+  });
+
+  if (prepared)
+    *prepared= n_prepared;
+
+  return n_active;
+}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000..e5e2ef9e
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2292 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
+
+#include <mysql/service_thd_error_context.h>
+
+#include "btr0sea.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "trx0xa.h"
+#include "ut0pool.h"
+#include "ut0vec.h"
+#include "log.h"
+
+#include <set>
+#include <new>
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+const byte trx_id_max_bytes[8] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/** The bit pattern corresponding to max timestamp.
+NOTE(review): presumably 4 bytes of seconds (0x7fffffff) followed by
+3 bytes of microseconds (0x0f423f = 999999); confirm the layout. */
+const byte timestamp_max_bytes[7] = {
+ 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
+};
+
+
+static const ulint MAX_DETAILED_ERROR_LEN = 256; /* size in bytes of the trx_t::detailed_error buffer, including the terminating NUL */
+
+/*************************************************************//**
+Set detailed error message for the transaction. The message is truncated
+to MAX_DETAILED_ERROR_LEN - 1 characters and is always NUL terminated. */
+void
+trx_set_detailed_error(
+/*===================*/
+  trx_t*      trx,  /*!< in: transaction struct */
+  const char* msg)  /*!< in: detailed error message */
+{
+  char* const dst = trx->detailed_error;
+  const ulint last = MAX_DETAILED_ERROR_LEN - 1;
+
+  strncpy(dst, msg, last);
+  /* strncpy() does not terminate the string when msg is too long. */
+  dst[last] = '\0';
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. At most MAX_DETAILED_ERROR_LEN
+is passed to os_file_read_string() as the size of the destination. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
+}
+
+/********************************************************************//**
+Initialize transaction object.
+Resets the fields assigned below to their defaults; in particular the
+state becomes TRX_STATE_NOT_STARTED, the isolation level
+TRX_ISO_REPEATABLE_READ, and both rollback segment pointers NULL.
+The read view must be closed and (with WITH_WSREP) trx->wsrep unset;
+this is only checked by debug assertions.
+@param trx trx to initialize */
+static
+void
+trx_init(
+/*=====*/
+ trx_t* trx)
+{
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->is_recovered = false;
+
+ trx->op_info = "";
+
+ trx->active_commit_ordered = false;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->check_foreigns = true;
+
+ trx->check_unique_secondary = true;
+
+ trx->lock.n_rec_locks = 0;
+
+ trx->dict_operation = false;
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->error_key_num = ULINT_UNDEFINED;
+
+ trx->undo_no = 0;
+
+ trx->rsegs.m_redo.rseg = NULL;
+
+ trx->rsegs.m_noredo.rseg = NULL;
+
+ trx->read_only = false;
+
+ trx->auto_commit = false;
+
+ trx->will_lock = false;
+
+ trx->bulk_insert = false;
+
+ trx->apply_online_log = false;
+
+ ut_d(trx->start_file = 0);
+
+ ut_d(trx->start_line = 0);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.rec_cached = 0;
+
+ trx->lock.table_cached = 0;
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep);
+#endif /* WITH_WSREP */
+}
+
+/** For managing the life-cycle of the trx_t instance that we get
+from the pool. init() constructs the non-trivial members and allocates
+the per-object resources (detailed_error buffer, lock heap, condition
+variable, mutex); destroy() releases them again. */
+struct TrxFactory {
+
+ /** Initializes a transaction object. It must be explicitly started
+ with trx_start_if_not_started() before using it. The default isolation
+ level is TRX_ISO_REPEATABLE_READ.
+ @param trx Transaction instance to initialise */
+ static void init(trx_t* trx)
+ {
+ /* Explicitly call the constructor of the already
+ allocated object. trx_t objects are allocated by
+ ut_zalloc_nokey() in Pool::Pool() which would not call
+ the constructors of the trx_t members. */
+ new(&trx->mod_tables) trx_mod_tables_t();
+
+ new(&trx->lock.table_locks) lock_list();
+
+ new(&trx->read_view) ReadView();
+
+ trx->rw_trx_hash_pins = 0;
+ trx_init(trx);
+
+ trx->dict_operation_lock_mode = false;
+
+ trx->detailed_error = reinterpret_cast<char*>(
+ ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN)); /* freed in destroy() */
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 1024, MEM_HEAP_FOR_LOCK_HEAP); /* freed in destroy() */
+ pthread_cond_init(&trx->lock.cond, nullptr);
+
+ UT_LIST_INIT(trx->lock.trx_locks, &lock_t::trx_locks);
+ UT_LIST_INIT(trx->lock.evicted_tables,
+ &dict_table_t::table_LRU);
+
+ UT_LIST_INIT(
+ trx->trx_savepoints,
+ &trx_named_savept_t::trx_savepoints);
+
+ trx->mutex_init();
+ }
+
+ /** Release resources held by the transaction object.
+ The object must be idle: no MySQL thread, no lock wait, no dictionary
+ operation lock and no transaction locks (checked by assertions).
+ @param trx the transaction for which to release resources */
+ static void destroy(trx_t* trx)
+ {
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer */
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->mysql_thd);
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(!trx->dict_operation_lock_mode);
+
+ if (trx->lock.lock_heap != NULL) {
+ mem_heap_free(trx->lock.lock_heap);
+ trx->lock.lock_heap = NULL;
+ }
+
+ pthread_cond_destroy(&trx->lock.cond);
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ ut_free(trx->detailed_error);
+
+ trx->mutex_destroy();
+
+ trx->mod_tables.~trx_mod_tables_t(); /* explicit destructor calls pair
+ with the placement new in init() */
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.table_locks.~lock_list();
+
+ trx->read_view.~ReadView();
+ }
+};
+
+/** The lock strategy for TrxPool: a wrapper around a mysql_mutex_t that
+is initialized with trx_pool_mutex_key. create() must be called before
+enter()/exit(), and destroy() when the mutex is no longer needed. */
+class TrxPoolLock
+{
+ mysql_mutex_t mutex;
+
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_mutex_key, &mutex, nullptr);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
+
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
+
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
+};
+
+/** The lock strategy for the TrxPoolManager: a wrapper around a
+mysql_mutex_t that is initialized with trx_pool_manager_mutex_key.
+create() must be called before enter()/exit(), and destroy() when the
+mutex is no longer needed. */
+class TrxPoolManagerLock
+{
+ mysql_mutex_t mutex;
+
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_manager_mutex_key, &mutex, nullptr);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
+
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
+
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
+};
+
+/** Use explicit mutexes for the trx_t pool and its manager. */
+typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
+typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
+
+/** The trx_t pool manager; created by trx_pool_init() and
+destroyed by trx_pool_close() */
+static trx_pools_t* trx_pools;
+
+/** Size of one trx_t pool in bytes (4 MiB). */
+static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
+
+/** Create the trx_t pool manager, using pools of MAX_TRX_BLOCK_SIZE
+bytes. The allocation is asserted to succeed. */
+void
+trx_pool_init()
+{
+  trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
+
+  ut_a(trx_pools != nullptr);
+}
+
+/** Destroy the trx_t pool manager and reset the global pointer. */
+void
+trx_pool_close()
+{
+  UT_DELETE(trx_pools);
+
+  trx_pools = nullptr;
+}
+
+/** Take a transaction object from trx_pools, create its autoinc_locks
+vector on a dedicated heap, and register it with trx_sys.
+@return an allocated transaction */
+trx_t *trx_create()
+{
+ trx_t* trx = trx_pools->get();
+
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer.
+ It may have been poisoned in trx_t::free().*/
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ trx->assert_freed();
+
+ mem_heap_t* heap;
+ ib_alloc_t* alloc;
+
+ /* We just got trx from pool, it should be non locking */
+ ut_ad(!trx->will_lock);
+ ut_ad(!trx->rw_trx_hash_pins);
+
+ DBUG_LOG("trx", "Create: " << trx);
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+
+ alloc = ib_heap_allocator_create(heap);
+
+ trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4); /* released by ib_vector_free() in trx_t::free() */
+
+ ut_ad(trx->mod_tables.empty());
+ ut_ad(trx->lock.n_rec_locks == 0);
+ ut_ad(trx->lock.table_cached == 0);
+ ut_ad(trx->lock.rec_cached == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ trx_sys.register_trx(trx);
+
+ return(trx);
+}
+
+/** Free the memory to trx_pools.
+Deregisters the transaction from trx_sys, releases its rw_trx_hash pins
+and the autoinc_locks vector, marks the listed members as inaccessible
+for memory checkers (MEM_NOACCESS) and returns the object to trx_pools.
+The transaction must be idle; this is checked by debug assertions. */
+void trx_t::free()
+{
+#ifdef HAVE_MEM_CHECK
+ if (xid.is_null())
+ MEM_MAKE_DEFINED(&xid, sizeof xid);
+ else
+ MEM_MAKE_DEFINED(&xid.data[xid.gtrid_length + xid.bqual_length],
+ sizeof xid.data - (xid.gtrid_length + xid.bqual_length));
+#endif
+ MEM_CHECK_DEFINED(this, sizeof *this);
+
+ ut_ad(!n_mysql_tables_in_use);
+ ut_ad(!mysql_log_file_name);
+ ut_ad(!mysql_n_tables_locked);
+ ut_ad(!will_lock);
+ ut_ad(error_state == DB_SUCCESS);
+ ut_ad(magic_n == TRX_MAGIC_N);
+ ut_ad(!read_only);
+ ut_ad(!lock.wait_lock);
+
+ dict_operation= false;
+ trx_sys.deregister_trx(this);
+ check_unique_secondary= true;
+ check_foreigns= true;
+ assert_freed();
+ trx_sys.rw_trx_hash.put_pins(this);
+ mysql_thd= nullptr;
+
+ // FIXME: We need to avoid this heap free/alloc for each commit.
+ if (autoinc_locks)
+ {
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(autoinc_locks);
+ autoinc_locks= NULL;
+ }
+
+ MEM_NOACCESS(&skip_lock_inheritance_and_n_ref,
+ sizeof skip_lock_inheritance_and_n_ref);
+ /* do not poison mutex */
+ MEM_NOACCESS(&id, sizeof id);
+ /* Use the size of the member being poisoned, not that of id. */
+ MEM_NOACCESS(&max_inactive_id, sizeof max_inactive_id);
+ MEM_NOACCESS(&state, sizeof state);
+ MEM_NOACCESS(&is_recovered, sizeof is_recovered);
+#ifdef WITH_WSREP
+ MEM_NOACCESS(&wsrep, sizeof wsrep);
+#endif
+ read_view.mem_noaccess();
+ MEM_NOACCESS(&lock, sizeof lock);
+ MEM_NOACCESS(&op_info, sizeof op_info);
+ MEM_NOACCESS(&isolation_level, sizeof isolation_level);
+ MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&is_registered, sizeof is_registered);
+ MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
+ MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
+ MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
+ MEM_NOACCESS(&duplicates, sizeof duplicates);
+ MEM_NOACCESS(&dict_operation, sizeof dict_operation);
+ MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
+ MEM_NOACCESS(&start_time, sizeof start_time);
+ MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
+ MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
+ MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
+ MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
+ MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
+ MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
+ MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
+ MEM_NOACCESS(&error_state, sizeof error_state);
+ MEM_NOACCESS(&error_info, sizeof error_info);
+ MEM_NOACCESS(&error_key_num, sizeof error_key_num);
+ MEM_NOACCESS(&graph, sizeof graph);
+ MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
+ MEM_NOACCESS(&undo_no, sizeof undo_no);
+ MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
+ MEM_NOACCESS(&rsegs, sizeof rsegs);
+ MEM_NOACCESS(&roll_limit, sizeof roll_limit);
+ MEM_NOACCESS(&in_rollback, sizeof in_rollback);
+ MEM_NOACCESS(&pages_undone, sizeof pages_undone);
+ MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
+ MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
+ MEM_NOACCESS(&read_only, sizeof read_only);
+ MEM_NOACCESS(&auto_commit, sizeof auto_commit);
+ MEM_NOACCESS(&will_lock, sizeof will_lock);
+ MEM_NOACCESS(&fts_trx, sizeof fts_trx);
+ MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
+ MEM_NOACCESS(&flush_tables, sizeof flush_tables);
+#ifdef UNIV_DEBUG
+ MEM_NOACCESS(&start_line, sizeof start_line);
+ MEM_NOACCESS(&start_file, sizeof start_file);
+#endif /* UNIV_DEBUG */
+ MEM_NOACCESS(&xid, sizeof xid);
+ MEM_NOACCESS(&mod_tables, sizeof mod_tables);
+ MEM_NOACCESS(&detailed_error, sizeof detailed_error);
+ MEM_NOACCESS(&magic_n, sizeof magic_n);
+ MEM_NOACCESS(&apply_online_log, sizeof apply_online_log);
+ trx_pools->mem_free(this);
+}
+
+/** Transition to committed state, to release implicit locks.
+The state must be TRX_STATE_ACTIVE, TRX_STATE_PREPARED or
+TRX_STATE_PREPARED_RECOVERED (debug assertion); it is changed to
+TRX_STATE_COMMITTED_IN_MEMORY while a TMTrxGuard is in scope. */
+TRANSACTIONAL_INLINE inline void trx_t::commit_state()
+{
+ ut_ad(state == TRX_STATE_PREPARED
+ || state == TRX_STATE_PREPARED_RECOVERED
+ || state == TRX_STATE_ACTIVE);
+ /* This makes the transaction committed in memory and makes its
+ changes to data visible to other transactions. NOTE that there is a
+ small discrepancy from the strict formal visibility rules here: a
+ user of the database can see modifications made by another
+ transaction T even before the necessary redo log segment has been
+ flushed to the disk. If the database happens to crash before the
+ flush, the user has seen modifications from T which will never be a
+ committed transaction. However, any transaction T2 which sees the
+ modifications of the committing transaction T, and which also itself
+ makes modifications to the database, will get an lsn larger than the
+ committing transaction T. In the case where the log flush fails, and
+ T never gets committed, also T2 will never get committed. */
+ TMTrxGuard tg{*this};
+ state= TRX_STATE_COMMITTED_IN_MEMORY;
+ ut_ad(id || !is_referenced());
+}
+
+/** Release any explicit locks of a committing transaction.
+The transaction must be in TRX_STATE_COMMITTED_IN_MEMORY and must not be
+referenced (debug assertions). Besides releasing the locks, this clears
+lock.table_locks, resets id to 0 and frees the tables that are in
+lock.evicted_tables. */
+inline void trx_t::release_locks()
+{
+ DEBUG_SYNC_C("trx_t_release_locks_enter");
+ DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!is_referenced());
+
+ if (UT_LIST_GET_LEN(lock.trx_locks))
+ {
+ lock_release(this);
+ ut_ad(!lock.n_rec_locks);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap); /* the heap is emptied, not freed */
+ }
+
+ lock.table_locks.clear();
+ reset_skip_lock_inheritance();
+ id= 0;
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
+}
+
+/** At shutdown, frees a transaction object.
+The transaction is moved to the committed-in-memory state, its locks and
+undo log objects are released, and it is returned to the pool via
+trx_t::free().
+@param[in,out] trx  recovered transaction (trx->is_recovered is asserted
+in debug builds) that is PREPARED, or ACTIVE under one of the conditions
+that are asserted below */
+TRANSACTIONAL_TARGET void trx_free_at_shutdown(trx_t *trx)
+{
+ ut_ad(trx->is_recovered);
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || (trx_state_eq(trx, TRX_STATE_ACTIVE)
+ && (!srv_was_started
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || (!srv_is_being_started
+ && !srv_undo_sources && srv_fast_shutdown))));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ ut_d(trx->apply_online_log = false);
+ trx->commit_state();
+ trx->release_locks();
+ trx->mod_tables.clear();
+ trx_undo_free_at_shutdown(trx);
+
+ ut_a(!trx->read_only);
+
+ DBUG_LOG("trx", "Free prepared: " << trx);
+ trx->state = TRX_STATE_NOT_STARTED;
+ ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
+ trx->free();
+}
+
+
+/**
+ Disconnect a prepared transaction from MySQL.
+
+ Closes the read view, then marks the transaction as recovered and
+ clears trx->mysql_thd while trx_sys.trx_list is frozen. Afterwards
+ trx->will_lock is reset and trx_sys.rw_trx_hash.put_pins() is called
+ for the transaction.
+ @param[in,out] trx transaction; the debug assertions require the
+ TRX_STATE_PREPARED state, a non-null trx->mysql_thd and
+ a null trx->mysql_log_file_name
+*/
+void trx_disconnect_prepared(trx_t *trx)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->mysql_log_file_name);
+ trx->read_view.close();
+ trx_sys.trx_list.freeze();
+ trx->is_recovered= true;
+ trx->mysql_thd= NULL;
+ trx_sys.trx_list.unfreeze();
+ /* TODO/FIXME: it is suggested to reset will_lock at InnoDB prepare
+ instead of here. */
+ trx->will_lock= false;
+ trx_sys.rw_trx_hash.put_pins(trx);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Resurrect the table locks for a resurrected transaction.
+
+If the undo log is not empty, its records are visited backwards,
+starting at undo.top_offset on page undo.top_page_no and following
+trx_undo_get_prev_rec(), and the table identifier of each record is
+collected. Because std::map::emplace() keeps an existing entry, the
+flag stored per table reflects the first record visited for that
+table: it is set if that record is of type TRX_UNDO_EMPTY.
+
+After the mini-transaction has been committed, each collected table is
+opened with dict_table_open_on_id(). A table that is not readable is
+removed from dict_sys and skipped. For every other table,
+lock_table_resurrect() is called with LOCK_X if the flag is set and
+with LOCK_IX otherwise; for a PREPARED transaction the table is also
+added to trx->mod_tables.
+@param trx resurrected transaction (ACTIVE or PREPARED)
+@param undo persistent undo log of the transaction
+@return DB_SUCCESS, or the error reported via buf_page_get_gen() */
+static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo.rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo.empty())
+ return DB_SUCCESS;
+
+ mtr_t mtr;
+ std::map<table_id_t, bool> tables;
+ mtr.start();
+
+ dberr_t err;
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo.top_page_no), 0, RW_S_LATCH, nullptr,
+ BUF_GET, &mtr, &err))
+ {
+ buf_block_t *undo_block= block;
+ const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset;
+
+ do
+ {
+ byte type;
+ byte cmpl_info;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ bool updated_extern;
+
+ if (undo_block != block)
+ {
+ mtr.release(*undo_block);
+ undo_block= block;
+ }
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.emplace(table_id, type == TRX_UNDO_EMPTY);
+ undo_rec= trx_undo_get_prev_rec(block, page_offset(undo_rec),
+ undo.hdr_page_no, undo.hdr_offset,
+ true, &mtr);
+ }
+ while (undo_rec);
+ }
+
+ mtr.commit();
+
+ if (err != DB_SUCCESS)
+ return err;
+
+ for (auto p : tables)
+ {
+ if (dict_table_t *table=
+ dict_table_open_on_id(p.first, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE))
+ {
+ if (!table->is_readable())
+ {
+ dict_sys.lock(SRW_LOCK_CALL);
+ table->release();
+ dict_sys.remove(table);
+ dict_sys.unlock();
+ continue;
+ }
+
+ if (trx->state == TRX_STATE_PREPARED)
+ trx->mod_tables.emplace(table, 0);
+
+ lock_table_resurrect(table, trx, p.second ? LOCK_X : LOCK_IX);
+
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id) << " lock on " << table->name);
+ table->release();
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/**
+ Resurrect a transaction that was doing inserts/updates at the time of
+ the crash, so that it can be undone, or that was in the XA prepared
+ state.
+
+ An undo log in the TRX_UNDO_ACTIVE state yields a transaction in
+ TRX_STATE_ACTIVE, and one in the TRX_UNDO_PREPARED state yields a
+ transaction in TRX_STATE_PREPARED. For any other undo log state
+ nothing is created and DB_SUCCESS is returned.
+ @param undo undo log of the transaction
+ @param rseg rollback segment that contains the undo log
+ @param start_time value for trx_t::start_time
+ @param start_time_micro value for trx_t::start_time_micro
+ @param rows_to_undo counter that is incremented by trx->undo_no
+ for a transaction in the ACTIVE state
+ @return DB_SUCCESS or the error from trx_resurrect_table_locks()
+*/
+static dberr_t trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
+{
+ trx_state_t state;
+ ut_ad(rseg->needs_purge >= undo->trx_id);
+ /*
+ This is single-threaded startup code; we do not need the
+ protection of trx->mutex here.
+ */
+ switch (undo->state)
+ {
+ case TRX_UNDO_ACTIVE:
+ state= TRX_STATE_ACTIVE;
+ break;
+ case TRX_UNDO_PREPARED:
+ /*
+ Prepared transactions are left in the prepared state,
+ waiting for a commit or abort decision from MySQL.
+ */
+ state= TRX_STATE_PREPARED;
+ sql_print_information("InnoDB: Transaction " TRX_ID_FMT
+ " was in the XA prepared state.", undo->trx_id);
+ break;
+ default:
+ return DB_SUCCESS;
+ }
+
+ rseg->acquire();
+ trx_t *trx= trx_create();
+ trx->state= state;
+ ut_d(trx->start_file= __FILE__);
+ ut_d(trx->start_line= __LINE__);
+
+ trx->rsegs.m_redo.undo= undo;
+ trx->undo_no= undo->top_undo_no + 1;
+ trx->rsegs.m_redo.rseg= rseg;
+ trx->xid= undo->xid;
+ trx->id= undo->trx_id;
+ trx->is_recovered= true;
+ trx->start_time= start_time;
+ trx->start_time_micro= start_time_micro;
+ trx->dict_operation= undo->dict_operation;
+
+ trx_sys.rw_trx_hash.insert(trx);
+ trx_sys.rw_trx_hash.put_pins(trx);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE))
+ *rows_to_undo+= trx->undo_no;
+ return trx_resurrect_table_locks(trx, *undo);
+}
+
+
+/** Initialize (resurrect) transactions at startup.
+For SRV_OPERATION_RESTORE only trx_rseg_array_init() is run. With
+srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN nothing is done.
+Otherwise purge_sys is created, the rollback segments are initialized
+and, unless trx_sys.is_undo_empty() holds, every undo log found in
+rseg.undo_list of each available rollback segment is attached to a
+new or an already resurrected transaction. On success the function
+finishes with purge_sys.clone_oldest_view<true>().
+@return DB_SUCCESS, or the error code of trx_rseg_array_init(),
+trx_resurrect() or trx_resurrect_table_locks() */
+dberr_t trx_lists_init_at_db_start()
+{
+ ut_a(srv_is_being_started);
+ ut_ad(!srv_was_started);
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return trx_rseg_array_init();
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ return DB_SUCCESS;
+ }
+
+ purge_sys.create();
+ dberr_t err = trx_rseg_array_init();
+
+ if (err != DB_SUCCESS) {
+corrupted:
+ ib::info() << "Retry with innodb_force_recovery=5";
+ return err;
+ }
+
+ if (trx_sys.is_undo_empty()) {
+func_exit:
+ purge_sys.clone_oldest_view<true>();
+ return DB_SUCCESS;
+ }
+
+ /* Scan the rollback segments for undo logs that belong to
+ transactions. */
+ const time_t start_time = time(NULL);
+ const ulonglong start_time_micro= microsecond_interval_timer();
+ uint64_t rows_to_undo = 0;
+
+ for (auto& rseg : trx_sys.rseg_array) {
+ trx_undo_t* undo;
+
+ /* Some rollback segment may be unavailable,
+ especially if the server was previously run with a
+ non-default value of innodb_undo_logs. */
+ if (!rseg.space) {
+ continue;
+ }
+ /* Resurrect the transactions that own the undo logs in
+ rseg.undo_list. */
+ for (undo = UT_LIST_GET_FIRST(rseg.undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t *trx = trx_sys.find(0, undo->trx_id, false);
+ if (!trx) {
+ err = trx_resurrect(undo, &rseg, start_time,
+ start_time_micro,
+ &rows_to_undo);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->start_time == start_time);
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->rsegs.m_redo.rseg == &rseg);
+ ut_ad(rseg.is_referenced());
+ ut_ad(rseg.needs_purge);
+
+ trx->rsegs.m_redo.undo = undo;
+ if (undo->top_undo_no >= trx->undo_no) {
+ if (trx_state_eq(trx,
+ TRX_STATE_ACTIVE)) {
+ rows_to_undo -= trx->undo_no;
+ rows_to_undo +=
+ undo->top_undo_no + 1;
+ }
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+ err = trx_resurrect_table_locks(trx, *undo);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto corrupted;
+ }
+ }
+ }
+
+ if (const auto size = trx_sys.rw_trx_hash.size()) {
+ ib::info() << size
+ << " transaction(s) which must be rolled back or"
+ " cleaned up in total " << rows_to_undo
+ << " row operations to undo";
+ ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
+ }
+
+ goto func_exit;
+}
+
+/** Assign a persistent rollback segment in a round-robin fashion,
+evenly distributed between 0 and innodb_undo_logs-1.
+The transaction is first registered with trx_sys.register_rw(), which
+is expected to give it an identifier (debug assertion). The scan then
+repeats until rseg->acquire_if_available() succeeds on a chosen
+rollback segment, which is stored in trx->rsegs.m_redo.rseg.
+@param trx transaction that has no persistent rollback segment yet */
+static void trx_assign_rseg_low(trx_t *trx)
+{
+ ut_ad(!trx->rsegs.m_redo.rseg);
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ /* The first slot is always assigned to the system tablespace. */
+ ut_ad(trx_sys.rseg_array[0].space == fil_system.sys_space);
+
+ trx_sys.register_rw(trx);
+ ut_ad(trx->id);
+
+ /* Choose a rollback segment evenly distributed between 0 and
+ innodb_undo_logs-1 in a round-robin fashion, skipping those
+ undo tablespaces that are scheduled for truncation. */
+ static Atomic_counter<unsigned> rseg_slot;
+ unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+ ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ ut_d(const auto start_scan_slot = slot);
+ ut_d(bool look_for_rollover = false);
+ trx_rseg_t* rseg;
+
+ bool allocated;
+
+ do {
+ for (;;) {
+ rseg = &trx_sys.rseg_array[slot];
+ ut_ad(!look_for_rollover || start_scan_slot != slot);
+ ut_d(look_for_rollover = true);
+ ut_d(if (!trx_rseg_n_slots_debug))
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
+
+ if (!rseg->space) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+
+ if (rseg->space != fil_system.sys_space) {
+ if (rseg->skip_allocation()) {
+ continue;
+ }
+ } else if (const fil_space_t *space =
+ trx_sys.rseg_array[slot].space) {
+ if (space != fil_system.sys_space
+ && srv_undo_tablespaces > 0) {
+ /** If dedicated
+ innodb_undo_tablespaces have
+ been configured, try to use them
+ instead of the system tablespace.
+ Here, slot already refers to the
+ entry that follows rseg. */
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we ensure that it will
+ never be selected for UNDO truncate purge. */
+ allocated = rseg->acquire_if_available();
+ } while (!allocated);
+
+ trx->rsegs.m_redo.rseg = rseg;
+}
+
+/** Assign a rollback segment for modifying temporary tables.
+The segment is picked from trx_sys.temp_rsegs in a round-robin
+fashion. A transaction that has no identifier yet is additionally
+registered with trx_sys.register_rw().
+@return the assigned rollback segment */
+trx_rseg_t *trx_t::assign_temp_rseg()
+{
+  ut_ad(!rsegs.m_noredo.rseg);
+  ut_ad(!is_autocommit_non_locking());
+  compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
+
+  /* Round-robin counter; TRX_SYS_N_RSEGS is a power of 2, so masking
+  yields a slot between 0 and TRX_SYS_N_RSEGS - 1. */
+  static Atomic_counter<unsigned> next_slot;
+  const auto slot= next_slot++ & (TRX_SYS_N_RSEGS - 1);
+
+  trx_rseg_t *const chosen= &trx_sys.temp_rsegs[slot];
+  ut_ad(!chosen->is_persistent());
+  rsegs.m_noredo.rseg= chosen;
+
+  if (!id)
+    trx_sys.register_rw(this);
+
+  return chosen;
+}
+
+/****************************************************************//**
+Starts a transaction.
+Determines trx->auto_commit, trx->read_only and trx->will_lock,
+switches the state from TRX_STATE_NOT_STARTED to TRX_STATE_ACTIVE and
+records the start time. A transaction that is not read-only and that
+is internal (no mysql_thd), read-write or a dictionary operation gets
+a persistent rollback segment unless high_level_read_only is set.
+Otherwise, the transaction is only registered as read-write if
+read_write is set and it is not an autocommit non-locking
+transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read-write transaction */
+{
+ ut_ad(!trx->in_rollback);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->start_line != 0);
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->roll_limit == 0);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ ut_ad(trx->rsegs.m_redo.rseg == NULL);
+ ut_ad(trx->rsegs.m_noredo.rseg == NULL);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only = srv_read_only_mode
+ || (!trx->dict_operation
+ && thd_trx_is_read_only(trx->mysql_thd));
+
+ if (!trx->auto_commit) {
+ trx->will_lock = true;
+ } else if (!trx->will_lock) {
+ trx->read_only = true;
+ }
+
+#ifdef WITH_WSREP
+ trx->xid.null();
+#endif /* WITH_WSREP */
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(trx->lock.table_locks.empty());
+
+ /* No other thread can access this trx object through rw_trx_hash,
+ but it can still be found through trx_sys.trx_list. Sometimes it is
+ possible to indirectly protect trx_t::state by freezing
+ trx_sys.trx_list.
+
+ For now we update it without mutex protection, because the original
+ code did it this way. It has to be reviewed and fixed properly. */
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* By default all transactions are in the read-only list unless they
+ are non-locking auto-commit read only transactions or background
+ (internal) transactions. Note: Transactions marked explicitly as
+ read only can write to temporary tables, we put those on the RO
+ list too. */
+
+ if (!trx->read_only
+ && (!trx->mysql_thd || read_write || trx->dict_operation)) {
+ /* Only the persistent rollback segment is assigned here.
+ A temporary rseg is assigned only if the transaction
+ updates a temporary table */
+ if (!high_level_read_only) {
+ trx_assign_rseg_low(trx);
+ }
+ } else {
+ if (!trx->is_autocommit_non_locking()) {
+
+ /* If this is a read-only transaction that is writing
+ to a temporary table then it needs a transaction id
+ to write to the temporary table. */
+
+ if (read_write) {
+ ut_ad(!srv_read_only_mode);
+ trx_sys.register_rw(trx);
+ }
+ } else {
+ ut_ad(!read_write);
+ }
+ }
+
+ trx->start_time = time(NULL);
+ trx->start_time_micro = trx->mysql_thd
+ ? thd_start_utime(trx->mysql_thd)
+ : microsecond_interval_timer();
+
+ ut_a(trx->error_state == DB_SUCCESS);
+}
+
+/** Release an empty undo log that was associated with a transaction.
+If the undo header page can be fetched, the last undo log header on
+it is located by following TRX_UNDO_NEXT_LOG. When a previous header
+exists, our header is removed from the page; otherwise only the page
+state is changed to TRX_UNDO_CACHED. In any case the undo object is
+moved from rseg->undo_list to rseg->undo_cached, rsegs.m_redo.undo is
+reset to nullptr and trx_sys.assign_new_trx_no() is called.
+@param mtr mini-transaction used for the page modifications */
+ATTRIBUTE_COLD
+void trx_t::commit_empty(mtr_t *mtr)
+{
+ trx_rseg_t *rseg= rsegs.m_redo.rseg;
+ trx_undo_t *&undo= rsegs.m_redo.undo;
+
+ ut_ad(undo->state == TRX_UNDO_ACTIVE || undo->state == TRX_UNDO_PREPARED);
+ ut_ad(undo->size == 1);
+
+ if (buf_block_t *u=
+ buf_page_get(page_id_t(rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr))
+ {
+ ut_d(const uint16_t state=
+ mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + u->page.frame));
+ ut_ad(state == undo->state || state == TRX_UNDO_ACTIVE);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_ad(!memcmp(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + u->page.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + u->page.frame, 2));
+ ut_ad(mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + u->page.frame) == FIL_NULL);
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + u->page.frame) == 0);
+ ut_ad(!memcmp(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ u->page.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ u->page.frame, FIL_ADDR_SIZE));
+
+ /* Delete the last undo log header, which must be for this transaction.
+
+ An undo segment can be reused (TRX_UNDO_CACHED) only if it
+ consists of one page and that single page contains enough space
+ for the undo log header of a subsequent transaction. See
+ trx_purge_add_undo_to_history(), which is executed when committing
+ a nonempty transaction.
+
+ If we simply changed the undo page state to TRX_UNDO_CACHED,
+ then trx_undo_reuse_cached() could run out of space. We will
+ release the space consumed by our empty undo log to avoid that. */
+ for (byte *last= &u->page.frame[TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE],
+ *prev= nullptr;;)
+ {
+ /* TRX_UNDO_PREV_LOG is only being read in debug assertions, and
+ written in trx_undo_header_create(). To remain compatible with
+ possibly corrupted old data files, we will not read the field
+ TRX_UNDO_PREV_LOG but instead rely on TRX_UNDO_NEXT_LOG. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PREV_LOG + last) ==
+ (reinterpret_cast<size_t>(prev) & (srv_page_size - 1)));
+
+ if (uint16_t next= mach_read_from_2(TRX_UNDO_NEXT_LOG + last))
+ {
+ ut_ad(ulint{next} + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+ ut_ad(&u->page.frame[next] > last);
+ ut_ad(mach_read_from_2(TRX_UNDO_LOG_START + last) <= next);
+ prev= last;
+ last= &u->page.frame[next];
+ continue;
+ }
+
+ ut_ad(mach_read_from_8(TRX_UNDO_TRX_ID + last) == id);
+ ut_ad(!mach_read_from_8(TRX_UNDO_TRX_NO + last));
+ ut_ad(!memcmp(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + u->page.frame,
+ TRX_UNDO_LOG_START + last, 2));
+
+ if (prev)
+ {
+ mtr->memcpy(*u, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ u->page.frame, prev + TRX_UNDO_LOG_START, 2);
+ const ulint free= page_offset(last);
+ mtr->write<2>(*u, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ u->page.frame, free);
+ mtr->write<2>(*u, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + u->page.frame,
+ TRX_UNDO_CACHED);
+ mtr->write<2>(*u, TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + u->page.frame,
+ page_offset(prev));
+ mtr->write<2>(*u, prev + TRX_UNDO_NEXT_LOG, 0U);
+ mtr->memset(u, free, srv_page_size - FIL_PAGE_DATA_END - free, 0);
+
+ /* We may have updated PAGE_MAX_TRX_ID on secondary index pages
+ to this->id. Ensure that trx_sys.m_max_trx_id will be recovered
+ correctly, even though we removed our undo log record along
+ with the TRX_UNDO_TRX_ID above. */
+
+ /* Below, we are acquiring rseg_header->page.lock after
+ u->page.lock (the opposite of trx_purge_add_undo_to_history()).
+ This is fine, because both functions are holding exclusive
+ rseg->latch. */
+
+ if (mach_read_from_8(prev + TRX_UNDO_TRX_NO) >= id);
+ else if (buf_block_t *rseg_header= rseg->get(mtr, nullptr))
+ {
+ byte *m= TRX_RSEG + TRX_RSEG_MAX_TRX_ID + rseg_header->page.frame;
+
+ do
+ {
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_header->page.frame)))
+ /* This must have been upgraded from before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ else if (mach_read_from_8(m) >= id)
+ continue;
+ mtr->write<8>(*rseg_header, m, id);
+ }
+ while (0);
+ }
+ }
+ else
+ /* Our undo log header was right after the undo log segment header.
+ This page should have been created by trx_undo_create(), not
+ returned by trx_undo_reuse_cached().
+
+ We retain the dummy empty log in order to remain compatible with
+ trx_undo_mem_create_at_db_start(). This page will remain available
+ to trx_undo_reuse_cached(), and it will eventually be freed by
+ trx_purge_truncate_rseg_history(). */
+ mtr->write<2>(*u, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + u->page.frame,
+ TRX_UNDO_CACHED);
+ break;
+ }
+ }
+ else
+ ut_ad("undo log page was not found" == 0);
+
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ undo->state= TRX_UNDO_CACHED;
+ undo= nullptr;
+
+ /* We must assign an "end" identifier even though we are not going
+ to persistently write it anywhere, to make sure that the purge of
+ history will not be stuck. */
+ trx_sys.assign_new_trx_no(this);
+}
+
+/** Assign the transaction its history serialisation number and write the
+UNDO log to the assigned rollback segment.
+If the transaction has no persistent undo log, only the rollback
+segment is released. If it has one but undo_no is 0, commit_empty()
+is invoked instead of adding the undo log to the history. In every
+case the mini-transaction is committed before returning.
+@param mtr mini-transaction; committed by this function */
+inline void trx_t::write_serialisation_history(mtr_t *mtr)
+{
+ ut_ad(!read_only);
+ trx_rseg_t *rseg= rsegs.m_redo.rseg;
+ trx_undo_t *&undo= rsegs.m_redo.undo;
+ if (UNIV_LIKELY(undo != nullptr))
+ {
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+
+ /* We have to hold exclusive rseg->latch because undo log headers have
+ to be put to the history list in the (serialisation) order of the
+ UNDO trx number. This is required for purge_sys too. */
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(undo->rseg == rseg);
+ /* Assign the transaction serialisation number and add any
+ undo log to the purge queue. */
+ if (UNIV_UNLIKELY(!undo_no))
+ {
+ /* The transaction was rolled back. */
+ commit_empty(mtr);
+ goto done;
+ }
+ else if (rseg->last_page_no == FIL_NULL)
+ {
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+ trx_sys.assign_new_trx_no(this);
+ const trx_id_t end{rw_trx_hash_element->no};
+ /* end cannot be less than anything in rseg. User threads only
+ produce events when a rollback segment is empty. */
+ purge_sys.purge_queue.push(TrxUndoRsegs{end, *rseg});
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ rseg->last_page_no= undo->hdr_page_no;
+ rseg->set_last_commit(undo->hdr_offset, end);
+ }
+ else
+ trx_sys.assign_new_trx_no(this);
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ /* Change the undo log segment state from TRX_UNDO_ACTIVE, to
+ define the transaction as committed in the file based domain,
+ at mtr->commit_lsn() obtained in mtr->commit() below. */
+ trx_purge_add_undo_to_history(this, undo, mtr);
+ done:
+ rseg->release();
+ rseg->latch.wr_unlock();
+ }
+ else
+ rseg->release();
+ mtr->commit();
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table.
+The list ftt->added_doc_ids is appended to the add_wq work queue of
+the table's FTS object, together with the heap found in the list's
+self_heap allocator. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+	fts_trx_table_t*	ftt)	/* in: FTS trx table */
+{
+	fts_t* const		table_fts = ftt->table->fts;
+	fts_doc_ids_t* const	added = ftt->added_doc_ids;
+
+	ut_a(table_fts->add_wq);
+
+	ib_wqueue_add(table_fts->add_wq, added,
+		      static_cast<mem_heap_t*>(added->self_heap->arg));
+
+	/* The list is no longer owned by the fts_trx_table_t. */
+	ftt->added_doc_ids = NULL;
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables.
+On commit, every table of the last savepoint that has a list of added
+document identifiers is passed to trx_finalize_for_fts_table().
+In either case trx->fts_trx is freed and reset to NULL. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	is_commit)	/*!< in: true if the transaction was
+				committed, false if it was rolled back. */
+{
+	if (is_commit) {
+		fts_savepoint_t* const	last_savepoint
+			= static_cast<fts_savepoint_t*>(
+				ib_vector_last(trx->fts_trx->savepoints));
+		ib_rbt_t* const		tables = last_savepoint->tables;
+
+		for (const ib_rbt_node_t* node = rbt_first(tables);
+		     node != NULL;
+		     node = rbt_next(tables, node)) {
+			fts_trx_table_t* const	ftt
+				= *rbt_value(fts_trx_table_t*, node);
+
+			if (ftt->added_doc_ids) {
+				trx_finalize_for_fts_table(ftt);
+			}
+		}
+	}
+
+	fts_trx_free(trx->fts_trx);
+	trx->fts_trx = NULL;
+}
+
+extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD);
+extern "C" void thd_decrement_pending_ops(MYSQL_THD);
+
+
+#include "../log/log0sync.h"
+
+/*
+ If required, initiates a write and optionally a flush of the log to
+ disk. Nothing is done if log_sys already reports a flushed LSN that
+ is not less than lsn. The flush argument passed to log_write_up_to()
+ is set when srv_file_flush_method is not SRV_NOSYNC and the lowest
+ bit of srv_flush_log_at_trx_commit is set. If the log is not in
+ persistent memory and thd_increment_pending_ops() returns a non-null
+ value, log_write_up_to() is given a completion callback that invokes
+ thd_decrement_pending_ops(); otherwise it is called without a
+ callback while trx->op_info reads "flushing log".
+ @param lsn LSN up to which logs are to be flushed.
+ @param trx transaction; must not be in the PREPARED state
+ (debug assertion)
+*/
+static void trx_flush_log_if_needed(lsn_t lsn, trx_t *trx)
+{
+ ut_ad(srv_flush_log_at_trx_commit);
+ ut_ad(trx->state != TRX_STATE_PREPARED);
+
+ if (log_sys.get_flushed_lsn(std::memory_order_relaxed) >= lsn)
+ return;
+
+ const bool flush=
+ (srv_file_flush_method != SRV_NOSYNC &&
+ (srv_flush_log_at_trx_commit & 1));
+
+ completion_callback cb;
+ if (!log_sys.is_pmem() &&
+ (cb.m_param= thd_increment_pending_ops(trx->mysql_thd)))
+ {
+ cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops;
+ log_write_up_to(lsn, flush, &cb);
+ }
+ else
+ {
+ trx->op_info= "flushing log";
+ log_write_up_to(lsn, flush);
+ trx->op_info= "";
+ }
+}
+
+/** Process tables that were modified by the committing transaction.
+Unless undo_no is 0 or mod_tables is empty, every table in mod_tables
+gets its update_time set to the start time of the transaction and its
+query_cache_inv_trx_id set to the current trx_sys.get_max_trx_id(). */
+inline void trx_t::commit_tables()
+{
+  if (!undo_no || mod_tables.empty())
+    return;
+
+  const trx_id_t inv_trx_id= trx_sys.get_max_trx_id();
+  const auto stamp= start_time;
+
+  for (const auto &entry : mod_tables)
+  {
+    dict_table_t *modified= entry.first;
+    modified->update_time= stamp;
+    modified->query_cache_inv_trx_id= inv_trx_id;
+  }
+}
+
+/** Evict a table definition due to the rollback of ALTER TABLE.
+If the table is found in dict_sys, its def_trx_id is always reset.
+The table is removed from dict_sys only if it has no references and
+reset_only is not set.
+@param table_id    table identifier
+@param reset_only  whether to only reset dict_table_t::def_trx_id */
+void trx_t::evict_table(table_id_t table_id, bool reset_only)
+{
+	ut_ad(in_rollback);
+
+	dict_table_t* const table = dict_sys.find_table(table_id);
+
+	if (table == nullptr) {
+		return;
+	}
+
+	table->def_trx_id = 0;
+
+	const auto n_ref = table->get_ref_count();
+
+	if (n_ref) {
+		/* This must be a DDL operation that is being rolled
+		back in an active connection. */
+		ut_a(n_ref == 1);
+		ut_ad(!is_recovered);
+		ut_ad(mysql_thd);
+		return;
+	}
+
+	if (reset_only) {
+		return;
+	}
+
+	/* This table should only be locked by this transaction, if at all. */
+	const auto n_locks = UT_LIST_GET_LEN(table->locks);
+	ut_ad(n_locks <= 1);
+	const bool locked = n_locks;
+	ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
+
+	dict_sys.remove(table, true, locked);
+
+	if (locked) {
+		/* Queue the table; release_locks() frees the tables in
+		lock.evicted_tables. */
+		UT_LIST_ADD_FIRST(lock.evicted_tables, table);
+	}
+}
+
+/** Free temporary undo log after commit or rollback.
+While holding rseg->latch exclusively, the undo object is removed
+from rseg->undo_list and the undo log segment is freed step by step
+with fseg_free_step(), each step in its own mini-transaction that
+runs in MTR_LOG_NO_REDO mode. After the final step, the 4 bytes of
+slot undo->id in the rollback segment header page are overwritten
+with 0xff (FIL_NULL). Finally rseg->curr_size is reduced by
+undo->size and the undo object is freed.
+@param undo temporary undo log; reset to nullptr on return */
+ATTRIBUTE_NOINLINE static void trx_commit_cleanup(trx_undo_t *&undo)
+{
+ trx_rseg_t *const rseg= undo->rseg;
+ ut_ad(rseg->space == fil_system.temp_space);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ ut_ad(undo->state == TRX_UNDO_ACTIVE || undo->state == TRX_UNDO_PREPARED);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+ /* First delete the undo log segment in the file */
+ bool finished;
+ mtr_t mtr;
+ do
+ {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ finished= true;
+
+ if (buf_block_t *block=
+ buf_page_get(page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), 0,
+ RW_X_LATCH, &mtr))
+ {
+ fseg_header_t *file_seg= TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame;
+
+ finished= fseg_free_step(file_seg, &mtr);
+
+ if (!finished);
+ else if (buf_block_t *rseg_header= rseg->get(&mtr, nullptr))
+ {
+ static_assert(FIL_NULL == 0xffffffff, "compatibility");
+ memset(rseg_header->page.frame + TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ undo->id * TRX_RSEG_SLOT_SIZE, 0xff, 4);
+ }
+ }
+
+ mtr.commit();
+ }
+ while (!finished);
+
+ ut_ad(rseg->curr_size > undo->size);
+ rseg->curr_size-= undo->size;
+ rseg->latch.wr_unlock();
+ ut_free(undo);
+ undo= nullptr;
+}
+
+TRANSACTIONAL_INLINE inline void trx_t::commit_in_memory(const mtr_t *mtr)
+{
+ /* Complete the commit in memory; commit_low() calls this after it
+ has written any serialisation history. The parameter mtr is the
+ mini-transaction that was passed to commit_low(), or nullptr.
+
+ We already detached from rseg in write_serialisation_history() */
+ ut_ad(!rsegs.m_redo.undo);
+ read_view.close();
+
+ if (is_autocommit_non_locking())
+ {
+ ut_ad(id == 0);
+ ut_ad(read_only);
+ ut_ad(!will_lock);
+ ut_a(!is_recovered);
+ ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(!rsegs.m_redo.undo);
+ ut_ad(mysql_thd);
+ ut_ad(state == TRX_STATE_ACTIVE);
+
+ /* Note: We do not have to hold any lock_sys latch here, because
+ this is a non-locking transaction. */
+ ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the freezing of trx_sys.trx_list will protect the trx_t
+ instance and it cannot be removed from the trx_list and freed
+ without first unfreezing trx_list. */
+ state= TRX_STATE_NOT_STARTED;
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+
+ DBUG_LOG("trx", "Autocommit in memory: " << this);
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!UT_LIST_GET_LEN(lock.trx_locks))
+ for (auto l : lock.table_locks)
+ ut_ad(!l);
+#endif /* UNIV_DEBUG */
+ commit_state();
+
+ if (id)
+ {
+ trx_sys.deregister_rw(this);
+
+ /* Wait for any implicit-to-explicit lock conversions to cease,
+ so that there will be no race condition in lock_release(). */
+ while (UNIV_UNLIKELY(is_referenced()))
+ LF_BACKOFF();
+ }
+ else
+ ut_ad(read_only || !rsegs.m_redo.rseg);
+
+ if (read_only || !rsegs.m_redo.rseg)
+ {
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ }
+ else
+ {
+ commit_tables();
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ is_recovered= false;
+ }
+
+ if (UNIV_LIKELY(!dict_operation))
+ release_locks();
+ }
+
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == rsegs.m_noredo.rseg);
+ trx_commit_cleanup(undo);
+ }
+
+ if (mtr)
+ {
+ /* NOTE that we could possibly make a group commit more efficient
+ here: call std::this_thread::yield() here to allow also other trxs to come
+ to commit! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if the OS
+ does not crash. We may also flush the log files to disk, making
+ the transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group commit is that a group of transactions
+ gather behind a trx doing a physical disk write to log files, and
+ when that physical write has been completed, one of those
+ transactions does a write which commits the whole group. Note that
+ this group commit will only bring benefit if there are > 2 users
+ in the database. Then at least 2 users can gather behind one doing
+ the physical log write to disk.
+
+ If we are calling trx_t::commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the group commit
+ algorithm work. Otherwise, the prepare_commit mutex would
+ serialize all commits and prevent a group of transactions from
+ gathering. */
+
+ commit_lsn= undo_no || !xid.is_null() ? mtr->commit_lsn() : 0;
+ if (commit_lsn && !flush_log_later && srv_flush_log_at_trx_commit)
+ {
+ trx_flush_log_if_needed(commit_lsn, this);
+ commit_lsn= 0;
+ }
+ }
+
+ savepoints_discard();
+
+ if (fts_trx)
+ trx_finalize_for_fts(this, undo_no != 0);
+
+#ifdef WITH_WSREP
+ /* Serialization history has been written and the transaction is
+ committed in memory, which makes this commit ordered. Release commit
+ order critical section. */
+ if (wsrep)
+ {
+ wsrep= false;
+ wsrep_commit_ordered(mysql_thd);
+ }
+#endif /* WITH_WSREP */
+ lock.was_chosen_as_deadlock_victim= false;
+}
+
+/** Reset the transaction object after a commit.
+Any bulk_store objects of a bulk insert are deleted. Then, while
+holding the transaction mutex, the state is set to
+TRX_STATE_NOT_STARTED, mod_tables is cleared, the check_foreigns and
+check_unique_secondary flags are set, and trx_init() is invoked. */
+void trx_t::commit_cleanup()
+{
+  ut_ad(!dict_operation);
+  ut_ad(!was_dict_operation);
+
+  if (is_bulk_insert())
+  {
+    for (auto &entry : mod_tables)
+    {
+      delete entry.second.bulk_store;
+    }
+  }
+
+  mutex.wr_lock();
+
+  state= TRX_STATE_NOT_STARTED;
+  mod_tables.clear();
+  check_unique_secondary= true;
+  check_foreigns= true;
+
+  assert_freed();
+  trx_init(this);
+
+  mutex.wr_unlock();
+
+  ut_a(error_state == DB_SUCCESS);
+}
+
+/** Commit the transaction in a mini-transaction.
+If the transaction has FTS changes and undo_no is nonzero,
+fts_commit() is invoked first. With a mini-transaction,
+write_serialisation_history() is called (after apply_log() if
+apply_online_log is set); without one, a persistent rollback segment
+that was assigned to the transaction is released. Finally
+commit_in_memory() is invoked.
+@param mtr mini-transaction (if there are any persistent modifications) */
+TRANSACTIONAL_TARGET void trx_t::commit_low(mtr_t *mtr)
+{
+ ut_ad(!mtr || mtr->is_active());
+ ut_d(bool aborted= in_rollback && error_state == DB_DEADLOCK);
+ ut_ad(!mtr == (aborted || !has_logged_persistent()));
+ ut_ad(!mtr || !aborted);
+
+ if (fts_trx && undo_no)
+ {
+ ut_a(!is_autocommit_non_locking());
+ /* MDEV-24088 FIXME: Invoke fts_commit() earlier (before possible
+ XA PREPARE), so that we will be able to return an error and rollback
+ the transaction, instead of violating consistency!
+
+ The original claim about DB_DUPLICATE_KEY was:
+ This is a possible scenario if there is a crash between
+ insert to DELETED table committing and transaction committing. The
+ fix would be to be able to return an error from this function */
+ if (ut_d(dberr_t error=) fts_commit(this))
+ ut_ad(error == DB_DUPLICATE_KEY || error == DB_LOCK_WAIT_TIMEOUT);
+ }
+
+#ifdef ENABLED_DEBUG_SYNC
+ const bool debug_sync= mysql_thd && has_logged_persistent();
+#endif
+
+ if (mtr)
+ {
+ if (UNIV_UNLIKELY(apply_online_log))
+ apply_log();
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this log
+ sequence number. The transaction becomes 'durable' when we write
+ the log to disk, but in the logical sense the commit in the
+ file-based data structures (undo logs etc.) happens here.
+
+ NOTE that transaction numbers do not necessarily come in
+ exactly the same order as commit lsn's, if the transactions have
+ different rollback segments. However, if a transaction T2 is
+ able to see modifications made by a transaction T1, T2 will always
+ get a bigger transaction number and a bigger commit lsn than T1. */
+ write_serialisation_history(mtr);
+ }
+ else if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
+ {
+ ut_ad(id);
+ ut_ad(!rsegs.m_redo.undo);
+ rseg->release();
+ }
+
+#ifdef ENABLED_DEBUG_SYNC
+ if (debug_sync)
+ DEBUG_SYNC_C("before_trx_state_committed_in_memory");
+#endif
+
+ commit_in_memory(mtr);
+}
+
+
+/** Invoke commit_low(), passing it a freshly started mini-transaction
+only if has_logged_persistent() holds, and nullptr otherwise. */
+void trx_t::commit_persist()
+{
+  mtr_t local_mtr;
+
+  if (!has_logged_persistent())
+  {
+    commit_low(nullptr);
+    return;
+  }
+
+  local_mtr.start();
+  commit_low(&local_mtr);
+}
+
+
+/** Commit the transaction: reset dict_operation, run commit_persist()
+and then commit_cleanup(). In debug builds, was_dict_operation holds
+the previous value of dict_operation for the duration of
+commit_persist(). */
+void trx_t::commit()
+{
+  ut_ad(!was_dict_operation);
+  ut_d(was_dict_operation= dict_operation);
+  dict_operation= false;
+
+  commit_persist();
+
+#ifdef UNIV_DEBUG
+  if (!was_dict_operation)
+  {
+    for (const auto &entry : mod_tables)
+    {
+      ut_ad(!entry.second.is_dropped());
+    }
+  }
+#endif /* UNIV_DEBUG */
+  ut_d(was_dict_operation= false);
+
+  commit_cleanup();
+}
+
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback.
+A transaction that has not been started yet is first started as a
+read-write transaction. For a started or prepared transaction,
+trx->lock.wait_thr is reset and the function returns. Any other
+state, including TRX_STATE_COMMITTED_IN_MEMORY, reaches ut_error. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, true);
+ /* fall through */
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->lock.wait_thr = NULL;
+ return;
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+The node is allocated from the given heap; its common.type is set to
+QUE_NODE_COMMIT and its state to COMMIT_NODE_SEND.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	void* const		buf = mem_heap_alloc(heap,
+						     sizeof(commit_node_t));
+	commit_node_t* const	node = static_cast<commit_node_t*>(buf);
+
+	node->common.type = QUE_NODE_COMMIT;
+	node->state = COMMIT_NODE_SEND;
+
+	return node;
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+In the COMMIT_NODE_SEND state the transaction of the query thread is
+committed and NULL is returned; otherwise control is handed back to the
+parent node.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+  que_thr_t* thr) /*!< in: query thread */
+{
+  commit_node_t* const commit_node =
+    static_cast<commit_node_t*>(thr->run_node);
+
+  ut_ad(que_node_get_type(commit_node) == QUE_NODE_COMMIT);
+
+  /* Arriving from the parent node restarts the state machine. */
+  if (thr->prev_node == que_node_get_parent(commit_node)) {
+    commit_node->state = COMMIT_NODE_SEND;
+  }
+
+  if (commit_node->state != COMMIT_NODE_SEND) {
+    ut_ad(commit_node->state == COMMIT_NODE_WAIT);
+
+    commit_node->state = COMMIT_NODE_SEND;
+    thr->run_node = que_node_get_parent(commit_node);
+
+    return thr;
+  }
+
+  commit_node->state = COMMIT_NODE_WAIT;
+
+  trx_t* const trx = thr_get_trx(thr);
+
+  ut_a(trx->lock.wait_thr == NULL);
+
+  trx_commit_or_rollback_prepare(trx);
+
+  trx->commit();
+  ut_ad(trx->lock.wait_thr == NULL);
+
+  return nullptr;
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL. A transaction that was never
+started is reported as successfully committed without doing anything;
+a transaction that is already committed in memory is a fatal error.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+  trx_t* trx) /*!< in/out: transaction */
+{
+  switch (trx->state) {
+  case TRX_STATE_ACTIVE:
+  case TRX_STATE_PREPARED:
+  case TRX_STATE_PREPARED_RECOVERED:
+    /* op_info is set only for the duration of the commit. */
+    trx->op_info = "committing";
+    trx->commit();
+    trx->op_info = "";
+    /* fall through */
+  case TRX_STATE_NOT_STARTED:
+    return DB_SUCCESS;
+  case TRX_STATE_COMMITTED_IN_MEMORY:
+    break;
+  }
+
+  ut_error;
+  return DB_CORRUPTION;
+}
+
+/** Durably write log until trx->commit_lsn
+(if trx_t::commit_in_memory() was invoked with flush_log_later=true).
+Nothing is done when trx->commit_lsn is 0, when
+srv_flush_log_at_trx_commit is 0, or when it is 1 and
+trx->active_commit_ordered is set.
+@param trx transaction */
+void trx_commit_complete_for_mysql(trx_t *trx)
+{
+  if (const lsn_t lsn= trx->commit_lsn)
+  {
+    const auto flush_mode= srv_flush_log_at_trx_commit;
+
+    if (flush_mode == 0)
+      return;
+    if (flush_mode == 1 && trx->active_commit_ordered)
+      return;
+
+    trx_flush_log_if_needed(lsn, trx);
+  }
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. Only valid for a transaction that
+is not started or is active; any other state is a fatal error. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+  trx_t* trx) /*!< in: trx handle */
+{
+  ut_a(trx);
+
+  switch (trx->state) {
+  case TRX_STATE_NOT_STARTED:
+    trx->undo_no = 0;
+    /* fall through */
+  case TRX_STATE_ACTIVE:
+    if (trx->fts_trx) {
+      fts_savepoint_laststmt_refresh(trx);
+    }
+
+    if (!trx->is_bulk_insert()) {
+      trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+      trx->end_bulk_insert();
+    } else {
+      /* MDEV-25036 FIXME: we support buffered
+      insert only for the first insert statement */
+      trx->error_state = trx->bulk_insert_apply();
+      /* Returning without end_bulk_insert() allows a
+      subsequent INSERT into an empty table
+      if !unique_checks && !foreign_key_checks. */
+    }
+    return;
+  case TRX_STATE_PREPARED:
+  case TRX_STATE_PREPARED_RECOVERED:
+  case TRX_STATE_COMMITTED_IN_MEMORY:
+    break;
+  }
+
+  ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction: its identifier (or its address when no
+id is assigned), state, op_info, table usage, lock statistics and undo
+log entry count, followed by the connection info of trx->mysql_thd when
+the transaction has been started. The lock statistics are passed in by
+the caller rather than read from trx here. */
+void
+trx_print_low(
+/*==========*/
+  FILE* f,
+      /*!< in: output stream */
+  const trx_t* trx,
+      /*!< in: transaction */
+  ulint max_query_len,
+      /*!< in: max query length to print,
+      or 0 to use the default max length */
+  ulint n_rec_locks,
+      /*!< in: trx->lock.n_rec_locks */
+  ulint n_trx_locks,
+      /*!< in: length of trx->lock.trx_locks */
+  ulint heap_size)
+      /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+  if (const trx_id_t id = trx->id) {
+    /* Print the value that was tested, not a second read. */
+    fprintf(f, "TRANSACTION " TRX_ID_FMT, id);
+  } else {
+    /* The %p conversion requires a pointer to void. */
+    fprintf(f, "TRANSACTION (%p)",
+            static_cast<const void*>(trx));
+  }
+
+  switch (trx->state) {
+  case TRX_STATE_NOT_STARTED:
+    fputs(", not started", f);
+    goto state_ok;
+  case TRX_STATE_ACTIVE:
+    fprintf(f, ", ACTIVE %lu sec",
+            (ulong) difftime(time(NULL), trx->start_time));
+    goto state_ok;
+  case TRX_STATE_PREPARED:
+  case TRX_STATE_PREPARED_RECOVERED:
+    fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+            (ulong) difftime(time(NULL), trx->start_time));
+    goto state_ok;
+  case TRX_STATE_COMMITTED_IN_MEMORY:
+    fputs(", COMMITTED IN MEMORY", f);
+    goto state_ok;
+  }
+  /* Only reached for a state value outside the cases above. */
+  fprintf(f, ", state %lu", (ulong) trx->state);
+  ut_ad(0);
+state_ok:
+  const char* op_info = trx->op_info;
+
+  if (*op_info) {
+    putc(' ', f);
+    fputs(op_info, f);
+  }
+
+  if (trx->is_recovered) {
+    fputs(" recovered trx", f);
+  }
+
+  putc('\n', f);
+
+  if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+    fprintf(f, "mysql tables in use %lu, locked %lu\n",
+            (ulong) trx->n_mysql_tables_in_use,
+            (ulong) trx->mysql_n_tables_locked);
+  }
+
+  /* Tracks whether anything was printed on the current line, so
+  that it is terminated exactly once below. */
+  bool newline = true;
+
+  if (trx->in_rollback) { /* dirty read for performance reasons */
+    fputs("ROLLING BACK ", f);
+  } else if (trx->lock.wait_lock) {
+    fputs("LOCK WAIT ", f);
+  } else {
+    newline = false;
+  }
+
+  if (n_trx_locks > 0 || heap_size > 400) {
+    newline = true;
+
+    fprintf(f, "%lu lock struct(s), heap size %lu,"
+            " %lu row lock(s)",
+            (ulong) n_trx_locks,
+            (ulong) heap_size,
+            (ulong) n_rec_locks);
+  }
+
+  if (trx->undo_no != 0) {
+    newline = true;
+    fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+  }
+
+  if (newline) {
+    putc('\n', f);
+  }
+
+  if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
+    innobase_mysql_print_thd(
+      f, trx->mysql_thd, static_cast<uint>(max_query_len));
+  }
+}
+
+/**********************************************************************//**
+Prints info about a transaction, reading the lock statistics directly
+from trx->lock.
+The caller must hold lock_sys.latch.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+  FILE* f, /*!< in: output stream */
+  const trx_t* trx, /*!< in: transaction */
+  ulint max_query_len) /*!< in: max query length to print,
+      or 0 to use the default max length */
+{
+  lock_sys.assert_locked();
+
+  const ulint rec_lock_count = trx->lock.n_rec_locks;
+  const ulint lock_struct_count = UT_LIST_GET_LEN(trx->lock.trx_locks);
+  const ulint lock_heap_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+  trx_print_low(f, trx, max_query_len,
+                rec_lock_count, lock_struct_count, lock_heap_bytes);
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.latch: the lock statistics are sampled
+inside the guarded scope and the printing happens after it. */
+TRANSACTIONAL_TARGET
+void
+trx_print(
+/*======*/
+  FILE* f, /*!< in: output stream */
+  const trx_t* trx, /*!< in: transaction */
+  ulint max_query_len) /*!< in: max query length to print,
+      or 0 to use the default max length */
+{
+  ulint rec_lock_count;
+  ulint lock_struct_count;
+  ulint lock_heap_bytes;
+
+  {
+    TMLockMutexGuard g{SRW_LOCK_CALL};
+    rec_lock_count= trx->lock.n_rec_locks;
+    lock_struct_count= UT_LIST_GET_LEN(trx->lock.trx_locks);
+    lock_heap_bytes= mem_heap_get_size(trx->lock.lock_heap);
+  }
+
+  trx_print_low(f, trx, max_query_len,
+                rec_lock_count, lock_struct_count, lock_heap_bytes);
+}
+
+/** Prepare a transaction: set the prepared state on its undo logs.
+The m_noredo undo log (if any) is updated in a mini-transaction with
+MTR_LOG_NO_REDO; the m_redo undo log (if any) is updated in a second
+mini-transaction whose commit LSN is returned.
+@param trx transaction (must not be a recovered one)
+@return log sequence number that makes the XA PREPARE durable
+@retval 0 if no changes needed to be made durable */
+static lsn_t trx_prepare_low(trx_t *trx)
+{
+  ut_ad(!trx->is_recovered);
+
+  mtr_t mtr;
+
+  if (trx_undo_t *noredo_undo= trx->rsegs.m_noredo.undo)
+  {
+    ut_ad(noredo_undo->rseg == trx->rsegs.m_noredo.rseg);
+
+    mtr.start();
+    mtr.set_log_mode(MTR_LOG_NO_REDO);
+    trx_undo_set_state_at_prepare(trx, noredo_undo, false, &mtr);
+    mtr.commit();
+  }
+
+  trx_undo_t *redo_undo= trx->rsegs.m_redo.undo;
+
+  if (!redo_undo)
+    /* There were no changes to persistent tables. */
+    return 0;
+
+  ut_ad(redo_undo->rseg == trx->rsegs.m_redo.rseg);
+
+  mtr.start();
+
+  /* Change the undo log segment states from TRX_UNDO_ACTIVE to
+  TRX_UNDO_PREPARED: these modifications to the file data
+  structure define the transaction as prepared in the file-based
+  world, at the serialization point of lsn. */
+  trx_undo_set_state_at_prepare(trx, redo_undo, false, &mtr);
+
+  /* Make the XA PREPARE durable. */
+  mtr.commit();
+  ut_ad(mtr.commit_lsn() > 0);
+  return mtr.commit_lsn();
+}
+
+/****************************************************************//**
+Prepares a transaction: marks its undo logs prepared, moves it from
+TRX_STATE_ACTIVE to TRX_STATE_PREPARED, and, if anything had to be made
+durable, writes the log and possibly releases locks. */
+TRANSACTIONAL_TARGET
+static
+void
+trx_prepare(
+/*========*/
+  trx_t* trx) /*!< in/out: transaction */
+{
+  /* Only fresh user transactions can be prepared.
+  Recovered transactions cannot. */
+  ut_a(!trx->is_recovered);
+
+  const lsn_t prepare_lsn = trx_prepare_low(trx);
+
+  ut_a(trx->state == TRX_STATE_ACTIVE);
+  {
+    TMTrxGuard tg{*trx};
+    trx->state = TRX_STATE_PREPARED;
+  }
+
+  if (!prepare_lsn) {
+    /* Nothing needs to be made durable. */
+    return;
+  }
+
+  /* Depending on the my.cnf options, we may now write the log
+  buffer to the log files, making the prepared state of the
+  transaction durable if the OS does not crash. We may also
+  flush the log files to disk, making the prepared state of the
+  transaction durable also at an OS crash or a power outage.
+
+  The idea in InnoDB's group prepare is that a group of
+  transactions gather behind a trx doing a physical disk write
+  to log files, and when that physical write has been completed,
+  one of those transactions does a write which prepares the whole
+  group. Note that this group prepare will only bring benefit if
+  there are > 2 users in the database. Then at least 2 users can
+  gather behind one doing the physical log write to disk.
+
+  We must not be holding any mutexes or latches here. */
+  if (const auto flush_mode = srv_flush_log_at_trx_commit) {
+    log_write_up_to(prepare_lsn,
+                    (flush_mode & 1)
+                    && srv_file_flush_method != SRV_NOSYNC);
+  }
+
+  /* Locks are released only when all of the following hold:
+  - the transaction holds some lock structs;
+  - the isolation level is not SERIALIZABLE;
+  - the statement is XA PREPARE issued by a connection, that is,
+    not XA COMMIT ONE PHASE and not an internal distributed
+    transaction (XID::get_my_xid() would be nonzero). */
+  if (UT_LIST_GET_LEN(trx->lock.trx_locks)
+      && trx->isolation_level != TRX_ISO_SERIALIZABLE
+      && trx->mysql_thd
+      && thd_sql_command(trx->mysql_thd) == SQLCOM_XA_PREPARE) {
+    lock_release_on_prepare(trx);
+  }
+}
+
+/** XA PREPARE a transaction. The transaction is started first if
+necessary, and trx->op_info reads "preparing" for the duration of the
+operation.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx)
+{
+  trx_start_if_not_started_xa(trx, false);
+
+  /* Advertise the operation while trx_prepare() runs. */
+  trx->op_info = "preparing";
+  trx_prepare(trx);
+  trx->op_info = "";
+}
+
+
+/* Argument passed to trx_recover_for_mysql_callback(). */
+struct trx_recover_for_mysql_callback_arg
+{
+ /* Output array that receives the XIDs of prepared transactions. */
+ XID *xid_list;
+ /* Number of slots in xid_list. */
+ uint len;
+ /* Number of prepared transactions encountered so far; this keeps
+ counting after len has been reached. */
+ uint count;
+};
+
+
+/** Callback for collecting prepared transactions.
+For each transaction in TRX_STATE_PREPARED, arg->count is incremented;
+while there is room in arg->xid_list, the XID is stored there and the
+transaction is moved to TRX_STATE_PREPARED_RECOVERED.
+@param element  hash element being visited
+@param arg      output array, its capacity and the running count
+@return false always, so that the iteration visits all elements */
+static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
+  trx_recover_for_mysql_callback_arg *arg)
+{
+  DBUG_ASSERT(arg->len > 0);
+  element->mutex.wr_lock();
+  if (trx_t *trx= element->trx)
+  {
+    /*
+      The state of a read-write transaction can only change from ACTIVE to
+      PREPARED while we are holding the element->mutex. But since it is
+      executed at startup no state change should occur.
+    */
+    if (trx_state_eq(trx, TRX_STATE_PREPARED))
+    {
+      ut_ad(trx->is_recovered);
+      ut_ad(trx->id);
+      if (arg->count == 0)
+        ib::info() << "Starting recovery for XA transactions...";
+      /* Take the slot index before counting, and touch xid_list only
+      after the bounds check: arg->count may exceed arg->len, and
+      xid_list[count] must not even be named in that case. */
+      const uint slot= arg->count++;
+      if (slot < arg->len)
+      {
+        trx->state= TRX_STATE_PREPARED_RECOVERED;
+        ib::info() << "Transaction " << trx->id
+                   << " in prepared state after recovery";
+        ib::info() << "Transaction contains changes to " << trx->undo_no
+                   << " rows";
+        arg->xid_list[slot]= trx->xid;
+      }
+    }
+  }
+  element->mutex.wr_unlock();
+  /* Do not terminate upon reaching arg->len; count all transactions */
+  return false;
+}
+
+
+/** Callback that reverts a transaction from TRX_STATE_PREPARED_RECOVERED
+back to TRX_STATE_PREPARED, under element->mutex.
+@param element  hash element being visited
+@return false always, so that the iteration visits all elements */
+static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
+  void*)
+{
+  element->mutex.wr_lock();
+  trx_t *trx= element->trx;
+  if (trx && trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
+    trx->state= TRX_STATE_PREPARED;
+  element->mutex.wr_unlock();
+  return false;
+}
+
+
+/**
+  Find prepared transaction objects for recovery.
+
+  All prepared transactions are counted, but at most len of them are
+  stored. When everything fit into xid_list, the transaction states are
+  reset so that a later call returns the same collection again.
+
+  @param[out] xid_list  prepared transactions
+  @param[in]  len       number of slots in xid_list
+
+  @return number of prepared transactions stored in xid_list
+*/
+
+int trx_recover_for_mysql(XID *xid_list, uint len)
+{
+  ut_ad(xid_list);
+  ut_ad(len);
+
+  trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
+
+  /* Fill xid_list with PREPARED transactions. */
+  trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
+
+  if (!arg.count)
+    return 0;
+
+  ib::info() << arg.count
+             << " transactions in prepared state after recovery";
+
+  if (arg.count > len)
+    /* The list was truncated; only len entries were stored. */
+    return int(len);
+
+  /* After returning the full list, reset the state, because
+  init_server_components() wants to recover the collection of
+  transactions twice, by first calling tc_log->open() and then
+  ha_recover() directly. */
+  trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
+  return int(arg.count);
+}
+
+
+/* Argument passed to trx_get_trx_by_xid_callback(). */
+struct trx_get_trx_by_xid_callback_arg
+{
+ /* The XID to look for. */
+ const XID *xid;
+ /* Set by the callback to the matching transaction, if one is found. */
+ trx_t *trx;
+};
+
+
+/* Callback that checks whether the visited element holds a recovered
+transaction in PREPARED or PREPARED_RECOVERED state whose XID equals
+arg->xid. On a match, the transaction is stored in arg->trx.
+Returns 1 on a match and 0 otherwise. */
+static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
+ trx_get_trx_by_xid_callback_arg *arg)
+{
+ my_bool found= 0;
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ /* The state and the XID are examined (and the XID possibly
+ invalidated) while holding the transaction mutex. */
+ trx->mutex_lock();
+ if (trx->is_recovered &&
+ (trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
+ arg->xid->eq(&trx->xid))
+ {
+#ifdef WITH_WSREP
+ /* The commit of a prepared recovered Galera
+ transaction needs a valid trx->xid for
+ invoking trx_sys_update_wsrep_checkpoint(). */
+ /* Note: this conditional governs only the trx->xid.null()
+ statement below; without WITH_WSREP that statement is
+ unconditional. */
+ if (!wsrep_is_wsrep_xid(&trx->xid))
+#endif /* WITH_WSREP */
+ /* Invalidate the XID, so that subsequent calls will not find it. */
+ trx->xid.null();
+ arg->trx= trx;
+ found= 1;
+ }
+ trx->mutex_unlock();
+ }
+ element->mutex.wr_unlock();
+ return found;
+}
+
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier (may be NULL)
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match, or if xid is NULL */
+trx_t* trx_get_trx_by_xid(const XID* xid)
+{
+  if (!xid)
+    return nullptr;
+
+  trx_get_trx_by_xid_callback_arg lookup= { xid, nullptr };
+  trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &lookup);
+  return lookup.trx;
+}
+
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. An active transaction
+without an id is switched to read-write mode when read_write is
+requested and the transaction is not tagged read-only. Prepared or
+committed transactions are a fatal error. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+  trx_t* trx, /*!< in/out: transaction */
+  bool read_write) /*!< in: true if read write transaction */
+{
+  switch (trx->state) {
+  case TRX_STATE_PREPARED:
+  case TRX_STATE_PREPARED_RECOVERED:
+  case TRX_STATE_COMMITTED_IN_MEMORY:
+    break;
+
+  case TRX_STATE_NOT_STARTED:
+    trx_start_low(trx, read_write);
+    return;
+
+  case TRX_STATE_ACTIVE:
+    /* If the transaction is tagged as read-only then
+    it can only write to temp tables and for such
+    transactions we don't want to move them to the
+    trx_sys_t::rw_trx_hash. */
+    if (trx->id == 0 && read_write && !trx->read_only) {
+      trx_set_rw_mode(trx);
+    }
+    return;
+  }
+
+  ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. An active transaction
+is switched to read-write mode when read_write is requested, no id has
+been assigned yet and the transaction is not tagged read-only. Prepared
+or committed transactions are a fatal error. */
+void
+trx_start_if_not_started_low(
+/*==========================*/
+  trx_t* trx, /*!< in: transaction */
+  bool read_write) /*!< in: true if read write transaction */
+{
+  switch (trx->state) {
+  case TRX_STATE_PREPARED:
+  case TRX_STATE_PREPARED_RECOVERED:
+  case TRX_STATE_COMMITTED_IN_MEMORY:
+    break;
+
+  case TRX_STATE_ACTIVE:
+    if (read_write && trx->id == 0 && !trx->read_only) {
+      trx_set_rw_mode(trx);
+    }
+    return;
+
+  case TRX_STATE_NOT_STARTED:
+    trx_start_low(trx, read_write);
+    return;
+  }
+
+  ut_error;
+}
+
+/**
+Start a transaction for internal processing. The transaction is flagged
+with will_lock before it is started.
+@param trx         transaction
+@param read_write  whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write)
+{
+  /* The flag must be in place before trx_start_low() runs. */
+  trx->will_lock= true;
+
+  trx_start_low(trx, read_write);
+}
+
+/** Start a read-write internal transaction for a DDL operation.
+@param trx  transaction */
+void trx_start_for_ddl_low(trx_t *trx)
+{
+  /* Flag this transaction as a dictionary operation, so that
+  the data dictionary will be locked in crash recovery. */
+  trx->dict_operation= true;
+
+  trx_start_internal_low(trx, true);
+}
+
+/*************************************************************//**
+Set the transaction as a read-write transaction if it is not already
+tagged as such. Read-only transactions that are writing to temporary
+tables are assigned an ID and a rollback segment but are not added
+to the trx read-write list because their updates should not be visible
+to other transactions and therefore their changes can be ignored by
+MVCC. Nothing is done when high_level_read_only is set. */
+void
+trx_set_rw_mode(
+/*============*/
+  trx_t* trx) /*!< in/out: transaction that is RW */
+{
+  ut_ad(trx->rsegs.m_redo.rseg == 0);
+  ut_ad(!trx->is_autocommit_non_locking());
+  ut_ad(!trx->read_only);
+  ut_ad(trx->id == 0);
+
+  if (!high_level_read_only) {
+    trx_assign_rseg_low(trx);
+
+    /* So that we can see our own changes. */
+    if (trx->read_view.is_open()) {
+      trx->read_view.set_creator_trx_id(trx->id);
+    }
+  }
+}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000..203edd9f
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,1478 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "log.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may also have to purge
+old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+ In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+When a transaction does its first insert or modify in the clustered index, an
+undo log is assigned for it. Then we must have an x-latch to the rollback
+segment header.
+ When the transaction performs modifications or rolls back, its
+undo log is protected by undo page latches.
+Only the thread that is associated with the transaction may hold multiple
+undo page latches at a time. Undo pages are always private to a single
+transaction. Other threads that are performing MVCC reads
+or checking for implicit locks will lock at most one undo page at a time
+in trx_undo_get_undo_rec_low().
+ When the transaction commits, its persistent undo log is added
+to the history list. If it is not suitable for reuse, its slot is reset.
+In both cases, an x-latch must be acquired on the rollback segment header page.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset);/*!< in: undo log header byte offset on page */
+
+/** Determine the start offset of undo log records of an undo log page.
+On the undo log header page the offset is read from the TRX_UNDO_LOG_START
+field of the undo log header; on any other page the records start right
+after the undo page header.
+@param[in] block    undo log page
+@param[in] page_no  undo log header page number
+@param[in] offset   undo log header offset
+@return start offset */
+static
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+                                 uint16_t offset)
+{
+  if (page_no != block->page.id().page_no())
+    return TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+
+  return mach_read_from_2(offset + TRX_UNDO_LOG_START + block->page.frame);
+}
+
+/** Get the first undo log record on a page.
+The page holds no record of this undo log when the start offset equals
+the end offset.
+@param[in] block    undo log page
+@param[in] page_no  undo log header page number
+@param[in] offset   undo log header page offset
+@return pointer to first record
+@retval nullptr if none exists */
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+                            uint16_t offset)
+{
+  const uint16_t first= trx_undo_page_get_start(block, page_no, offset);
+
+  if (first == trx_undo_page_get_end(block, page_no, offset))
+    return nullptr;
+
+  return block->page.frame + first;
+}
+
+/** Get the last undo log record on a page.
+The offset of the last record is read from the two bytes that precede
+the end offset.
+@param[in] block    undo log page
+@param[in] page_no  undo log header page number
+@param[in] offset   undo log header page offset
+@return pointer to last record
+@retval nullptr if none exists */
+static
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+                           uint16_t offset)
+{
+  const uint16_t last_end= trx_undo_page_get_end(block, page_no, offset);
+
+  if (trx_undo_page_get_start(block, page_no, offset) == last_end)
+    return nullptr;
+
+  return block->page.frame
+    + mach_read_from_2(block->page.frame + last_end - 2);
+}
+
+/** Get the previous record in an undo log from the previous page.
+@param[in,out] block    undo log page; replaced by the previous page
+                        (or by nullptr if it could not be acquired)
+@param[in]     rec      undo record offset in the page (not used here)
+@param[in]     page_no  undo log header page number
+@param[in]     offset   undo log header offset on page
+@param[in]     shared   latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr      mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+                                     uint32_t page_no, uint16_t offset,
+                                     bool shared, mtr_t *mtr)
+{
+  const byte *prev_page_field= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+    FLST_PREV + FIL_ADDR_PAGE + block->page.frame;
+  const uint32_t prev_page_no= mach_read_from_4(prev_page_field);
+
+  if (prev_page_no == FIL_NULL)
+    return nullptr;
+
+  block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+                      0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+
+  if (!block)
+    return nullptr;
+
+  return trx_undo_page_get_last_rec(block, page_no, offset);
+}
+
+/** Get the previous undo log record on the same page.
+The offset of the previous record is read from the two bytes that
+precede rec.
+@param[in] block    undo log page
+@param[in] rec      undo log record
+@param[in] page_no  undo log header page number
+@param[in] offset   undo log header page offset
+@return pointer to record
+@retval nullptr if rec is the first record on the page */
+static
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+                           uint32_t page_no, uint16_t offset)
+{
+  ut_ad(block->page.frame == page_align(rec));
+
+  const trx_undo_rec_t *first_rec=
+    block->page.frame + trx_undo_page_get_start(block, page_no, offset);
+
+  if (rec == first_rec)
+    return nullptr;
+
+  return block->page.frame + mach_read_from_2(rec - 2);
+}
+
+/** Get the previous record in an undo log, looking first on the current
+page and then on the previous page of the undo log.
+@param[in,out] block    undo log page
+@param[in]     rec      undo record offset in the page
+@param[in]     page_no  undo log header page number
+@param[in]     offset   undo log header offset on page
+@param[in]     shared   latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr      mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+                      uint16_t offset, bool shared, mtr_t *mtr)
+{
+  trx_undo_rec_t *prev_rec=
+    trx_undo_page_get_prev_rec(block, block->page.frame + rec,
+                               page_no, offset);
+
+  if (!prev_rec)
+    /* We have to go to the previous undo log page to look for the
+    previous record */
+    prev_rec= trx_undo_get_prev_rec_from_prev_page(block, rec, page_no,
+                                                   offset, shared, mtr);
+
+  return prev_rec;
+}
+
+/** Get the next record in an undo log from the next page.
+No next page is consulted when the current page is the undo log header
+page and the TRX_UNDO_NEXT_LOG field of the header is nonzero.
+@param[in,out] block    undo log page; replaced by the next page
+                        (or by nullptr if it could not be acquired)
+@param[in]     page_no  undo log header page number
+@param[in]     offset   undo log header offset on page
+@param[in]     mode     latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out] mtr      mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
+                                     uint32_t page_no, uint16_t offset,
+                                     ulint mode, mtr_t *mtr)
+{
+  const bool on_header_page= page_no == block->page.id().page_no();
+
+  if (on_header_page &&
+      mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG))
+    return nullptr;
+
+  const uint32_t next_page_no=
+    mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+                     FLST_NEXT + FIL_ADDR_PAGE + block->page.frame);
+
+  if (next_page_no == FIL_NULL)
+    return nullptr;
+
+  block= buf_page_get_gen(page_id_t(block->page.id().space(), next_page_no),
+                          0, mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+  if (!block)
+    return nullptr;
+
+  return trx_undo_page_get_first_rec(block, page_no, offset);
+}
+
+/** Get the first record in an undo log, looking first on the undo log
+header page and then on the page that follows it.
+@param[in]     space    undo log header space
+@param[in]     page_no  undo log header page number
+@param[in]     offset   undo log header offset on page
+@param[in]     mode     latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out]    block    undo log page
+@param[in,out] mtr      mini-transaction
+@param[out]    err      error code (passed to the header page lookup only)
+@return undo log record, the page latched
+@retval nullptr if none */
+static trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+                       uint16_t offset, ulint mode, const buf_block_t*& block,
+                       mtr_t *mtr, dberr_t *err)
+{
+  block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
+                          nullptr, BUF_GET, mtr, err);
+  if (!block)
+    return nullptr;
+
+  trx_undo_rec_t *first= trx_undo_page_get_first_rec(block, page_no, offset);
+
+  return first
+    ? first
+    : trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+                                           mtr);
+}
+
+/* Parse one undo log record and, if the table it refers to reports
+is_active_ddl(), pass the change on via log_insert() or log_update().
+Does nothing when rec is null. */
+inline void UndorecApplier::apply_undo_rec(const trx_undo_rec_t *rec)
+{
+ undo_rec= rec;
+ if (!undo_rec)
+ return;
+ /* Remember the position of the record within its page. */
+ offset= page_offset(undo_rec);
+
+ bool updated_extern= false;
+ undo_no_t undo_no= 0;
+ table_id_t table_id= 0;
+ /* Parse the record header; undo_rec is advanced as the record is
+ parsed by this and the following trx_undo_*() calls. */
+ undo_rec= trx_undo_rec_get_pars(undo_rec, &type,
+ &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ dict_table_t *table= dict_sys.find_table(table_id);
+ dict_sys.unfreeze();
+
+ ut_ad(table);
+ /* NOTE(review): this early return does not call clear_undo_rec(),
+ unlike the normal exit below; confirm that this is intended. */
+ if (!table->is_active_ddl())
+ return;
+
+ dict_index_t *index= dict_table_get_first_index(table);
+ const dtuple_t *undo_tuple;
+ switch (type) {
+ default:
+ ut_ad("invalid type" == 0);
+ MY_ASSERT_UNREACHABLE();
+ case TRX_UNDO_INSERT_REC:
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ /* Also reached by the goto below for TRX_UNDO_UPD_DEL_REC. */
+ insert:
+ log_insert(*undo_tuple, index);
+ break;
+ case TRX_UNDO_UPD_EXIST_REC:
+ case TRX_UNDO_UPD_DEL_REC:
+ case TRX_UNDO_DEL_MARK_REC:
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ undo_rec= trx_undo_update_rec_get_sys_cols(
+ undo_rec, &trx_id, &roll_ptr, &info_bits);
+
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ undo_rec= trx_undo_update_rec_get_update(undo_rec, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ /* TRX_UNDO_UPD_DEL_REC is logged as an insert, not as an update. */
+ if (type == TRX_UNDO_UPD_DEL_REC)
+ goto insert;
+ log_update(*undo_tuple, index);
+ }
+
+ clear_undo_rec();
+}
+
+/** Apply any changes to tables for which online DDL is in progress.
+Walks the pages of the m_redo undo log starting at its header page and
+passes every undo record to UndorecApplier::apply_undo_rec().
+Does nothing if there is no m_redo undo log or undo_no is 0. */
+ATTRIBUTE_COLD void trx_t::apply_log()
+{
+ const trx_undo_t *undo= rsegs.m_redo.undo;
+ if (!undo || !undo_no)
+ return;
+ /* page_id stays at the undo log header page; next_page_id tracks
+ the page that is currently being processed. */
+ page_id_t page_id{rsegs.m_redo.rseg->space->id, undo->hdr_page_no};
+ page_id_t next_page_id(page_id);
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t *block= buf_page_get(page_id, 0, RW_S_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ {
+ mtr.commit();
+ return;
+ }
+
+ UndorecApplier log_applier(page_id, id);
+
+ for (;;)
+ {
+ trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_id.page_no(),
+ undo->hdr_offset);
+ while (rec)
+ {
+ /* Keep the block buffer-fixed across the mtr.commit(), so that
+ the record can be applied without the mini-transaction. */
+ block->page.fix();
+ mtr.commit();
+ /* Since we are the only thread who could write to this undo page,
+ it is safe to dereference rec while only holding a buffer-fix. */
+ log_applier.apply_undo_rec(rec);
+ /* Start a new mini-transaction and lock the still-fixed block in
+ it before reading the next record. */
+ mtr.start();
+ mtr.page_lock(block, RW_S_LATCH);
+ rec= trx_undo_page_get_next_rec(block, page_offset(rec),
+ page_id.page_no(), undo->hdr_offset);
+ }
+
+ /* Read the page number of the next page from the current page
+ before committing the mini-transaction. */
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE +
+ block->page.frame);
+ if (next == FIL_NULL)
+ break;
+ next_page_id.set_page_no(next);
+ mtr.commit();
+ mtr.start();
+ block= buf_page_get_gen(next_page_id, 0, RW_S_LATCH, block, BUF_GET, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ break;
+ log_applier.assign_next(next_page_id);
+ }
+ /* Every path that leaves the loop has a started mini-transaction. */
+ mtr.commit();
+ apply_online_log= false;
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+The page frame is written directly; no mini-transaction is involved in
+this function.
+@see mtr_t::undo_create()
+@param block undo log page */
+void trx_undo_page_init(const buf_block_t &block)
+{
+ /* Tag the page as an undo log page. */
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.page.frame),
+ FIL_PAGE_UNDO_LOG);
+ static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+ /* Zero the TRX_UNDO_PAGE_TYPE field. */
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.page.frame,
+ 0, 2);
+ /* The records start right after the undo page header... */
+ mach_write_to_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame),
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ /* ...and TRX_UNDO_PAGE_FREE gets the same value as TRX_UNDO_PAGE_START,
+ copied from the field that was just written. */
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame,
+ 2);
+ /* The following corresponds to flst_zero_both(), but without writing log. */
+ /* For both FLST_PREV and FLST_NEXT, the page number bytes are set to
+ 0xff (presumably FIL_NULL - confirm) and the byte offset to 0. */
+ memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
+ static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+ TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+ /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ block.page.frame, 0,
+ srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ FIL_PAGE_DATA_END));
+}
+
+/** Find the first unused undo log slot in a rollback segment header.
+@param rseg_header rollback segment header
+@return index of the first slot whose page number is FIL_NULL
+@retval ULINT_UNDEFINED if every examined slot is in use */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
+{
+  ulint n_slots= TRX_RSEG_N_SLOTS;
+
+#ifdef UNIV_DEBUG
+  /* Debug builds may restrict the number of slots that are examined. */
+  if (trx_rseg_n_slots_debug)
+    n_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
+
+  ulint slot= 0;
+  while (slot < n_slots &&
+         trx_rsegf_get_nth_undo(rseg_header, slot) != FIL_NULL)
+    slot++;
+
+  if (slot < n_slots)
+    return slot;
+
+  return ULINT_UNDEFINED;
+}
+
+/** Create an undo log segment.
+Finds a free undo slot in the rollback segment header, allocates a new
+file segment for the undo log, initializes its header page as an undo
+page with an empty page list containing only itself, and finally stores
+the page number in the slot.
+@param[in,out]	space		tablespace
+@param[in,out]	rseg_hdr	rollback segment header (x-latched)
+@param[out]	id		undo slot number (assigned only on success)
+@param[out]	err		error code
+@param[in,out]	mtr		mini-transaction
+@return	undo log block
+@retval	NULL	on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+                    dberr_t *err, mtr_t *mtr)
+{
+  buf_block_t* block;
+  uint32_t n_reserved;
+
+  const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+
+  if (slot_no == ULINT_UNDEFINED) {
+    ib::warn() << "Cannot find a free slot for an undo log. Do"
+      " you have too many active transactions running"
+      " concurrently?";
+
+    *err = DB_TOO_MANY_CONCURRENT_TRXS;
+    return NULL;
+  }
+
+  ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
+  *err = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+    return NULL;
+  }
+
+  /* Allocate a new file segment for the undo log */
+  block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+                      mtr, err, true);
+
+  /* The reservation is only needed for the duration of fseg_create(). */
+  space->release_free_extents(n_reserved);
+
+  if (!block) {
+    return block;
+  }
+
+  mtr->undo_create(*block);
+  trx_undo_page_init(*block);
+
+  /* On the segment header page the first free byte is after the
+  undo segment header, and there is no undo log header yet. */
+  mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+                + block->page.frame,
+                TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+  mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+                                 TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+                                 + block->page.frame, 0U);
+
+  flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+            + block->page.frame, mtr);
+
+  *err = flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+                       block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+                       mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+    /* Report the failure instead of overwriting it with DB_SUCCESS,
+    and do not publish a segment whose page list is broken in the
+    rollback segment header. */
+    return NULL;
+  }
+
+  *id = slot_no;
+  mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+                + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->page.frame,
+                block->page.id().page_no());
+
+  /* *err is DB_SUCCESS here, as checked after flst_add_last(). */
+  return block;
+}
+
+/** Initialize an undo log header.
+The header is placed at the current TRX_UNDO_PAGE_FREE offset of the page.
+TRX_UNDO_PAGE_START and TRX_UNDO_PAGE_FREE are both advanced by
+TRX_UNDO_LOG_XA_HDR_SIZE, the segment state is set to TRX_UNDO_ACTIVE,
+TRX_UNDO_LAST_LOG is pointed at the new header, and the new header is
+linked to the previous log header on the page, if there is one.
+@param[in,out] undo_page undo log segment header page
+@param[in] trx_id transaction identifier
+@param[in,out] mtr mini-transaction
+@return header byte offset on page */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+ mtr_t* mtr)
+{
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->page.frame);
+ ut_ad(mach_read_from_2(undo_type) <= 2);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->page.frame);
+ /* start + 2 is TRX_UNDO_PAGE_FREE (see the static_assert below);
+ the new log header will start at this offset. */
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* A WRITE of 2 bytes is never longer than a MEMMOVE.
+ So, WRITE 2+2 bytes is better than WRITE+MEMMOVE.
+ But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->page.frame);
+ ut_ad(prev_log < free);
+ /* Write TRX_UNDO_STATE=TRX_UNDO_ACTIVE and TRX_UNDO_LAST_LOG=free
+ with a single 4-byte copy. */
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy<mtr_t::MAYBE_NOP>
+ (*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->page.frame), buf, 4);
+ /* Link the previous log header on this page to the new one. */
+ if (prev_log)
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG +
+ undo_page->page.frame, free);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+ undo_page->page.frame, trx_id);
+ /* Zero out TRX_UNDO_TRX_NO only if it is not already zero. */
+ if (UNIV_UNLIKELY(mach_read_from_8(free + TRX_UNDO_TRX_NO +
+ undo_page->page.frame) != 0))
+ mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0);
+
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+ undo_page->page.frame, buf, 4);
+ /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ /* Zero-fill around TRX_UNDO_PREV_LOG, which gets the offset of the
+ previous log header. */
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->page.frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
+ }
+ else
+ /* No previous log: the whole range, including TRX_UNDO_PREV_LOG,
+ is zero-filled in one operation. */
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+ return free;
+}
+
+/** Write X/Open XA Transaction Identifier (XID) to undo log header.
+Writes the format identifier, the two length fields and the
+gtrid_length + bqual_length bytes of XID data; any remaining bytes of the
+XIDDATASIZE-byte data area are zero-filled.
+@param[in,out] block undo header page
+@param[in] offset undo header record offset; must be equal to the
+ TRX_UNDO_LAST_LOG field of the page (debug assertion)
+@param[in] xid distributed transaction identifier
+@param[in,out] mtr mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+ const XID &xid, mtr_t *mtr)
+{
+ DBUG_ASSERT(xid.gtrid_length > 0);
+ DBUG_ASSERT(xid.bqual_length >= 0);
+ DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+ DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+ static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+ "gtrid and bqual don't fit xid data");
+ DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ block->page.frame) == offset);
+
+ trx_ulogf_t* log_hdr= block->page.frame + offset;
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+ static_cast<uint32_t>(xid.formatID));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+ static_cast<uint32_t>(xid.gtrid_length));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ static_cast<uint32_t>(xid.bqual_length));
+ /* Only the used prefix of xid.data is copied. */
+ const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ + xid.bqual_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block,
+ &block->page.frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
+ /* Zero-fill the unused tail of the on-page XID data area. */
+ if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+ mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+ XIDDATASIZE - xid_length, 0);
+}
+
+/** Read the X/Open XA Transaction Identification (XID) from an undo log
+header.
+@param[in]	log_hdr	undo log header
+@param[out]	xid	transaction identification; formatID, gtrid_length,
+			bqual_length and all XIDDATASIZE bytes of data are
+			overwritten */
+static void trx_undo_read_xid(const trx_ulogf_t *log_hdr, XID *xid)
+{
+  const auto format= mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+  xid->formatID= static_cast<long>(format);
+
+  const auto gtrid_len= mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+  xid->gtrid_length= static_cast<long>(gtrid_len);
+
+  const auto bqual_len= mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+  xid->bqual_length= static_cast<long>(bqual_len);
+
+  /* The data area is always copied in full, regardless of the lengths. */
+  memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/** Allocate an undo log page.
+The new page is allocated from the file segment of the undo log,
+initialized as an undo page and appended to the page list of the undo
+log. undo->last_page_no is updated as soon as the page has been
+allocated; undo->size and rseg->curr_size are incremented only if the
+page was also added to the list. rseg->latch is held exclusively for the
+duration of the call.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
+@return X-latched block if success
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+{
+ buf_block_t *new_block= nullptr;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ trx_rseg_t *rseg= undo->rseg;
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ buf_block_t *header_block=
+ buf_page_get_gen(page_id_t{rseg->space->id, undo->hdr_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
+ if (!header_block)
+ goto func_exit;
+ *err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ goto func_exit;
+
+ /* undo->top_page_no + 1 is passed as the allocation hint together
+ with the direction FSP_UP. */
+ new_block=
+ fseg_alloc_free_page_general(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ header_block->page.frame,
+ undo->top_page_no + 1, FSP_UP, true,
+ mtr, mtr, err);
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block)
+ goto func_exit;
+
+ undo->last_page_no= new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+ *err= flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ new_block= nullptr;
+ else
+ {
+ undo->size++;
+ rseg->curr_size++;
+ }
+
+func_exit:
+ rseg->latch.wr_unlock();
+ return new_block;
+}
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+The page is removed from the page list of the undo log and returned to
+the file segment. rseg->curr_size is decremented, and if in_history
+holds, TRX_RSEG_HISTORY_SIZE in the rollback segment header is
+decremented as well.
+@return last page number in remaining log
+@retval FIL_NULL on failure; *err then holds the error code */
+static
+uint32_t
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ bool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ uint32_t hdr_page_no, /*!< in: header page number */
+ uint32_t page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr, /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+ dberr_t* err) /*!< out: error code */
+{
+ ut_a(hdr_page_no != page_no);
+
+ /* X-latch the page to be freed first, and then the header page. */
+ buf_block_t* undo_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!undo_block)) {
+ return FIL_NULL;
+ }
+ buf_block_t* header_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ hdr_page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!header_block)) {
+ return FIL_NULL;
+ }
+
+ *err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
+
+ *err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->page.frame,
+ rseg->space, page_no, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
+ buf_page_free(rseg->space, page_no, mtr);
+
+ /* The return value is the last page of the list after the removal. */
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + header_block->page.frame);
+ rseg->curr_size--;
+
+ /* Nothing more to do unless the undo log is in the history list. */
+ if (!in_history) {
+ } else if (buf_block_t* rseg_header = rseg->get(mtr, err)) {
+ byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->page.frame;
+ uint32_t hist_size = mach_read_from_4(rseg_hist_size);
+ ut_ad(hist_size > 0);
+ mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ } else {
+ /* The rollback segment header could not be acquired. */
+ return FIL_NULL;
+ }
+
+ return(last_addr.page);
+}
+
+/** Free the last page of an undo log. The caller must hold the rseg mutex.
+undo->size is decremented before the page is freed, and
+undo->last_page_no is replaced with the value returned by
+trx_undo_free_page().
+@param[in,out]	undo	undo log; its last page must not be the header page
+@param[in,out]	mtr	mini-transaction that does not hold any undo log page
+			or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+{
+  ut_ad(undo->hdr_page_no != undo->last_page_no);
+  ut_ad(undo->size > 0);
+
+  dberr_t err;
+  const uint32_t page_to_free= undo->last_page_no;
+
+  undo->size--;
+  undo->last_page_no= trx_undo_free_page(undo->rseg, false,
+                                         undo->hdr_page_no, page_to_free,
+                                         mtr, &err);
+  return err;
+}
+
+/** Truncate the tail of an undo log during rollback.
+Each loop iteration runs in its own mini-transaction and examines the
+last page of the undo log from its last record backwards. If every
+record on the page has an undo number of at least limit and the page is
+not the header page, the page is freed and the loop continues with the
+new last page. Otherwise TRX_UNDO_PAGE_FREE of the page is moved back to
+the lowest examined record that is not below the limit, if any.
+@param[in,out] undo undo log
+@param[in] limit all undo logs after this limit will be discarded
+@param[in] is_temp whether this is temporary undo log
+@return error code */
+static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit,
+ bool is_temp)
+{
+ ut_ad(is_temp == !undo.rseg->is_persistent());
+
+ for (mtr_t mtr;;)
+ {
+ mtr.start();
+ if (is_temp)
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ /* First record on the last page that must be discarded, if any. */
+ trx_undo_rec_t *trunc_here= nullptr;
+ undo.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ dberr_t err;
+ buf_block_t *undo_block=
+ buf_page_get_gen(page_id_t{undo.rseg->space->id, undo.last_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
+ if (UNIV_UNLIKELY(!undo_block))
+ goto func_exit;
+
+ for (trx_undo_rec_t *rec=
+ trx_undo_page_get_last_rec(undo_block,
+ undo.hdr_page_no, undo.hdr_offset);
+ rec; )
+ {
+ /* Records below the limit are kept; stop scanning here. */
+ if (trx_undo_rec_get_undo_no(rec) < limit)
+ goto func_exit;
+ /* Truncate at least this record off, maybe more */
+ trunc_here= rec;
+ rec= trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no, undo.hdr_offset);
+ }
+
+ /* No record on this page is below the limit. A non-header page is
+ freed as a whole, and the scan restarts on the new last page. */
+ if (undo.last_page_no != undo.hdr_page_no)
+ {
+ err= trx_undo_free_last_page(&undo, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ undo.rseg->latch.wr_unlock();
+ mtr.commit();
+ continue;
+ }
+
+func_exit:
+ undo.rseg->latch.wr_unlock();
+
+ /* trunc_here is only non-null if undo_block was acquired. */
+ if (trunc_here && err == DB_SUCCESS)
+ mtr.write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_block->page.frame,
+ ulint(trunc_here - undo_block->page.frame));
+
+ mtr.commit();
+ return err;
+ }
+}
+
+/** Try to truncate the undo logs of a transaction.
+The persistent undo log is truncated first and then the temporary one,
+both down to trx.undo_no. The first failure is returned immediately.
+@param	trx	transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx)
+{
+  {
+    trx_undo_t *const undo= trx.rsegs.m_redo.undo;
+    if (undo)
+    {
+      ut_ad(undo->rseg == trx.rsegs.m_redo.rseg);
+      const dberr_t redo_err= trx_undo_truncate_end(*undo, trx.undo_no, false);
+      if (redo_err)
+        return redo_err;
+    }
+  }
+
+  {
+    trx_undo_t *const undo= trx.rsegs.m_noredo.undo;
+    if (undo)
+    {
+      ut_ad(undo->rseg == trx.rsegs.m_noredo.rseg);
+      const dberr_t temp_err= trx_undo_truncate_end(*undo, trx.undo_no, true);
+      if (temp_err)
+        return temp_err;
+    }
+  }
+
+  return DB_SUCCESS;
+}
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+Every pass of the loop uses a separate mini-transaction and handles the
+page that holds the first record of the undo log.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ mtr_t mtr;
+
+ /* With limit 0 there is nothing below the limit. */
+ if (!limit) {
+ return DB_SUCCESS;
+ }
+loop:
+ mtr_start(&mtr);
+
+ if (!rseg->is_persistent()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ dberr_t err;
+ const buf_block_t* undo_page;
+ rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+ RW_X_LATCH, undo_page, &mtr, &err);
+ if (rec == NULL) {
+ /* Already empty */
+done:
+ mtr.commit();
+ return err;
+ }
+
+ /* The page is only discarded if even its last record is below
+ the limit. */
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+ goto done;
+ }
+
+ if (undo_page->page.id().page_no() == hdr_page_no) {
+ /* Header page: move TRX_UNDO_LOG_START to the end of this
+ log, which is TRX_UNDO_NEXT_LOG if that is nonzero, or else
+ TRX_UNDO_PAGE_FREE. */
+ uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
+ + undo_page->page.frame);
+ if (end == 0) {
+ end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_page->page.frame);
+ }
+
+ mtr.write<2>(*undo_page, undo_page->page.frame + hdr_offset
+ + TRX_UNDO_LOG_START, end);
+ } else {
+ /* Any other page is freed as a whole. */
+ trx_undo_free_page(rseg, true, hdr_page_no,
+ undo_page->page.id().page_no(), &mtr, &err);
+ if (err != DB_SUCCESS) {
+ goto done;
+ }
+ }
+
+ mtr.commit();
+ goto loop;
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/** Read an undo log when starting up the database.
+Validates the undo segment header page (page type, last log header
+offset, segment state, TRX_ID and, where applicable, TRX_NO), raises
+rseg->needs_purge if necessary, creates the in-memory undo log object and
+adds it to rseg->undo_cached (state TRX_UNDO_CACHED) or rseg->undo_list
+(any other accepted state). The mini-transaction that latches the pages
+is committed on every exit path.
+@param[in,out]	rseg	rollback segment
+@param[in]	id	rollback segment slot
+@param[in]	page_no	undo log segment page number
+@return	the undo log
+@retval	nullptr	on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no)
+{
+  mtr_t mtr;
+  XID xid;
+
+  ut_ad(id < TRX_RSEG_N_SLOTS);
+
+  mtr.start();
+  const buf_block_t* block = buf_page_get(
+    page_id_t(rseg->space->id, page_no), 0, RW_X_LATCH, &mtr);
+  if (UNIV_UNLIKELY(!block)) {
+corrupted:
+    /* Common failure exit: release the latches and report an error. */
+    mtr.commit();
+    return nullptr;
+  }
+
+  const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+                                         + TRX_UNDO_PAGE_TYPE
+                                         + block->page.frame);
+  if (UNIV_UNLIKELY(type > 2)) {
+corrupted_type:
+    sql_print_error("InnoDB: unsupported undo header type %u",
+                    type);
+    goto corrupted;
+  }
+
+  /* The last undo log header must lie between the end of the undo
+  segment header and the end of the page. */
+  uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+                                     + block->page.frame);
+  if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
+      offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+    sql_print_error("InnoDB: invalid undo header offset %u",
+                    offset);
+    goto corrupted;
+  }
+
+  const trx_ulogf_t* const undo_header = block->page.frame + offset;
+  uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+                                    + block->page.frame);
+
+  const trx_id_t trx_id= mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+  if (trx_id >> 48) {
+    sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
+    goto corrupted;
+  }
+  /* We will increment rseg->needs_purge, like trx_undo_reuse_cached()
+  would do it, to avoid trouble on rollback or XA COMMIT. */
+  trx_id_t trx_no = trx_id + 1;
+
+  switch (state) {
+  case TRX_UNDO_ACTIVE:
+  case TRX_UNDO_PREPARED:
+    if (UNIV_LIKELY(type != 1)) {
+      break;
+    }
+    sql_print_error("InnoDB: upgrade from older version than"
+                    " MariaDB 10.3 requires clean shutdown");
+    goto corrupted;
+  default:
+    sql_print_error("InnoDB: unsupported undo header state %u",
+                    state);
+    goto corrupted;
+  case TRX_UNDO_CACHED:
+    if (UNIV_UNLIKELY(type != 0)) {
+      /* This undo page was not updated by MariaDB
+      10.3 or later. The TRX_UNDO_TRX_NO field may
+      contain garbage. */
+      break;
+    }
+    goto read_trx_no;
+  case TRX_UNDO_TO_PURGE:
+    if (UNIV_UNLIKELY(type == 1)) {
+      goto corrupted_type;
+    }
+  read_trx_no:
+    trx_no = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header);
+    if (trx_no >> 48) {
+      sql_print_error("InnoDB: corrupted TRX_NO %llx",
+                      trx_no);
+      goto corrupted;
+    }
+    if (trx_no < trx_id) {
+      trx_no = trx_id;
+    }
+  }
+
+  /* Read X/Open XA transaction identification if it exists, or
+  set it to NULL. */
+
+  if (undo_header[TRX_UNDO_XID_EXISTS]) {
+    trx_undo_read_xid(undo_header, &xid);
+  } else {
+    xid.null();
+  }
+
+  if (trx_no > rseg->needs_purge) {
+    rseg->needs_purge = trx_no;
+  }
+
+  trx_undo_t* undo = trx_undo_mem_create(
+    rseg, id, trx_id, &xid, page_no, offset);
+  if (!undo) {
+    /* Leave through the common failure exit, so that the
+    mini-transaction is committed and the page latch released
+    instead of returning with the mini-transaction still active. */
+    goto corrupted;
+  }
+
+  undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
+  undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+                            + block->page.frame);
+
+  fil_addr_t last_addr = flst_get_last(
+    TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
+
+  undo->last_page_no = last_addr.page;
+  undo->top_page_no = last_addr.page;
+
+  const buf_block_t* last = buf_page_get(
+    page_id_t(rseg->space->id, undo->last_page_no), 0,
+    RW_X_LATCH, &mtr);
+
+  if (UNIV_UNLIKELY(!last)) {
+    ut_free(undo);
+    goto corrupted;
+  }
+
+  /* Position the in-memory object at the last undo record, if any. */
+  if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+        last, page_no, offset)) {
+    undo->top_offset = static_cast<uint16_t>(
+      rec - last->page.frame);
+    undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+    ut_ad(!undo->empty());
+  } else {
+    undo->top_undo_no = IB_ID_MAX;
+    ut_ad(undo->empty());
+  }
+
+  undo->state = state;
+
+  if (state != TRX_UNDO_CACHED) {
+    UT_LIST_ADD_LAST(rseg->undo_list, undo);
+  } else {
+    UT_LIST_ADD_LAST(rseg->undo_cached, undo);
+  }
+
+  mtr.commit();
+  return undo;
+}
+
+/** Create and initialize an undo log memory object.
+The object starts out in state TRX_UNDO_ACTIVE, empty, with a size of one
+page, and with the header page as both its last and its top page.
+@param[in]	rseg	rollback segment memory object
+@param[in]	id	slot index within rseg
+@param[in]	trx_id	id of the trx for which the undo log is created
+@param[in]	xid	X/Open transaction identification
+@param[in]	page_no	undo log header page number
+@param[in]	offset	undo log header byte offset on page
+@return own: the undo log memory object
+@retval NULL if the memory could not be allocated */
+static
+trx_undo_t*
+trx_undo_mem_create(
+  trx_rseg_t* rseg,
+  ulint id,
+  trx_id_t trx_id,
+  const XID* xid,
+  uint32_t page_no,
+  uint16_t offset)
+{
+  ut_a(id < TRX_RSEG_N_SLOTS);
+
+  trx_undo_t* const undo = static_cast<trx_undo_t*>(
+    ut_malloc_nokey(sizeof(trx_undo_t)));
+
+  if (!undo) {
+    return nullptr;
+  }
+
+  /* Identity and owner of the undo log */
+  undo->id = id;
+  undo->state = TRX_UNDO_ACTIVE;
+  undo->trx_id = trx_id;
+  undo->xid = *xid;
+  undo->dict_operation = FALSE;
+  undo->rseg = rseg;
+
+  /* A new undo log consists of the header page only. */
+  undo->hdr_page_no = page_no;
+  undo->hdr_offset = offset;
+  undo->last_page_no = page_no;
+  undo->size = 1;
+
+  /* There are no undo records yet. */
+  undo->top_undo_no = IB_ID_MAX;
+  undo->top_page_no = page_no;
+  undo->guess_block = NULL;
+  ut_ad(undo->empty());
+
+  return undo;
+}
+
+/** Initialize a cached undo log object for new use.
+Only the state, transaction id, XID, dict_operation flag, header offset
+and top_undo_no are reset; all other fields are left unchanged.
+@param[in,out]	undo	undo log to init
+@param[in]	trx_id	id of the trx for which the undo log is created
+@param[in]	xid	X/Open XA transaction identification
+@param[in]	offset	undo log header byte offset on page */
+static
+void
+trx_undo_mem_init_for_reuse(
+  trx_undo_t* undo,
+  trx_id_t trx_id,
+  const XID* xid,
+  uint16_t offset)
+{
+  ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+  /* New owner of the undo log */
+  undo->trx_id = trx_id;
+  undo->xid = *xid;
+  undo->state = TRX_UNDO_ACTIVE;
+  undo->dict_operation = FALSE;
+
+  /* New log header; no undo records yet */
+  undo->hdr_offset = offset;
+  undo->top_undo_no = IB_ID_MAX;
+  ut_ad(undo->empty());
+}
+
+/** Create an undo log.
+Acquires the rollback segment header, creates a new undo log segment and
+undo log header in it, and allocates the in-memory undo log object.
+rseg->curr_size is incremented once the segment has been created.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] undo undo log object
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ulint id;
+ /* block is first the rollback segment header and then, on success,
+ the header page of the new undo log segment. */
+ buf_block_t* block = rseg->get(mtr, err);
+
+ if (block) {
+ block = trx_undo_seg_create(rseg->space, block, &id, err, mtr);
+ }
+
+ if (!block) {
+ return NULL;
+ }
+
+ rseg->curr_size++;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ *undo = trx_undo_mem_create(rseg, id, trx->id, &trx->xid,
+ block->page.id().page_no(), offset);
+ if (*undo == NULL) {
+ *err = DB_OUT_OF_MEMORY;
+ /* FIXME: this will not free the undo block to the file */
+ return NULL;
+ } else if (rseg != trx->rsegs.m_redo.rseg) {
+ /* The dict_operation handling below is only done for the
+ rollback segment trx->rsegs.m_redo.rseg. */
+ return block;
+ }
+
+ if (trx->dict_operation) {
+ (*undo)->dict_operation = true;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
+ }
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/** Reuse a cached undo log block.
+Takes the first entry of rseg->undo_cached, if there is one, writes a new
+undo log header for the transaction on its header page and reinitializes
+the memory object. rseg->needs_purge is raised to at least trx->id + 1
+even if no cached undo log is available.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] pundo the undo log memory object
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the undo log block
+@retval NULL if none cached, or if the header page could not be acquired */
+static
+buf_block_t*
+trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
+ mtr_t* mtr, dberr_t *err)
+{
+ ut_ad(rseg->is_persistent());
+ ut_ad(rseg->is_referenced());
+ ut_ad(rseg == trx->rsegs.m_redo.rseg);
+
+ if (rseg->needs_purge <= trx->id) {
+ /* trx_purge_truncate_history() checks
+ purge_sys.sees(rseg.needs_purge)
+ so we need to compensate for that.
+ The rseg->needs_purge after crash
+ recovery would be at least trx->id + 1,
+ because that is the minimum possible value
+ assigned by trx_serialise() on commit. */
+ rseg->needs_purge = trx->id + 1;
+ }
+
+ trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ if (!undo) {
+ return NULL;
+ }
+
+ ut_ad(undo->size == 1);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get_gen(page_id_t(undo->rseg->space->id,
+ undo->hdr_page_no),
+ 0, RW_X_LATCH, nullptr, BUF_GET,
+ mtr, err);
+ if (!block) {
+ /* The undo log stays in rseg->undo_cached in this case. */
+ return NULL;
+ }
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+
+ *pundo = undo;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ trx_undo_mem_init_for_reuse(undo, trx->id, &trx->xid, offset);
+
+ if (trx->dict_operation) {
+ undo->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
+ }
+
+ return block;
+}
+
+/** Assign an undo log for a persistent transaction.
+If the transaction already has a persistent undo log, its last page is
+returned. Otherwise a cached undo log is reused or, failing that, a new
+one is created, and the undo log is added to the front of
+rseg->undo_list while rseg->latch is held exclusively.
+@param[in,out]	trx	transaction
+@param[out]	err	error code
+@param[in,out]	mtr	mini-transaction
+@return	the undo log block
+@retval	NULL	on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+{
+  ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+  if (trx_undo_t* const assigned = trx->rsegs.m_redo.undo) {
+    const page_id_t last_page_id(assigned->rseg->space->id,
+                                 assigned->last_page_no);
+    return buf_page_get_gen(last_page_id, 0, RW_X_LATCH,
+                            assigned->guess_block, BUF_GET,
+                            mtr, err);
+  }
+
+  *err = DB_SUCCESS;
+  trx_rseg_t* const rseg = trx->rsegs.m_redo.rseg;
+
+  rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+  /* Prefer a cached undo log; create a new one only if none
+  could be reused. */
+  buf_block_t* block = trx_undo_reuse_cached(
+    trx, rseg, &trx->rsegs.m_redo.undo, mtr, err);
+
+  if (!block) {
+    block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo,
+                            err, mtr);
+    ut_ad(!block == (*err != DB_SUCCESS));
+  }
+
+  if (block) {
+    UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
+  }
+
+  rseg->latch.wr_unlock();
+  return block;
+}
+
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+If *undo is already set, the last page of that undo log is returned.
+Cached undo logs are only considered when is_temp is false.
+@tparam is_temp whether this is temporary undo log
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the undo log block
+@retval nullptr on error */
+template<bool is_temp>
+buf_block_t*
+trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err)
+{
+ ut_ad(is_temp == (rseg == trx->rsegs.m_noredo.rseg));
+ ut_ad(is_temp || rseg == trx->rsegs.m_redo.rseg);
+ ut_ad(undo == (is_temp
+ ? &trx->rsegs.m_noredo.undo
+ : &trx->rsegs.m_redo.undo));
+ ut_ad(mtr->get_log_mode()
+ == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+
+ if (*undo) {
+ return buf_page_get_gen(
+ page_id_t(rseg->space->id, (*undo)->last_page_no),
+ 0, RW_X_LATCH, (*undo)->guess_block,
+ BUF_GET, mtr, err);
+ }
+
+ /* Debug-only fault injection: pretend that no undo slot is free. */
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
+ );
+
+ *err = DB_SUCCESS;
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ buf_block_t* block;
+ if (is_temp) {
+ /* The cache of a temporary rollback segment is expected to
+ be empty, so there is nothing to reuse. */
+ ut_ad(!UT_LIST_GET_LEN(rseg->undo_cached));
+ } else {
+ block = trx_undo_reuse_cached(trx, rseg, undo, mtr, err);
+ if (block) {
+ goto got_block;
+ }
+ }
+ block = trx_undo_create(trx, rseg, undo, err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+
+got_block:
+ UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
+
+func_exit:
+ rseg->latch.wr_unlock();
+ return block;
+}
+
+/* Explicit instantiations of trx_undo_assign_low() for both values of
+is_temp: false (persistent undo log) and true (temporary undo log). */
+template buf_block_t*
+trx_undo_assign_low<false>(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err);
+template buf_block_t*
+trx_undo_assign_low<true>(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err);
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+For XA ROLLBACK, only the on-page TRX_UNDO_STATE is set back to
+TRX_UNDO_ACTIVE. For XA PREPARE, undo->state becomes TRX_UNDO_PREPARED,
+undo->xid is copied from trx->xid, and the state, the TRX_UNDO_XID_EXISTS
+flag and the XID are written to the undo log header page, which is
+x-latched in mtr. Nothing is returned; if the header page cannot be
+acquired, the function returns early for a rollback and fails an
+assertion otherwise.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ /* In case of !rollback the undo header page
+ corruption would leave the transaction object in an
+ unexpected (active) state. */
+ ut_a(rollback);
+ return;
+ }
+
+ if (rollback) {
+ /* Only the page is updated here; undo->state is not changed. */
+ ut_ad(undo->state == TRX_UNDO_PREPARED);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame, TRX_UNDO_ACTIVE);
+ return;
+ }
+
+ /*------------------------------*/
+ ut_ad(undo->state == TRX_UNDO_ACTIVE);
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = trx->xid;
+ /*------------------------------*/
+
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame, undo->state);
+ /* The XID belongs to the last undo log header on the page. */
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->page.frame);
+ mtr->write<1>(*block, block->page.frame + offset + TRX_UNDO_XID_EXISTS,
+ 1U);
+
+ trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
+
+/** At shutdown, free the undo log memory objects of a transaction.
+Each undo log that exists is removed from the undo_list of its rollback
+segment, freed, and the pointer in trx is reset. A temporary undo log
+must be in state TRX_UNDO_PREPARED.
+@param[in,out]	trx	transaction */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+  if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+    switch (undo->state) {
+    case TRX_UNDO_CACHED:
+    case TRX_UNDO_TO_PURGE:
+      ut_ad(trx_state_eq(trx,
+                         TRX_STATE_COMMITTED_IN_MEMORY));
+      /* fall through */
+    case TRX_UNDO_ACTIVE:
+      /* trx_t::commit_state() assigns
+      trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+      ut_a(!srv_was_started
+           || srv_read_only_mode
+           || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+           || srv_fast_shutdown);
+      /* fall through */
+    case TRX_UNDO_PREPARED:
+      /* Nothing to check for a prepared transaction. */
+      break;
+    default:
+      ut_error;
+    }
+
+    UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+    ut_free(undo);
+    undo = nullptr;
+  }
+
+  if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+    ut_a(undo->state == TRX_UNDO_PREPARED);
+
+    UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+    ut_free(undo);
+    undo = nullptr;
+  }
+}