| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:04:16 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:04:16 +0000 |
| commit | a68fb2d8219f6bccc573009600e9f23e89226a5e (patch) | |
| tree | d742d35d14ae816e99293d2b01face30e9f3a46b | /storage/innobase/trx |
| parent | Initial commit. (diff) | |
| download | mariadb-10.6-a68fb2d8219f6bccc573009600e9f23e89226a5e.tar.xz, mariadb-10.6-a68fb2d8219f6bccc573009600e9f23e89226a5e.zip | |
Adding upstream version 1:10.6.11.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/trx')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | storage/innobase/trx/trx0i_s.cc | 1471 |
| -rw-r--r-- | storage/innobase/trx/trx0purge.cc | 1416 |
| -rw-r--r-- | storage/innobase/trx/trx0rec.cc | 2426 |
| -rw-r--r-- | storage/innobase/trx/trx0roll.cc | 927 |
| -rw-r--r-- | storage/innobase/trx/trx0rseg.cc | 713 |
| -rw-r--r-- | storage/innobase/trx/trx0sys.cc | 357 |
| -rw-r--r-- | storage/innobase/trx/trx0trx.cc | 2180 |
| -rw-r--r-- | storage/innobase/trx/trx0undo.cc | 1581 |
8 files changed, 11071 insertions(+), 0 deletions(-)
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc new file mode 100644 index 00000000..2dc39118 --- /dev/null +++ b/storage/innobase/trx/trx0i_s.cc @@ -0,0 +1,1471 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0i_s.cc +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables fetch code. + +The code below fetches information needed to fill those +3 dynamic tables and uploads it into a "transactions +table cache" for later retrieval. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +#include "trx0i_s.h" +#include "buf0buf.h" +#include "dict0dict.h" +#include "ha0storage.h" +#include "hash0hash.h" +#include "lock0iter.h" +#include "lock0lock.h" +#include "mem0mem.h" +#include "page0page.h" +#include "rem0rec.h" +#include "row0row.h" +#include "srv0srv.h" +#include "trx0sys.h" +#include "que0que.h" +#include "trx0purge.h" +#include "sql_class.h" + +/** Initial number of rows in the table cache */ +#define TABLE_CACHE_INITIAL_ROWSNUM 1024 + +/** @brief The maximum number of chunks to allocate for a table cache. + +The rows of a table cache are stored in a set of chunks. When a new +row is added a new chunk is allocated if necessary. Assuming that the +first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each +subsequent is N/2 where N is the number of rows we have allocated till +now, then 39th chunk would accommodate 1677416425 rows and all chunks +would accommodate 3354832851 rows. */ +#define MEM_CHUNKS_IN_TABLE_CACHE 39 + +/** The following are some testing auxiliary macros. Do not enable them +in a production environment. */ +/* @{ */ + +#if 0 +/** If this is enabled then lock folds will always be different +resulting in equal rows being put in a different cells of the hash +table. Checking for duplicates will be flawed because different +fold will be calculated when a row is searched in the hash table. */ +#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT +#endif + +#if 0 +/** This effectively kills the search-for-duplicate-before-adding-a-row +function, but searching in the hash is still performed. It will always +be assumed that lock is not present and insertion will be performed in +the hash table. */ +#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T +#endif + +#if 0 +/** This aggressively repeats adding each row many times. Depending on +the above settings this may be noop or may result in lots of rows being +added. 
*/ +#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES +#endif + +#if 0 +/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash +table search is not performed at all. */ +#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS +#endif + +#if 0 +/** Do not insert each row into the hash table, duplicates may appear +if this is enabled, also if this is enabled searching into the hash is +noop because it will be empty. */ +#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE +#endif +/* @} */ + +/** Memory limit passed to ha_storage_put_memlim(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_STORAGE(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd) + +/** Memory limit in table_cache_create_empty_row(). +@param cache hash storage +@return maximum allowed allocation size */ +#define MAX_ALLOWED_FOR_ALLOC(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd \ + - ha_storage_get_size((cache)->storage)) + +/** Memory for each table in the intermediate buffer is allocated in +separate chunks. These chunks are considered to be concatenated to +represent one flat array of rows. */ +struct i_s_mem_chunk_t { + ulint offset; /*!< offset, in number of rows */ + ulint rows_allocd; /*!< the size of this chunk, in number + of rows */ + void* base; /*!< start of the chunk */ +}; + +/** This represents one table's cache. */ +struct i_s_table_cache_t { + ulint rows_used; /*!< number of used rows */ + ulint rows_allocd; /*!< number of allocated rows */ + ulint row_size; /*!< size of a single row */ + i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of + memory chunks that stores the + rows */ +}; + +/** This structure describes the intermediate buffer */ +struct trx_i_s_cache_t { + srw_lock rw_lock; /*!< read-write lock protecting this */ + Atomic_relaxed<ulonglong> last_read; + /*!< last time the cache was read; + measured in nanoseconds */ + i_s_table_cache_t innodb_trx; /*!< innodb_trx table */ + i_s_table_cache_t innodb_locks; /*!< innodb_locks table */ + i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */ +/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */ +#define LOCKS_HASH_CELLS_NUM 10000 + hash_table_t locks_hash; /*!< hash table used to eliminate + duplicate entries in the + innodb_locks table */ +/** Initial size of the cache storage */ +#define CACHE_STORAGE_INITIAL_SIZE 1024 +/** Number of hash cells in the cache storage */ +#define CACHE_STORAGE_HASH_CELLS 2048 + ha_storage_t* storage; /*!< storage for external volatile + data that may become unavailable + when we release + lock_sys.latch */ + ulint mem_allocd; /*!< the amount of memory + allocated with mem_alloc*() */ + bool is_truncated; /*!< this is true if the memory + limit was hit and thus the data + in the cache is truncated */ +}; + +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +static trx_i_s_cache_t trx_i_s_cache_static; +/** This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static; + +/** @return the heap number of a record lock +@retval 0xFFFF for table locks */ +static uint16_t wait_lock_get_heap_no(const lock_t *lock) +{ + return !lock->is_table() + ? 
static_cast<uint16_t>(lock_rec_find_set_bit(lock)) + : uint16_t{0xFFFF}; +} + +/*******************************************************************//** +Initializes the members of a table cache. */ +static +void +table_cache_init( +/*=============*/ + i_s_table_cache_t* table_cache, /*!< out: table cache */ + size_t row_size) /*!< in: the size of a + row */ +{ + ulint i; + + table_cache->rows_used = 0; + table_cache->rows_allocd = 0; + table_cache->row_size = row_size; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + table_cache->chunks[i].base = NULL; + } +} + +/*******************************************************************//** +Frees a table cache. */ +static +void +table_cache_free( +/*=============*/ + i_s_table_cache_t* table_cache) /*!< in/out: table cache */ +{ + ulint i; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + if (table_cache->chunks[i].base) { + ut_free(table_cache->chunks[i].base); + table_cache->chunks[i].base = NULL; + } + } +} + +/*******************************************************************//** +Returns an empty row from a table cache. The row is allocated if no more +empty rows are available. The number of used rows is incremented. +If the memory limit is hit then NULL is returned and nothing is +allocated. +@return empty row, or NULL if out of memory */ +static +void* +table_cache_create_empty_row( +/*=========================*/ + i_s_table_cache_t* table_cache, /*!< in/out: table cache */ + trx_i_s_cache_t* cache) /*!< in/out: cache to record + how many bytes are + allocated */ +{ + ulint i; + void* row; + + ut_a(table_cache->rows_used <= table_cache->rows_allocd); + + if (table_cache->rows_used == table_cache->rows_allocd) { + + /* rows_used == rows_allocd means that new chunk needs + to be allocated: either no more empty rows in the + last allocated chunk or nothing has been allocated yet + (rows_num == rows_allocd == 0); */ + + i_s_mem_chunk_t* chunk; + ulint req_bytes; + ulint got_bytes; + ulint req_rows; + ulint got_rows; + + /* find the first not allocated chunk */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].base == NULL) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + have been allocated :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + /* allocate the chunk we just found */ + + if (i == 0) { + + /* first chunk, nothing is allocated yet */ + req_rows = TABLE_CACHE_INITIAL_ROWSNUM; + } else { + + /* Memory is increased by the formula + new = old + old / 2; We are trying not to be + aggressive here (= using the common new = old * 2) + because the allocated memory will not be freed + until InnoDB exit (it is reused). So it is better + to once allocate the memory in more steps, but + have less unused/wasted memory than to use less + steps in allocation (which is done once in a + lifetime) but end up with lots of unused/wasted + memory. 
*/ + req_rows = table_cache->rows_allocd / 2; + } + req_bytes = req_rows * table_cache->row_size; + + if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) { + + return(NULL); + } + + chunk = &table_cache->chunks[i]; + + got_bytes = req_bytes; + chunk->base = ut_malloc_nokey(req_bytes); + + got_rows = got_bytes / table_cache->row_size; + + cache->mem_allocd += got_bytes; + +#if 0 + printf("allocating chunk %d req bytes=%lu, got bytes=%lu," + " row size=%lu," + " req rows=%lu, got rows=%lu\n", + i, req_bytes, got_bytes, + table_cache->row_size, + req_rows, got_rows); +#endif + + chunk->rows_allocd = got_rows; + + table_cache->rows_allocd += got_rows; + + /* adjust the offset of the next chunk */ + if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) { + + table_cache->chunks[i + 1].offset + = chunk->offset + chunk->rows_allocd; + } + + /* return the first empty row in the newly allocated + chunk */ + row = chunk->base; + } else { + + char* chunk_start; + ulint offset; + + /* there is an empty row, no need to allocate new + chunks */ + + /* find the first chunk that contains allocated but + empty/unused rows */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd + > table_cache->rows_used) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + are full, but + table_cache->rows_used != table_cache->rows_allocd means + exactly the opposite - there are allocated but + empty/unused rows :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + chunk_start = (char*) table_cache->chunks[i].base; + offset = table_cache->rows_used + - table_cache->chunks[i].offset; + + row = chunk_start + offset * table_cache->row_size; + } + + table_cache->rows_used++; + + return(row); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Validates a row in the locks cache. +@return TRUE if valid */ +static +ibool +i_s_locks_row_validate( +/*===================*/ + const i_s_locks_row_t* row) /*!< in: row to validate */ +{ + ut_ad(row->lock_mode); + ut_ad(row->lock_table != NULL); + ut_ad(row->lock_table_id != 0); + + if (!row->lock_index) { + /* table lock */ + ut_ad(!row->lock_data); + ut_ad(row->lock_page == page_id_t(0, 0)); + ut_ad(!row->lock_rec); + } else { + /* record lock */ + /* row->lock_data == NULL if buf_page_try_get() == NULL */ + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Fills i_s_trx_row_t object. +If memory can not be allocated then FALSE is returned. 
+@return FALSE if allocation fails */ +static +ibool +fill_trx_row( +/*=========*/ + i_s_trx_row_t* row, /*!< out: result object + that's filled */ + const trx_t* trx, /*!< in: transaction to + get data from */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + corresponding row in + innodb_locks if trx is + waiting or NULL if trx + is not waiting */ + trx_i_s_cache_t* cache) /*!< in/out: cache into + which to copy volatile + strings */ +{ + const char* s; + + lock_sys.assert_locked(); + + const lock_t* wait_lock = trx->lock.wait_lock; + + row->trx_id = trx->id; + row->trx_started = trx->start_time; + if (trx->in_rollback) { + row->trx_state = "ROLLING BACK"; + } else if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY) { + row->trx_state = "COMMITTING"; + } else if (wait_lock) { + row->trx_state = "LOCK WAIT"; + } else { + row->trx_state = "RUNNING"; + } + + row->requested_lock_row = requested_lock_row; + ut_ad(requested_lock_row == NULL + || i_s_locks_row_validate(requested_lock_row)); + + ut_ad(!wait_lock == !requested_lock_row); + + const my_hrtime_t suspend_time= trx->lock.suspend_time; + row->trx_wait_started = wait_lock ? hrtime_to_time(suspend_time) : 0; + + row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx)); + + if (trx->mysql_thd == NULL) { + /* For internal transactions e.g., purge and transactions + being recovered at startup there is no associated MySQL + thread data structure. */ + row->trx_mysql_thread_id = 0; + row->trx_query = NULL; + goto thd_done; + } + + row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd); + + char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1]; + if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query, + sizeof query)) { + row->trx_query = static_cast<const char*>( + ha_storage_put_memlim( + cache->storage, query, stmt_len + 1, + MAX_ALLOWED_FOR_STORAGE(cache))); + + row->trx_query_cs = thd_charset(trx->mysql_thd); + + if (row->trx_query == NULL) { + + return(FALSE); + } + } else { + + row->trx_query = NULL; + } + +thd_done: + row->trx_operation_state = trx->op_info; + + row->trx_tables_in_use = trx->n_mysql_tables_in_use; + + row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock); + + /* These are protected by lock_sys.latch (which we are holding) + and sometimes also trx->mutex. */ + + row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks); + + row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap); + + row->trx_rows_locked = trx->lock.n_rec_locks; + + row->trx_rows_modified = trx->undo_no; + + row->trx_isolation_level = trx->isolation_level; + + row->trx_unique_checks = (ibool) trx->check_unique_secondary; + + row->trx_foreign_key_checks = (ibool) trx->check_foreigns; + + s = trx->detailed_error; + + if (s != NULL && s[0] != '\0') { + + TRX_I_S_STRING_COPY(s, + row->trx_foreign_key_error, + TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache); + + if (row->trx_foreign_key_error == NULL) { + + return(FALSE); + } + } else { + row->trx_foreign_key_error = NULL; + } + + row->trx_is_read_only = trx->read_only; + + row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking(); + + return(TRUE); +} + +/*******************************************************************//** +Format the nth field of "rec" and put it in "buf". The result is always +NUL-terminated. Returns the number of bytes that were written to "buf" +(including the terminating NUL). 
+@return end of the result */ +static +ulint +put_nth_field( +/*==========*/ + char* buf, /*!< out: buffer */ + ulint buf_size,/*!< in: buffer size in bytes */ + ulint n, /*!< in: number of field */ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets)/*!< in: record offsets, returned + by rec_get_offsets() */ +{ + const byte* data; + ulint data_len; + dict_field_t* dict_field; + ulint ret; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (buf_size == 0) { + + return(0); + } + + ret = 0; + + if (n > 0) { + /* we must append ", " before the actual data */ + + if (buf_size < 3) { + + buf[0] = '\0'; + return(1); + } + + memcpy(buf, ", ", 3); + + buf += 2; + buf_size -= 2; + ret += 2; + } + + /* now buf_size >= 1 */ + + data = rec_get_nth_field(rec, offsets, n, &data_len); + + dict_field = dict_index_get_nth_field(index, n); + + ret += row_raw_format((const char*) data, data_len, + dict_field, buf, buf_size); + + return(ret); +} + +/*******************************************************************//** +Fills the "lock_data" member of i_s_locks_row_t object. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +fill_lock_data( +/*===========*/ + const char** lock_data,/*!< out: "lock_data" to fill */ + const lock_t* lock, /*!< in: lock used to find the data */ + ulint heap_no,/*!< in: rec num used to find the data */ + trx_i_s_cache_t* cache) /*!< in/out: cache where to store + volatile data */ +{ + ut_a(!lock->is_table()); + + switch (heap_no) { + case PAGE_HEAP_NO_INFIMUM: + case PAGE_HEAP_NO_SUPREMUM: + *lock_data = ha_storage_put_str_memlim( + cache->storage, + heap_no == PAGE_HEAP_NO_INFIMUM + ? "infimum pseudo-record" + : "supremum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + return(*lock_data != NULL); + } + + mtr_t mtr; + + const buf_block_t* block; + const page_t* page; + const rec_t* rec; + ulint n_fields; + mem_heap_t* heap; + rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets; + char buf[TRX_I_S_LOCK_DATA_MAX_LEN]; + ulint buf_used; + ulint i; + + mtr_start(&mtr); + + block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr); + + if (block == NULL) { + + *lock_data = NULL; + + mtr_commit(&mtr); + + return(TRUE); + } + + page = reinterpret_cast<const page_t*>(buf_block_get_frame(block)); + + rec_offs_init(offsets_onstack); + offsets = offsets_onstack; + + rec = page_find_rec_with_heap_no(page, heap_no); + + const dict_index_t* index = lock->index; + ut_ad(index->is_primary() || !dict_index_is_online_ddl(index)); + + n_fields = dict_index_get_n_unique(index); + + ut_a(n_fields > 0); + + heap = NULL; + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + n_fields, &heap); + + /* format and store the data */ + + buf_used = 0; + for (i = 0; i < n_fields; i++) { + + buf_used += put_nth_field( + buf + buf_used, sizeof(buf) - buf_used, + i, index, rec, offsets) - 1; + } + + *lock_data = (const char*) ha_storage_put_memlim( + cache->storage, buf, buf_used + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + + if (heap != NULL) { + + /* this means that rec_get_offsets() has created a new + heap and has stored offsets in it; check that this is + really the case and free the heap */ + ut_a(offsets != offsets_onstack); + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + if (*lock_data == NULL) { + + return(FALSE); + } + + return(TRUE); +} + +/** @return the table of a lock */ +static const dict_table_t *lock_get_table(const lock_t 
&lock) +{ + if (lock.is_table()) + return lock.un_member.tab_lock.table; + ut_ad(lock.index->is_primary() || !dict_index_is_online_ddl(lock.index)); + return lock.index->table; +} + +/*******************************************************************//** +Fills i_s_locks_row_t object. Returns its first argument. +If memory can not be allocated then FALSE is returned. +@return false if allocation fails */ +static bool fill_locks_row( + i_s_locks_row_t* row, /*!< out: result object that's filled */ + const lock_t* lock, /*!< in: lock to get data from */ + uint16_t heap_no,/*!< in: lock's record number + or 0 if the lock + is a table lock */ + trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy + volatile strings */ +{ + row->lock_trx_id = lock->trx->id; + const bool is_gap_lock = lock->is_gap(); + ut_ad(!is_gap_lock || !lock->is_table()); + switch (lock->mode()) { + case LOCK_S: + row->lock_mode = uint8_t(1 + is_gap_lock); + break; + case LOCK_X: + row->lock_mode = uint8_t(3 + is_gap_lock); + break; + case LOCK_IS: + row->lock_mode = uint8_t(5 + is_gap_lock); + break; + case LOCK_IX: + row->lock_mode = uint8_t(7 + is_gap_lock); + break; + case LOCK_AUTO_INC: + row->lock_mode = 9; + break; + default: + ut_ad("unknown lock mode" == 0); + row->lock_mode = 0; + } + + const dict_table_t* table= lock_get_table(*lock); + + row->lock_table = ha_storage_put_str_memlim( + cache->storage, table->name.m_name, + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_table == NULL) { + + return false; + } + + if (!lock->is_table()) { + row->lock_index = ha_storage_put_str_memlim( + cache->storage, lock->index->name, + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_index == NULL) { + + return false; + } + + row->lock_page = lock->un_member.rec_lock.page_id; + row->lock_rec = heap_no; + + if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) { + + /* memory could not be allocated */ + return false; + } + } else { + row->lock_index = NULL; + + row->lock_page = page_id_t(0, 0); + row->lock_rec = 0; + + row->lock_data = NULL; + } + + row->lock_table_id = table->id; + + row->hash_chain.value = row; + ut_ad(i_s_locks_row_validate(row)); + + return true; +} + +/*******************************************************************//** +Fills i_s_lock_waits_row_t object. Returns its first argument. +@return result object that's filled */ +static +i_s_lock_waits_row_t* +fill_lock_waits_row( +/*================*/ + i_s_lock_waits_row_t* row, /*!< out: result object + that's filled */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + ut_ad(i_s_locks_row_validate(requested_lock_row)); + ut_ad(i_s_locks_row_validate(blocking_lock_row)); + + row->requested_lock_row = requested_lock_row; + row->blocking_lock_row = blocking_lock_row; + + return(row); +} + +/*******************************************************************//** +Calculates a hash fold for a lock. For a record lock the fold is +calculated from 4 elements, which uniquely identify a lock at a given +point in time: transaction id, space id, page number, record number. +For a table lock the fold is table's id. 
+@return fold */ +static +ulint +fold_lock( +/*======*/ + const lock_t* lock, /*!< in: lock object to fold */ + ulint heap_no)/*!< in: lock's record number + or 0xFFFF if the lock + is a table lock */ +{ +#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT + static ulint fold = 0; + + return(fold++); +#else + ulint ret; + + if (!lock->is_table()) { + ut_a(heap_no != 0xFFFF); + ret = ut_fold_ulint_pair((ulint) lock->trx->id, + lock->un_member.rec_lock.page_id. + fold()); + ret = ut_fold_ulint_pair(ret, heap_no); + } else { + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == 0xFFFF); + + ret = (ulint) lock_get_table(*lock)->id; + } + + return(ret); +#endif +} + +/*******************************************************************//** +Checks whether i_s_locks_row_t object represents a lock_t object. +@return TRUE if they match */ +static +ibool +locks_row_eq_lock( +/*==============*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + const lock_t* lock, /*!< in: lock object */ + ulint heap_no)/*!< in: lock's record number + or 0xFFFF if the lock + is a table lock */ +{ + ut_ad(i_s_locks_row_validate(row)); +#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T + return(0); +#else + if (!lock->is_table()) { + ut_a(heap_no != 0xFFFF); + + return(row->lock_trx_id == lock->trx->id + && row->lock_page == lock->un_member.rec_lock.page_id + && row->lock_rec == heap_no); + } else { + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == 0xFFFF); + + return(row->lock_trx_id == lock->trx->id + && row->lock_table_id == lock_get_table(*lock)->id); + } +#endif +} + +/*******************************************************************//** +Searches for a row in the innodb_locks cache that has a specified id. +This happens in O(1) time since a hash table is used. Returns pointer to +the row or NULL if none is found. +@return row or NULL */ +static +i_s_locks_row_t* +search_innodb_locks( +/*================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + const lock_t* lock, /*!< in: lock to search for */ + uint16_t heap_no)/*!< in: lock's record number + or 0xFFFF if the lock + is a table lock */ +{ + i_s_hash_chain_t* hash_chain; + + HASH_SEARCH( + /* hash_chain->"next" */ + next, + /* the hash table */ + &cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* the type of the next variable */ + i_s_hash_chain_t*, + /* auxiliary variable */ + hash_chain, + /* assertion on every traversed item */ + ut_ad(i_s_locks_row_validate(hash_chain->value)), + /* this determines if we have found the lock */ + locks_row_eq_lock(hash_chain->value, lock, heap_no)); + + if (hash_chain == NULL) { + + return(NULL); + } + /* else */ + + return(hash_chain->value); +} + +/*******************************************************************//** +Adds new element to the locks cache, enlarging it if necessary. +Returns a pointer to the added row. If the row is already present then +no row is added and a pointer to the existing row is returned. +If row can not be allocated then NULL is returned. 
+@return row */ +static +i_s_locks_row_t* +add_lock_to_cache( +/*==============*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const lock_t* lock, /*!< in: the element to add */ + uint16_t heap_no)/*!< in: lock's record number + or 0 if the lock + is a table lock */ +{ + i_s_locks_row_t* dst_row; + +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + ulint i; + for (i = 0; i < 10000; i++) { +#endif +#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS + /* quit if this lock is already present */ + dst_row = search_innodb_locks(cache, lock, heap_no); + if (dst_row != NULL) { + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); + } +#endif + + dst_row = (i_s_locks_row_t*) + table_cache_create_empty_row(&cache->innodb_locks, cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(NULL); + } + + if (!fill_locks_row(dst_row, lock, heap_no, cache)) { + + /* memory could not be allocated */ + cache->innodb_locks.rows_used--; + return(NULL); + } + +#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE + HASH_INSERT( + /* the type used in the hash chain */ + i_s_hash_chain_t, + /* hash_chain->"next" */ + next, + /* the hash table */ + &cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* add this data to the hash */ + &dst_row->hash_chain); +#endif +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + } /* for()-loop */ +#endif + + ut_ad(i_s_locks_row_validate(dst_row)); + return(dst_row); +} + +/*******************************************************************//** +Adds new pair of locks to the lock waits cache. +If memory can not be allocated then FALSE is returned. +@return FALSE if allocation fails */ +static +ibool +add_lock_wait_to_cache( +/*===================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + i_s_lock_waits_row_t* dst_row; + + dst_row = (i_s_lock_waits_row_t*) + table_cache_create_empty_row(&cache->innodb_lock_waits, + cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(FALSE); + } + + fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row); + + return(TRUE); +} + +/*******************************************************************//** +Adds transaction's relevant (important) locks to cache. +If the transaction is waiting, then the wait lock is added to +innodb_locks and a pointer to the added row is returned in +requested_lock_row, otherwise requested_lock_row is set to NULL. +If rows can not be allocated then FALSE is returned and the value of +requested_lock_row is undefined. +@return FALSE if allocation fails */ +static +ibool +add_trx_relevant_locks_to_cache( +/*============================*/ + trx_i_s_cache_t* cache, /*!< in/out: cache */ + const trx_t* trx, /*!< in: transaction */ + i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the + requested lock row, or NULL or + undefined */ +{ + lock_sys.assert_locked(); + + /* If transaction is waiting we add the wait lock and all locks + from another transactions that are blocking the wait lock. 
*/ + if (const lock_t *wait_lock = trx->lock.wait_lock) { + + const lock_t* curr_lock; + i_s_locks_row_t* blocking_lock_row; + lock_queue_iterator_t iter; + + uint16_t wait_lock_heap_no + = wait_lock_get_heap_no(wait_lock); + + /* add the requested lock */ + *requested_lock_row = add_lock_to_cache(cache, wait_lock, + wait_lock_heap_no); + + /* memory could not be allocated */ + if (*requested_lock_row == NULL) { + + return(FALSE); + } + + /* then iterate over the locks before the wait lock and + add the ones that are blocking it */ + + lock_queue_iterator_reset(&iter, wait_lock, ULINT_UNDEFINED); + + for (curr_lock = lock_queue_iterator_get_prev(&iter); + curr_lock != NULL; + curr_lock = lock_queue_iterator_get_prev(&iter)) { + + if (lock_has_to_wait(wait_lock, curr_lock)) { + + /* add the lock that is + blocking wait_lock */ + blocking_lock_row + = add_lock_to_cache( + cache, curr_lock, + /* heap_no is the same + for the wait and waited + locks */ + wait_lock_heap_no); + + /* memory could not be allocated */ + if (blocking_lock_row == NULL) { + + return(FALSE); + } + + /* add the relation between both locks + to innodb_lock_waits */ + if (!add_lock_wait_to_cache( + cache, *requested_lock_row, + blocking_lock_row)) { + + /* memory could not be allocated */ + return(FALSE); + } + } + } + } else { + + *requested_lock_row = NULL; + } + + return(TRUE); +} + +/** The minimum time that a cache must not be updated after it has been +read for the last time; measured in nanoseconds. We use this technique +to ensure that SELECTs which join several INFORMATION SCHEMA tables read +the same version of the cache. */ +#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */ + +/*******************************************************************//** +Checks if the cache can safely be updated. +@return whether the cache can be updated */ +static bool can_cache_be_updated(trx_i_s_cache_t* cache) +{ + /* cache->last_read is only updated when a shared rw lock on the + whole cache is being held (see trx_i_s_cache_end_read()) and + we are currently holding an exclusive rw lock on the cache. + So it is not possible for last_read to be updated while we are + reading it. */ + return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS; +} + +/*******************************************************************//** +Declare a cache empty, preparing it to be filled up. Not all resources +are freed because they can be reused. */ +static +void +trx_i_s_cache_clear( +/*================*/ + trx_i_s_cache_t* cache) /*!< out: cache to clear */ +{ + cache->innodb_trx.rows_used = 0; + cache->innodb_locks.rows_used = 0; + cache->innodb_lock_waits.rows_used = 0; + + cache->locks_hash.clear(); + + ha_storage_empty(&cache->storage); +} + + +/** + Add transactions to innodb_trx's cache. + + We also add all locks that are relevant to each transaction into + innodb_locks' and innodb_lock_waits' caches. 
+*/ + +static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx) +{ + i_s_locks_row_t *requested_lock_row; + +#ifdef UNIV_DEBUG + { + const auto state= trx->state; + + if (trx->is_autocommit_non_locking()) + { + ut_ad(trx->read_only); + ut_ad(!trx->is_recovered); + ut_ad(trx->mysql_thd); + ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE); + } + else + ut_ad(state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED || + state == TRX_STATE_PREPARED_RECOVERED || + state == TRX_STATE_COMMITTED_IN_MEMORY); + } +#endif /* UNIV_DEBUG */ + + if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row)) + { + if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>( + table_cache_create_empty_row(&cache->innodb_trx, cache))) + { + if (fill_trx_row(trx_row, trx, requested_lock_row, cache)) + return; + --cache->innodb_trx.rows_used; + } + } + + /* memory could not be allocated */ + cache->is_truncated= true; +} + + +/** + Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the + table cache buffer. Cache must be locked for write. +*/ + +static void fetch_data_into_cache(trx_i_s_cache_t *cache) +{ + LockMutexGuard g{SRW_LOCK_CALL}; + trx_i_s_cache_clear(cache); + + /* Capture the state of transactions */ + trx_sys.trx_list.for_each([cache](trx_t &trx) { + if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED && + &trx != (purge_sys.query ? purge_sys.query->trx : nullptr)) + { + trx.mutex_lock(); + if (trx.state != TRX_STATE_NOT_STARTED) + fetch_data_into_cache_low(cache, &trx); + trx.mutex_unlock(); + } + }); + cache->is_truncated= false; +} + + +/*******************************************************************//** +Update the transactions cache if it has not been read for some time. +Called from handler/i_s.cc. +@return 0 - fetched, 1 - not */ +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + trx_i_s_cache_t* cache) /*!< in/out: cache */ +{ + if (!can_cache_be_updated(cache)) { + + return(1); + } + + /* We need to read trx_sys and record/table lock queues */ + fetch_data_into_cache(cache); + + /* update cache last read time */ + cache->last_read = my_interval_timer(); + + return(0); +} + +/*******************************************************************//** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. +@return TRUE if truncated */ +bool +trx_i_s_cache_is_truncated( +/*=======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + return(cache->is_truncated); +} + +/*******************************************************************//** +Initialize INFORMATION SCHEMA trx related cache. 
*/ +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache) /*!< out: cache to init */ +{ + /* The latching is done in the following order: + acquire trx_i_s_cache_t::rw_lock, rwlock + acquire exclusive lock_sys.latch + release exclusive lock_sys.latch + release trx_i_s_cache_t::rw_lock + acquire trx_i_s_cache_t::rw_lock, rdlock + release trx_i_s_cache_t::rw_lock */ + + cache->rw_lock.SRW_LOCK_INIT(trx_i_s_cache_lock_key); + + cache->last_read = 0; + + table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t)); + table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t)); + table_cache_init(&cache->innodb_lock_waits, + sizeof(i_s_lock_waits_row_t)); + + cache->locks_hash.create(LOCKS_HASH_CELLS_NUM); + + cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE, + CACHE_STORAGE_HASH_CELLS); + + cache->mem_allocd = 0; + + cache->is_truncated = false; +} + +/*******************************************************************//** +Free the INFORMATION SCHEMA trx related cache. */ +void +trx_i_s_cache_free( +/*===============*/ + trx_i_s_cache_t* cache) /*!< in, own: cache to free */ +{ + cache->rw_lock.destroy(); + + cache->locks_hash.free(); + ha_storage_free(cache->storage); + table_cache_free(&cache->innodb_trx); + table_cache_free(&cache->innodb_locks); + table_cache_free(&cache->innodb_lock_waits); +} + +/*******************************************************************//** +Issue a shared/read lock on the tables cache. */ +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + cache->rw_lock.rd_lock(SRW_LOCK_CALL); +} + +/*******************************************************************//** +Release a shared/read lock on the tables cache. */ +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + cache->last_read = my_interval_timer(); + cache->rw_lock.rd_unlock(); +} + +/*******************************************************************//** +Issue an exclusive/write lock on the tables cache. */ +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + cache->rw_lock.wr_lock(SRW_LOCK_CALL); +} + +/*******************************************************************//** +Release an exclusive/write lock on the tables cache. */ +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache) /*!< in: cache */ +{ + cache->rw_lock.wr_unlock(); +} + +/*******************************************************************//** +Selects a INFORMATION SCHEMA table cache from the whole cache. +@return table cache */ +static +i_s_table_cache_t* +cache_select_table( +/*===============*/ + trx_i_s_cache_t* cache, /*!< in: whole cache */ + enum i_s_table table) /*!< in: which table */ +{ + switch (table) { + case I_S_INNODB_TRX: + return &cache->innodb_trx; + case I_S_INNODB_LOCKS: + return &cache->innodb_locks; + case I_S_INNODB_LOCK_WAITS: + return &cache->innodb_lock_waits; + } + + ut_error; + return NULL; +} + +/*******************************************************************//** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. 
+@return number of rows */ +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table) /*!< in: which table */ +{ + i_s_table_cache_t* table_cache; + + table_cache = cache_select_table(cache, table); + + return(table_cache->rows_used); +} + +/*******************************************************************//** +Retrieves the nth row (zero-based) in the cache for a given +INFORMATION SCHEMA table. +@return row */ +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + trx_i_s_cache_t* cache, /*!< in: cache */ + enum i_s_table table, /*!< in: which table */ + ulint n) /*!< in: row number */ +{ + i_s_table_cache_t* table_cache; + ulint i; + void* row; + + table_cache = cache_select_table(cache, table); + + ut_a(n < table_cache->rows_used); + + row = NULL; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd > n) { + + row = (char*) table_cache->chunks[i].base + + (n - table_cache->chunks[i].offset) + * table_cache->row_size; + break; + } + } + + ut_a(row != NULL); + + return(row); +} + +/*******************************************************************//** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. +@return resulting lock id */ +char* +trx_i_s_create_lock_id( +/*===================*/ + const i_s_locks_row_t* row, /*!< in: innodb_locks row */ + char* lock_id,/*!< out: resulting lock_id */ + ulint lock_id_size)/*!< in: size of the lock id + buffer */ +{ + int res_len; + + /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */ + + if (row->lock_index) { + /* record lock */ + res_len = snprintf(lock_id, lock_id_size, + TRX_ID_FMT + ":%u:%u:%u", + row->lock_trx_id, row->lock_page.space(), + row->lock_page.page_no(), row->lock_rec); + } else { + /* table lock */ + res_len = snprintf(lock_id, lock_id_size, + TRX_ID_FMT":" UINT64PF, + row->lock_trx_id, + row->lock_table_id); + } + + /* the typecast is safe because snprintf(3) never returns + negative result */ + ut_a(res_len >= 0); + ut_a((ulint) res_len < lock_id_size); + + return(lock_id); +} diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc new file mode 100644 index 00000000..625d3223 --- /dev/null +++ b/storage/innobase/trx/trx0purge.cc @@ -0,0 +1,1416 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0purge.cc +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0purge.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "que0que.h" +#include "row0purge.h" +#include "row0upd.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include <mysql/service_wsrep.h> + +#include <unordered_map> + +/** Maximum allowable purge history length. <=0 means 'infinite'. */ +ulong srv_max_purge_lag = 0; + +/** Max DML user threads delay in micro-seconds. */ +ulong srv_max_purge_lag_delay = 0; + +/** The global data structure coordinating a purge */ +purge_sys_t purge_sys; + +#ifdef UNIV_DEBUG +my_bool srv_purge_view_update_only_debug; +#endif /* UNIV_DEBUG */ + +/** Sentinel value */ +static const TrxUndoRsegs NullElement; + +/** Default constructor */ +TrxUndoRsegsIterator::TrxUndoRsegsIterator() + : m_rsegs(NullElement), m_iter(m_rsegs.begin()) +{ +} + +/** Sets the next rseg to purge in purge_sys. +Executed in the purge coordinator thread. +@return whether anything is to be purged */ +TRANSACTIONAL_INLINE inline bool TrxUndoRsegsIterator::set_next() +{ + mysql_mutex_lock(&purge_sys.pq_mutex); + + /* Only purge consumes events from the priority queue, user + threads only produce the events. */ + + /* Check if there are more rsegs to process in the + current element. */ + if (m_iter != m_rsegs.end()) { + /* We are still processing rollback segment from + the same transaction and so expected transaction + number shouldn't increase. Undo the increment of + expected commit done by caller assuming rollback + segments from given transaction are done. */ + purge_sys.tail.trx_no = (*m_iter)->last_trx_no(); + } else if (!purge_sys.purge_queue.empty()) { + m_rsegs = purge_sys.purge_queue.top(); + purge_sys.purge_queue.pop(); + ut_ad(purge_sys.purge_queue.empty() + || purge_sys.purge_queue.top() != m_rsegs); + m_iter = m_rsegs.begin(); + } else { + /* Queue is empty, reset iterator. */ + purge_sys.rseg = NULL; + mysql_mutex_unlock(&purge_sys.pq_mutex); + m_rsegs = NullElement; + m_iter = m_rsegs.begin(); + return false; + } + + purge_sys.rseg = *m_iter++; + mysql_mutex_unlock(&purge_sys.pq_mutex); + + /* We assume in purge of externally stored fields that space + id is in the range of UNDO tablespace space ids */ + ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(purge_sys.rseg->space->id)); + + trx_id_t last_trx_no; + { +#ifdef SUX_LOCK_GENERIC + purge_sys.rseg->latch.rd_lock(SRW_LOCK_CALL); +#else + transactional_shared_lock_guard<srw_spin_lock> rg + {purge_sys.rseg->latch}; +#endif + last_trx_no = purge_sys.rseg->last_trx_no(); + + purge_sys.hdr_offset = purge_sys.rseg->last_offset(); + purge_sys.hdr_page_no = purge_sys.rseg->last_page_no; + +#ifdef SUX_LOCK_GENERIC + purge_sys.rseg->latch.rd_unlock(); +#endif + } + + /* Only the purge coordinator task will access this object + purge_sys.rseg_iter, or any of purge_sys.hdr_page_no, + purge_sys.tail, purge_sys.head, or modify purge_sys.view. 
*/ + ut_ad(last_trx_no == m_rsegs.trx_no); + ut_a(purge_sys.hdr_page_no != FIL_NULL); + ut_a(purge_sys.tail.trx_no <= last_trx_no); + purge_sys.tail.trx_no = last_trx_no; + + return(true); +} + +/** Build a purge 'query' graph. The actual purge is performed by executing +this query graph. +@return own: the query graph */ +static +que_t* +purge_graph_build() +{ + ut_a(srv_n_purge_threads > 0); + + trx_t* trx = trx_create(); + ut_ad(!trx->id); + trx->start_time = time(NULL); + trx->start_time_micro = microsecond_interval_timer(); + trx->state = TRX_STATE_ACTIVE; + trx->op_info = "purge trx"; + + mem_heap_t* heap = mem_heap_create(512); + que_fork_t* fork = que_fork_create(heap); + fork->trx = trx; + + for (auto i = innodb_purge_threads_MAX; i; i--) { + que_thr_t* thr = que_thr_create(fork, heap, NULL); + thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t))) + purge_node_t(thr); + } + + return(fork); +} + +/** Initialise the purge system. */ +void purge_sys_t::create() +{ + ut_ad(this == &purge_sys); + ut_ad(!heap); + ut_ad(!enabled()); + m_paused= 0; + m_SYS_paused= 0; + query= purge_graph_build(); + next_stored= false; + rseg= NULL; + page_no= 0; + offset= 0; + hdr_page_no= 0; + hdr_offset= 0; + latch.SRW_LOCK_INIT(trx_purge_latch_key); + end_latch.init(); + mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr); + truncate.current= NULL; + truncate.last= NULL; + heap= mem_heap_create(4096); +} + +/** Close the purge subsystem on shutdown. */ +void purge_sys_t::close() +{ + ut_ad(this == &purge_sys); + if (!heap) + return; + + ut_ad(!enabled()); + trx_t* trx = query->trx; + que_graph_free(query); + ut_ad(!trx->id); + ut_ad(trx->state == TRX_STATE_ACTIVE); + trx->state= TRX_STATE_NOT_STARTED; + trx->free(); + latch.destroy(); + end_latch.destroy(); + mysql_mutex_destroy(&pq_mutex); + mem_heap_free(heap); + heap= nullptr; +} + +/** Determine if the history of a transaction is purgeable. +@param trx_id transaction identifier +@return whether the history is purgeable */ +TRANSACTIONAL_TARGET bool purge_sys_t::is_purgeable(trx_id_t trx_id) const +{ + bool purgeable; +#if !defined SUX_LOCK_GENERIC && !defined NO_ELISION + purgeable= false; + if (xbegin()) + { + if (!latch.is_write_locked()) + { + purgeable= view.changes_visible(trx_id); + xend(); + } + else + xabort(); + } + else +#endif + { + latch.rd_lock(SRW_LOCK_CALL); + purgeable= view.changes_visible(trx_id); + latch.rd_unlock(); + } + return purgeable; +} + +/*================ UNDO LOG HISTORY LIST =============================*/ + +/** Prepend the history list with an undo log. +Remove the undo log segment from the rseg slot if it is too big for reuse. +@param[in] trx transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ +void +trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) +{ + DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")", + trx->id, trx_id_t{trx->rw_trx_hash_element->no})); + ut_ad(undo == trx->rsegs.m_redo.undo); + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + ut_ad(undo->rseg == rseg); + buf_block_t* rseg_header = rseg->get(mtr, nullptr); + /* We are in transaction commit; we cannot return an error. If the + database is corrupted, it is better to crash it than to + intentionally violate ACID by committing something that is known to + be corrupted. 
*/ + ut_ad(rseg_header); + buf_block_t* undo_page = trx_undo_set_state_at_finish( + undo, mtr); + trx_ulogf_t* undo_header = undo_page->page.frame + + undo->hdr_offset; + + ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1); + + if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rseg_header->page.frame))) { + /* This database must have been upgraded from + before MariaDB 10.3.5. */ + trx_rseg_format_upgrade(rseg_header, mtr); + } + + if (undo->state != TRX_UNDO_CACHED) { + /* The undo log segment will not be reused */ + ut_a(undo->id < TRX_RSEG_N_SLOTS); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(rseg_header, + TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff); + + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); + + uint32_t hist_size = mach_read_from_4( + TRX_RSEG_HISTORY_SIZE + TRX_RSEG + + rseg_header->page.frame); + + ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST + + undo_page->page.frame)); + + mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rseg_header->page.frame, + hist_size + undo->size); + mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + + rseg_header->page.frame, + trx_sys.get_max_trx_id()); + } + + /* After the purge thread has been given permission to exit, + we may roll back transactions (trx->undo_no==0) + in THD::cleanup() invoked from unlink_thd() in fast shutdown, + or in trx_rollback_recovered() in slow shutdown. + + Before any transaction-generating background threads or the + purge have been started, we can + start transactions in row_merge_drop_temp_indexes(), + and roll back recovered transactions. + + Arbitrary user transactions may be executed when all the undo log + related background processes (including purge) are disabled due to + innodb_force_recovery=2 or innodb_force_recovery=3. + DROP TABLE may be executed at any innodb_force_recovery level. + + During fast shutdown, we may also continue to execute + user transactions. */ + ut_ad(srv_undo_sources + || trx->undo_no == 0 + || (!purge_sys.enabled() + && (srv_is_being_started + || trx_rollback_is_active + || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND)) + || srv_fast_shutdown); + +#ifdef WITH_WSREP + if (wsrep_is_wsrep_xid(&trx->xid)) { + trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr); + } +#endif + + if (trx->mysql_log_file_name && *trx->mysql_log_file_name) { + /* Update the latest MySQL binlog name and offset info + in rollback segment header if MySQL binlogging is on + or the database server is a MySQL replication save. */ + trx_rseg_update_binlog_offset(rseg_header, trx, mtr); + } + + /* Add the log as the first in the history list */ + + /* We are in transaction commit; we cannot return an error + when detecting corruption. It is better to crash the server + than to intentionally violate ACID by committing something + that is known to be corrupted. 
*/ + ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page, + static_cast<uint16_t>(undo->hdr_offset + + TRX_UNDO_HISTORY_NODE), + mtr) == DB_SUCCESS); + + mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, + undo_header + TRX_UNDO_TRX_NO, + trx->rw_trx_hash_element->no); + mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header + + TRX_UNDO_NEEDS_PURGE, 1U); + + if (rseg->last_page_no == FIL_NULL) { + rseg->last_page_no = undo->hdr_page_no; + rseg->set_last_commit(undo->hdr_offset, + trx->rw_trx_hash_element->no); + rseg->set_needs_purge(); + } + + rseg->history_size++; + + if (undo->state == TRX_UNDO_CACHED) { + UT_LIST_ADD_FIRST(rseg->undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE); + ut_free(undo); + } + + undo = NULL; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Remove undo log header from the history list. +@param[in,out] rseg rollback segment header page +@param[in] log undo log segment header page +@param[in] offset byte offset in the undo log segment header page +@param[in,out] mtr mini-transaction */ +static dberr_t trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log, + uint16_t offset, mtr_t *mtr) +{ + return flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY, log, + uint16_t(offset + TRX_UNDO_HISTORY_NODE), mtr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Free an undo log segment, and remove the header from the history list. +@param[in,out] rseg rollback segment +@param[in] hdr_addr file address of log_hdr +@return error code */ +static dberr_t trx_purge_free_segment(trx_rseg_t *rseg, fil_addr_t hdr_addr) +{ + const page_id_t hdr_page_id{rseg->space->id, hdr_addr.page}; + mtr_t mtr; + mtr.start(); + + /* We only need the latch to maintain rseg->curr_size. To follow the + latching order, we must acquire it before acquiring any related + page latch. */ + rseg->latch.wr_lock(SRW_LOCK_CALL); + + dberr_t err; + buf_block_t *rseg_hdr= rseg->get(&mtr, &err); + if (!rseg_hdr) + goto func_exit; + if (buf_block_t *block= buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, + &mtr, &err)) + { + /* Mark the last undo log totally purged, so that if the system + crashes, the tail of the undo log will not get accessed again. The + list of pages in the undo log tail gets inconsistent during the + freeing of the segment, and therefore purge should not try to + access them again. */ + mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->page.frame + + hdr_addr.boffset + TRX_UNDO_NEEDS_PURGE, 0U); + while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + block->page.frame, &mtr)) + { + rseg->latch.wr_unlock(); + rseg_hdr->fix(); + block->fix(); + mtr.commit(); + mtr.start(); + mtr.flag_modified(); + rseg->latch.wr_lock(SRW_LOCK_CALL); + rseg_hdr->page.lock.x_lock(); + block->page.lock.x_lock(); + mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX); + mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY); + } + + /* The page list may now be inconsistent, but the length field + stored in the list base node tells us how big it was before we + started the freeing. */ + const uint32_t seg_size= + flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame); + + /* We may free the undo log segment header page; it must be freed + within the same mtr as the undo log header is removed from the + history list: otherwise, in case of a database crash, the segment + could become inaccessible garbage in the file space. 
*/ + err= trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + goto func_exit; + byte *hist= TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->page.frame; + if (UNIV_UNLIKELY(mach_read_from_4(hist) < seg_size)) + { + err= DB_CORRUPTION; + goto func_exit; + } + mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size); + + /* Here we assume that a file segment with just the header page + can be freed in a few steps, so that the buffer pool is not + flooded with bufferfixed pages: see the note in fsp0fsp.cc. */ + while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + block->page.frame, &mtr)); + + ut_ad(rseg->curr_size >= seg_size); + + rseg->history_size--; + rseg->curr_size -= seg_size; + } + +func_exit: + rseg->latch.wr_unlock(); + mtr.commit(); + return err; +} + +/** Remove unnecessary history data from a rollback segment. +@param[in,out] rseg rollback segment +@param[in] limit truncate anything before this +@return error code */ +static +dberr_t +trx_purge_truncate_rseg_history( + trx_rseg_t& rseg, + const purge_sys_t::iterator& limit) +{ + fil_addr_t hdr_addr; + mtr_t mtr; + + mtr.start(); + ut_ad(rseg.is_persistent()); + rseg.latch.wr_lock(SRW_LOCK_CALL); + + dberr_t err; + buf_block_t* rseg_hdr = rseg.get(&mtr, &err); + if (!rseg_hdr) { + goto func_exit; + } + + hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY + + rseg_hdr->page.frame); + hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset + - TRX_UNDO_HISTORY_NODE); + +loop: + if (hdr_addr.page == FIL_NULL) { +func_exit: + rseg.latch.wr_unlock(); + mtr.commit(); + return err; + } + + buf_block_t* block = buf_page_get_gen(page_id_t(rseg.space->id, + hdr_addr.page), + 0, RW_X_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, + &mtr, &err); + if (!block) { + goto func_exit; + } + + const trx_id_t undo_trx_no = mach_read_from_8( + block->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO); + + if (undo_trx_no >= limit.trx_no) { + if (undo_trx_no == limit.trx_no) { + err = trx_undo_truncate_start( + &rseg, hdr_addr.page, + hdr_addr.boffset, limit.undo_no); + } + + goto func_exit; + } + + fil_addr_t prev_hdr_addr = flst_get_prev_addr( + block->page.frame + hdr_addr.boffset + TRX_UNDO_HISTORY_NODE); + prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset + - TRX_UNDO_HISTORY_NODE); + + if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + + block->page.frame) + == TRX_UNDO_TO_PURGE + && !mach_read_from_2(block->page.frame + hdr_addr.boffset + + TRX_UNDO_NEXT_LOG)) { + + /* We can free the whole log segment */ + + rseg.latch.wr_unlock(); + mtr.commit(); + + /* calls the trx_purge_remove_log_hdr() + inside trx_purge_free_segment(). */ + err = trx_purge_free_segment(&rseg, hdr_addr); + if (err != DB_SUCCESS) { + return err; + } + } else { + /* Remove the log hdr from the rseg history. */ + err = trx_purge_remove_log_hdr(rseg_hdr, block, + hdr_addr.boffset, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + rseg.history_size--; + rseg.latch.wr_unlock(); + mtr.commit(); + } + + mtr.start(); + rseg.latch.wr_lock(SRW_LOCK_CALL); + + hdr_addr = prev_hdr_addr; + + rseg_hdr = rseg.get(&mtr, &err); + if (!rseg_hdr) { + goto func_exit; + } + + goto loop; +} + +/** Cleanse purge queue to remove the rseg that reside in undo-tablespace +marked for truncate. 
+@param[in] space undo tablespace being truncated */ +static void trx_purge_cleanse_purge_queue(const fil_space_t& space) +{ + typedef std::vector<TrxUndoRsegs> purge_elem_list_t; + purge_elem_list_t purge_elem_list; + + mysql_mutex_lock(&purge_sys.pq_mutex); + + /* Remove rseg instances that are in the purge queue before we start + truncate of corresponding UNDO truncate. */ + while (!purge_sys.purge_queue.empty()) { + purge_elem_list.push_back(purge_sys.purge_queue.top()); + purge_sys.purge_queue.pop(); + } + + for (purge_elem_list_t::iterator it = purge_elem_list.begin(); + it != purge_elem_list.end(); + ++it) { + + for (TrxUndoRsegs::iterator it2 = it->begin(); + it2 != it->end(); + ++it2) { + if ((*it2)->space == &space) { + it->erase(it2); + break; + } + } + + if (!it->empty()) { + purge_sys.purge_queue.push(*it); + } + } + + mysql_mutex_unlock(&purge_sys.pq_mutex); +} + +#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__ +# if defined __arm__ || defined __aarch64__ +/* Work around an internal compiler error in GCC 4.8.5 */ +__attribute__((optimize(0))) +# endif +#endif +/** +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! +*/ +TRANSACTIONAL_TARGET static void trx_purge_truncate_history() +{ + ut_ad(purge_sys.head <= purge_sys.tail); + purge_sys_t::iterator &head= purge_sys.head.trx_no + ? purge_sys.head : purge_sys.tail; + + if (head.trx_no >= purge_sys.low_limit_no()) + { + /* This is sometimes necessary. TODO: find out why. */ + head.trx_no= purge_sys.low_limit_no(); + head.undo_no= 0; + } + + dberr_t err= DB_SUCCESS; + for (auto &rseg : trx_sys.rseg_array) + if (rseg.space) + if (dberr_t e= trx_purge_truncate_rseg_history(rseg, head)) + err= e; + + if (err != DB_SUCCESS || srv_undo_tablespaces_active < 2) + return; + + while (srv_undo_log_truncate) + { + if (!purge_sys.truncate.current) + { + const ulint threshold= + ulint(srv_max_undo_log_size >> srv_page_size_shift); + for (ulint i= purge_sys.truncate.last + ? purge_sys.truncate.last->id - srv_undo_space_id_start : 0, + j= i;; ) + { + const auto space_id= srv_undo_space_id_start + i; + ut_ad(srv_is_undo_tablespace(space_id)); + fil_space_t *space= fil_space_get(space_id); + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + + if (space && space->get_size() > threshold) + { + purge_sys.truncate.current= space; + break; + } + + ++i; + i %= srv_undo_tablespaces_active; + if (i == j) + return; + } + } + + fil_space_t &space= *purge_sys.truncate.current; + /* Undo tablespace always are a single file. */ + fil_node_t *file= UT_LIST_GET_FIRST(space.chain); + /* The undo tablespace files are never closed. */ + ut_ad(file->is_open()); + + DBUG_LOG("undo", "marking for truncate: " << file->name); + + for (auto &rseg : trx_sys.rseg_array) + if (rseg.space == &space) + /* Once set, this rseg will not be allocated to subsequent + transactions, but we will wait for existing active + transactions to finish. */ + rseg.set_skip_allocation(); + + for (auto &rseg : trx_sys.rseg_array) + { + if (rseg.space != &space) + continue; +#ifdef SUX_LOCK_GENERIC + rseg.latch.rd_lock(SRW_LOCK_CALL); +#else + transactional_shared_lock_guard<srw_spin_lock> g{rseg.latch}; +#endif + ut_ad(rseg.skip_allocation()); + if (rseg.is_referenced()) + { +not_free: +#ifdef SUX_LOCK_GENERIC + rseg.latch.rd_unlock(); +#endif + return; + } + + if (rseg.curr_size != 1) + { + /* Check if all segments are cached and safe to remove. 
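+ Truncation may proceed only if, apart from the rollback segment header
+ page itself, every page of this rollback segment belongs to a cached
+ undo log whose trx_id is not newer than the purge head, that is,
+ rseg.curr_size equals the sum of the cached undo log sizes plus one;
+ otherwise we bail out via not_free.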
*/ + ulint cached= 0; + for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg.undo_cached); undo; + undo= UT_LIST_GET_NEXT(undo_list, undo)) + { + if (head.trx_no < undo->trx_id) + goto not_free; + else + cached+= undo->size; + } + + ut_ad(rseg.curr_size > cached); + + if (rseg.curr_size > cached + 1) + goto not_free; + } + +#ifdef SUX_LOCK_GENERIC + rseg.latch.rd_unlock(); +#endif + } + + ib::info() << "Truncating " << file->name; + trx_purge_cleanse_purge_queue(space); + + log_free_check(); + + mtr_t mtr; + mtr.start(); + mtr.x_lock_space(&space); + + /* Lock all modified pages of the tablespace. + + During truncation, we do not want any writes to the file. + + If a log checkpoint was completed at LSN earlier than our + mini-transaction commit and the server was killed, then + discarding the to-be-trimmed pages without flushing would + break crash recovery. */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) + { + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->in_file()); + + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + if (bpage->id().space() == space.id && + bpage->oldest_modification() != 1) + { + ut_ad(bpage->frame); + auto block= reinterpret_cast<buf_block_t*>(bpage); + if (!bpage->lock.x_lock_try()) + { + /* Let buf_pool_t::release_freed_page() proceed. */ + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + std::this_thread::yield(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + rescan: + bpage= UT_LIST_GET_LAST(buf_pool.flush_list); + continue; + } + buf_pool.flush_hp.set(prev); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!block->index); /* There is no AHI on undo tablespaces. */ +#endif + bpage->fix(); + ut_ad(!bpage->is_io_fixed()); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + if (bpage->oldest_modification() > 1) + { + bpage->reset_oldest_modification(); + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); + } + else + { + bpage->unfix(); + bpage->lock.x_unlock(); + } + + if (prev != buf_pool.flush_hp.get()) + /* Rescan, because we may have lost the position. */ + goto rescan; + } + + bpage= prev; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + /* Re-initialize tablespace, in a single mini-transaction. */ + const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; + + /* Adjust the tablespace metadata. */ + mysql_mutex_lock(&fil_system.mutex); + space.set_stopping(); + space.is_being_truncated= true; + if (space.crypt_data) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + fil_space_crypt_close_tablespace(&space); + space.release(); + } + else + mysql_mutex_unlock(&fil_system.mutex); + + for (auto i= 6000; space.referenced(); + std::this_thread::sleep_for(std::chrono::milliseconds(10))) + { + if (!--i) + { + mtr.commit(); + ib::error() << "Failed to freeze UNDO tablespace " << file->name; + return; + } + } + + /* Associate the undo tablespace with mtr. 
+ During mtr::commit_shrink(), InnoDB can use the undo + tablespace object to clear all freed ranges */ + mtr.set_named_space(&space); + mtr.trim_pages(page_id_t(space.id, size)); + ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS); + mysql_mutex_lock(&fil_system.mutex); + space.size= file->size= size; + mysql_mutex_unlock(&fil_system.mutex); + + for (auto &rseg : trx_sys.rseg_array) + { + if (rseg.space != &space) + continue; + + dberr_t err; + buf_block_t *rblock= trx_rseg_header_create(&space, + &rseg - trx_sys.rseg_array, + trx_sys.get_max_trx_id(), + &mtr, &err); + ut_a(rblock); + /* These were written by trx_rseg_header_create(). */ + ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rblock->page.frame)); + ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rblock->page.frame)); + rseg.reinit(rblock->page.id().page_no()); + } + + mtr.commit_shrink(space); + + /* No mutex; this is only updated by the purge coordinator. */ + export_vars.innodb_undo_truncations++; + + if (purge_sys.rseg && purge_sys.rseg->last_page_no == FIL_NULL) + { + /* If purge_sys.rseg is pointing to rseg that was recently + truncated then move to next rseg element. + + Note: Ideally purge_sys.rseg should be NULL because purge should + complete processing of all the records but srv_purge_batch_size + can force the purge loop to exit before all the records are purged. */ + purge_sys.rseg= nullptr; + purge_sys.next_stored= false; + } + + DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc"; + log_buffer_flush_to_disk(); + DBUG_SUICIDE();); + + for (auto &rseg : trx_sys.rseg_array) + if (rseg.space == &space) + rseg.clear_skip_allocation(); + + ib::info() << "Truncated " << file->name; + purge_sys.truncate.last= purge_sys.truncate.current; + ut_ad(&space == purge_sys.truncate.current); + purge_sys.truncate.current= nullptr; + } +} + +/***********************************************************************//** +Updates the last not yet purged history log info in rseg when we have purged +a whole undo log. Advances also purge_sys.purge_trx_no past the purged log. */ +static void trx_purge_rseg_get_next_history_log( + ulint* n_pages_handled)/*!< in/out: number of UNDO pages + handled */ +{ + fil_addr_t prev_log_addr; + mtr_t mtr; + + mtr.start(); + + purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL); + + ut_a(purge_sys.rseg->last_page_no != FIL_NULL); + + purge_sys.tail.trx_no= purge_sys.rseg->last_trx_no() + 1; + purge_sys.tail.undo_no= 0; + purge_sys.next_stored= false; + + if (const buf_block_t* undo_page= + buf_page_get_gen(page_id_t(purge_sys.rseg->space->id, + purge_sys.rseg->last_page_no), + 0, RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr)) + { + const trx_ulogf_t *log_hdr= + undo_page->page.frame + purge_sys.rseg->last_offset(); + /* Increase the purge page count by one for every handled log */ + ++*n_pages_handled; + prev_log_addr= flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE); + prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset - + TRX_UNDO_HISTORY_NODE); + } + else + prev_log_addr.page= FIL_NULL; + + const bool empty= prev_log_addr.page == FIL_NULL; + + if (empty) + /* No logs left in the history list */ + purge_sys.rseg->last_page_no= FIL_NULL; + + purge_sys.rseg->latch.wr_unlock(); + mtr.commit(); + + if (empty) + return; + + /* Read the previous log header. 
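+ It supplies TRX_UNDO_TRX_NO and TRX_UNDO_NEEDS_PURGE, which are used
+ below to update the rollback segment's last_page_no, last commit
+ number and needs_purge flag before the rseg is pushed back onto the
+ purge queue.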
*/ + mtr.start(); + + byte needs_purge= 0; + trx_id_t trx_no= 0; + + if (const buf_block_t* undo_page= + buf_page_get_gen(page_id_t(purge_sys.rseg->space->id, prev_log_addr.page), + 0, RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, &mtr)) + { + const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset; + + trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1); + needs_purge= log_hdr[TRX_UNDO_NEEDS_PURGE + 1]; + } + + mtr.commit(); + + if (UNIV_UNLIKELY(!trx_no)) + return; + + purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL); + purge_sys.rseg->last_page_no= prev_log_addr.page; + purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no); + + if (needs_purge) + purge_sys.rseg->set_needs_purge(); + else + purge_sys.rseg->clear_needs_purge(); + + /* Purge can also produce events, however these are already ordered + in the rollback segment and any user generated event will be greater + than the events that Purge produces. ie. Purge can never produce + events from an empty rollback segment. */ + + mysql_mutex_lock(&purge_sys.pq_mutex); + purge_sys.purge_queue.push(*purge_sys.rseg); + mysql_mutex_unlock(&purge_sys.pq_mutex); + purge_sys.rseg->latch.wr_unlock(); +} + +/** Position the purge sys "iterator" on the undo record to use for purging. */ +static void trx_purge_read_undo_rec() +{ + uint16_t offset; + uint32_t page_no; + ib_uint64_t undo_no; + + purge_sys.hdr_offset = purge_sys.rseg->last_offset(); + page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no; + + if (purge_sys.rseg->needs_purge()) { + mtr_t mtr; + mtr.start(); + const buf_block_t* undo_page; + if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec( + *purge_sys.rseg->space, purge_sys.hdr_page_no, + purge_sys.hdr_offset, RW_S_LATCH, + undo_page, &mtr, nullptr)) { + + offset = page_offset(undo_rec); + undo_no = trx_undo_rec_get_undo_no(undo_rec); + page_no = undo_page->page.id().page_no(); + } else { + offset = 0; + undo_no = 0; + } + + mtr.commit(); + } else { + offset = 0; + undo_no = 0; + } + + purge_sys.offset = offset; + purge_sys.page_no = page_no; + purge_sys.tail.undo_no = undo_no; + + purge_sys.next_stored = true; +} + +/***********************************************************************//** +Chooses the next undo log to purge and updates the info in purge_sys. This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +TRANSACTIONAL_TARGET static void trx_purge_choose_next_log() +{ + ut_ad(!purge_sys.next_stored); + + if (purge_sys.rseg_iter.set_next()) { + trx_purge_read_undo_rec(); + } else { + /* There is nothing to do yet. */ + std::this_thread::yield(); + } +} + +/***********************************************************************//** +Gets the next record to purge and updates the info in the purge system. 
+@return copy of an undo log record +@retval -1 if there is nothing to purge +@retval nullptr on corruption */ +static +trx_undo_rec_t* +trx_purge_get_next_rec( +/*===================*/ + ulint* n_pages_handled,/*!< in/out: number of UNDO pages + handled */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + mtr_t mtr; + + ut_ad(purge_sys.next_stored); + ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no()); + + const page_id_t page_id{purge_sys.rseg->space->id, purge_sys.page_no}; + const uint16_t offset = purge_sys.offset; + + if (offset == 0) { + /* It is the dummy undo log record, which means that there is + no need to purge this undo log */ + + trx_purge_rseg_get_next_history_log(n_pages_handled); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + return reinterpret_cast<trx_undo_rec_t*>(-1); + } + + mtr.start(); + + const buf_block_t* undo_page + = buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr); + if (UNIV_UNLIKELY(!undo_page)) { +corrupted: + mtr.commit(); + return nullptr; + } + + const buf_block_t* rec2_page = undo_page; + + const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec( + undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset); + + if (rec2 == NULL) { + rec2 = trx_undo_get_next_rec(rec2_page, offset, + purge_sys.hdr_page_no, + purge_sys.hdr_offset, &mtr); + } + + if (rec2 == NULL) { + mtr_commit(&mtr); + + trx_purge_rseg_get_next_history_log(n_pages_handled); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + mtr_start(&mtr); + + undo_page = buf_page_get_gen(page_id, 0, RW_S_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, + &mtr); + if (UNIV_UNLIKELY(!undo_page)) { + goto corrupted; + } + } else { + purge_sys.offset = page_offset(rec2); + purge_sys.page_no = rec2_page->page.id().page_no(); + purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2); + + if (undo_page != rec2_page) { + /* We advance to a new page of the undo log: */ + (*n_pages_handled)++; + } + } + + trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->page.frame + + offset, heap); + + mtr.commit(); + return rec_copy; +} + +/********************************************************************//** +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. +@return copy of an undo log record +@retval -1 if the whole undo log can skipped in purge +@retval nullptr if nothing is left, or on corruption */ +static MY_ATTRIBUTE((warn_unused_result)) +trx_undo_rec_t* +trx_purge_fetch_next_rec( +/*=====================*/ + roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */ + ulint* n_pages_handled,/*!< in/out: number of UNDO log pages + handled */ + mem_heap_t* heap) /*!< in: memory heap where copied */ +{ + if (!purge_sys.next_stored) { + trx_purge_choose_next_log(); + + if (!purge_sys.next_stored) { + DBUG_PRINT("ib_purge", + ("no logs left in the history list")); + return nullptr; + } + } + + if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) { + return nullptr; + } + + /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n", + pthread_self(), iter->trx_no, iter->undo_no); */ + + *roll_ptr = trx_undo_build_roll_ptr( + /* row_purge_record_func() will later set + ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */ + false, + trx_sys.rseg_id(purge_sys.rseg, true), + purge_sys.page_no, purge_sys.offset); + + /* The following call will advance the stored values of the + purge iterator. 
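+ The roll pointer built above therefore refers to the undo record that
+ is returned by this call, not to the record that the iterator will
+ point to afterwards.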
*/ + + return trx_purge_get_next_rec(n_pages_handled, heap); +} + +/** Run a purge batch. +@param n_purge_threads number of purge threads +@return number of undo log pages handled in the batch */ +static +ulint +trx_purge_attach_undo_recs(ulint n_purge_threads) +{ + que_thr_t* thr; + ulint i; + ulint n_pages_handled = 0; + ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs); + + ut_a(n_purge_threads > 0); + + purge_sys.head = purge_sys.tail; + +#ifdef UNIV_DEBUG + i = 0; + /* Debug code to validate some pre-requisites and reset done flag. */ + for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); + thr != NULL && i < n_purge_threads; + thr = UT_LIST_GET_NEXT(thrs, thr), ++i) { + + purge_node_t* node; + + /* Get the purge node. */ + node = (purge_node_t*) thr->child; + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + ut_ad(node->undo_recs.empty()); + ut_ad(!node->in_progress); + ut_d(node->in_progress = true); + } + + /* There should never be fewer nodes than threads, the inverse + however is allowed because we only use purge threads as needed. */ + ut_ad(i == n_purge_threads); +#endif + + /* Fetch and parse the UNDO records. The UNDO records are added + to a per purge node vector. */ + thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); + ut_a(n_thrs > 0 && thr != NULL); + + ut_ad(purge_sys.head <= purge_sys.tail); + + i = 0; + + std::unordered_map<table_id_t, purge_node_t*> table_id_map; + mem_heap_empty(purge_sys.heap); + + while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { + purge_node_t* node; + trx_purge_rec_t purge_rec; + + /* Get the purge node. */ + node = (purge_node_t*) thr->child; + ut_a(que_node_get_type(node) == QUE_NODE_PURGE); + + /* Track the max {trx_id, undo_no} for truncating the + UNDO logs once we have purged the records. */ + + if (purge_sys.head <= purge_sys.tail) { + purge_sys.head = purge_sys.tail; + } + + /* Fetch the next record, and advance the purge_sys.tail. */ + purge_rec.undo_rec = trx_purge_fetch_next_rec( + &purge_rec.roll_ptr, &n_pages_handled, + purge_sys.heap); + + if (purge_rec.undo_rec == NULL) { + break; + } else if (purge_rec.undo_rec + == reinterpret_cast<trx_undo_rec_t*>(-1)) { + continue; + } + + table_id_t table_id = trx_undo_rec_get_table_id( + purge_rec.undo_rec); + + purge_node_t *& table_node = table_id_map[table_id]; + + if (table_node) { + node = table_node; + } else { + thr = UT_LIST_GET_NEXT(thrs, thr); + + if (!(++i % n_purge_threads)) { + thr = UT_LIST_GET_FIRST( + purge_sys.query->thrs); + } + + ut_a(thr != NULL); + table_node = node; + } + + node->undo_recs.push(purge_rec); + + if (n_pages_handled >= srv_purge_batch_size) { + break; + } + } + + ut_ad(purge_sys.head <= purge_sys.tail); + + return(n_pages_handled); +} + +/*******************************************************************//** +Calculate the DML delay required. +@return delay in microseconds or ULINT_MAX */ +static +ulint +trx_purge_dml_delay(void) +/*=====================*/ +{ + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. */ + ulint delay = 0; /* in microseconds; default: no delay */ + + /* If purge lag is set then calculate the new DML delay. */ + + if (srv_max_purge_lag > 0) { + double ratio = static_cast<double>(trx_sys.history_size()) / + static_cast<double>(srv_max_purge_lag); + + if (ratio > 1.0) { + /* If the history list length exceeds the + srv_max_purge_lag, the data manipulation + statements are delayed by at least 5000 + microseconds. 
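+ For example, if the history list length is twice srv_max_purge_lag,
+ ratio is 2.0 and the computed delay is (2.0 - 0.5) * 10000 = 15000
+ microseconds, subject to the srv_max_purge_lag_delay cap below.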
*/ + delay = (ulint) ((ratio - .5) * 10000); + } + + if (delay > srv_max_purge_lag_delay) { + delay = srv_max_purge_lag_delay; + } + + MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay); + } + + return(delay); +} + +extern tpool::waitable_task purge_worker_task; + +/** Wait for pending purge jobs to complete. */ +static void trx_purge_wait_for_workers_to_complete() +{ + const bool notify_wait{purge_worker_task.is_running()}; + + if (notify_wait) + tpool::tpool_wait_begin(); + + purge_worker_task.wait(); + + if (notify_wait) + tpool::tpool_wait_end(); + + /* There should be no outstanding tasks as long + as the worker threads are active. */ + ut_ad(srv_get_task_queue_length() == 0); +} + +/** Update end_view at the end of a purge batch. */ +TRANSACTIONAL_INLINE void purge_sys_t::clone_end_view() +{ + /* This is only invoked only by the purge coordinator, + which is the only thread that can modify our inputs head, tail, view. + Therefore, we only need to protect end_view from concurrent reads. */ + + /* Limit the end_view similar to what trx_purge_truncate_history() does. */ + const trx_id_t trx_no= head.trx_no ? head.trx_no : tail.trx_no; +#ifdef SUX_LOCK_GENERIC + end_latch.wr_lock(); +#else + transactional_lock_guard<srw_spin_lock_low> g(end_latch); +#endif + end_view= view; + end_view.clamp_low_limit_id(trx_no); +#ifdef SUX_LOCK_GENERIC + end_latch.wr_unlock(); +#endif +} + +/** +Run a purge batch. +@param n_tasks number of purge tasks to submit to the queue +@param truncate whether to truncate the history at the end of the batch +@return number of undo log pages handled in the batch */ +TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, bool truncate) +{ + que_thr_t* thr = NULL; + ulint n_pages_handled; + + ut_ad(n_tasks > 0); + + srv_dml_needed_delay = trx_purge_dml_delay(); + + purge_sys.clone_oldest_view(); + +#ifdef UNIV_DEBUG + if (srv_purge_view_update_only_debug) { + return(0); + } +#endif /* UNIV_DEBUG */ + + /* Fetch the UNDO recs that need to be purged. */ + n_pages_handled = trx_purge_attach_undo_recs(n_tasks); + + /* Submit tasks to workers queue if using multi-threaded purge. */ + for (ulint i = n_tasks; --i; ) { + thr = que_fork_scheduler_round_robin(purge_sys.query, thr); + ut_a(thr); + srv_que_task_enqueue_low(thr); + srv_thread_pool->submit_task(&purge_worker_task); + } + + thr = que_fork_scheduler_round_robin(purge_sys.query, thr); + + que_run_threads(thr); + + trx_purge_wait_for_workers_to_complete(); + + purge_sys.clone_end_view(); + + if (truncate) { + trx_purge_truncate_history(); + } + + MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); + MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled); + + return(n_pages_handled); +} diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc new file mode 100644 index 00000000..dc24f083 --- /dev/null +++ b/storage/innobase/trx/trx0rec.cc @@ -0,0 +1,2426 @@ +/***************************************************************************** + +Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rec.cc +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rec.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0undo.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "ut0mem.h" +#include "row0ext.h" +#include "row0upd.h" +#include "que0que.h" +#include "trx0purge.h" +#include "trx0rseg.h" +#include "row0row.h" +#include "row0mysql.h" +#include "row0ins.h" + +/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */ +const dtuple_t trx_undo_metadata = { + /* This also works for REC_INFO_METADATA_ALTER, because the + delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */ + REC_INFO_METADATA_ADD, 0, 0, + NULL, 0, NULL +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif /* UNIV_DEBUG */ +}; + +/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ + +/** Calculate the free space left for extending an undo log record. +@param undo_block undo log page +@param ptr current end of the undo page +@return bytes left */ +static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr) +{ + ut_ad(ptr >= + &undo_block->page.frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]); + /* The 10 is supposed to be an extra safety margin (and needed for + compatibility with older versions) */ + lint left= srv_page_size - (ptr - undo_block->page.frame) - + (10 + FIL_PAGE_DATA_END); + ut_ad(left >= 0); + return left < 0 ? 0 : static_cast<ulint>(left); +} + +/**********************************************************************//** +Set the next and previous pointers in the undo page for the undo record +that was written to ptr. Update the first free value by the number of bytes +written for this undo record. +@return offset of the inserted entry on the page if succeeded, 0 if fail */ +static +uint16_t +trx_undo_page_set_next_prev_and_add( +/*================================*/ + buf_block_t* undo_block, /*!< in/out: undo log page */ + byte* ptr, /*!< in: ptr up to where data has been + written on this undo page. */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(page_align(ptr) == undo_block->page.frame); + + if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2)) + return 0; + + byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE + + undo_block->page.frame); + + const uint16_t first_free= mach_read_from_2(ptr_to_first_free); + + /* Write offset of the previous undo log record */ + memcpy(ptr, ptr_to_first_free, 2); + ptr += 2; + + const uint16_t end_of_rec= static_cast<uint16_t> + (ptr - undo_block->page.frame); + + /* Update the offset to first free undo record */ + mach_write_to_2(ptr_to_first_free, end_of_rec); + /* Write offset of the next undo log record */ + memcpy(undo_block->page.frame + first_free, ptr_to_first_free, 2); + const byte *start= undo_block->page.frame + first_free + 2; + + mtr->undo_append(*undo_block, start, ptr - start - 2); + return first_free; +} + +/** Virtual column undo log version. 
To distinguish it from a length value +in 5.7.8 undo log, it starts with 0xF1 */ +static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1; + +/** Write virtual column index info (index id and column position in index) +to the undo log +@param[in,out] undo_block undo log page +@param[in] table the table +@param[in] pos the virtual column position +@param[in] ptr undo log record being written +@param[in] first_v_col whether this is the first virtual column + which could start with a version marker +@return new undo log pointer */ +static +byte* +trx_undo_log_v_idx( + buf_block_t* undo_block, + const dict_table_t* table, + ulint pos, + byte* ptr, + bool first_v_col) +{ + ut_ad(pos < table->n_v_def); + dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos); + byte* old_ptr; + + ut_ad(!vcol->v_indexes.empty()); + + ulint size = first_v_col ? 1 + 2 : 2; + const ulint avail = trx_undo_left(undo_block, ptr); + + /* The mach_write_compressed(ptr, flen) in + trx_undo_page_report_modify() will consume additional 1 to 5 bytes. */ + if (avail < size + 5) { + return(NULL); + } + + ulint n_idx = 0; + for (const auto& v_index : vcol->v_indexes) { + n_idx++; + /* FIXME: index->id is 64 bits! */ + size += mach_get_compressed_size(uint32_t(v_index.index->id)); + size += mach_get_compressed_size(v_index.nth_field); + } + + size += mach_get_compressed_size(n_idx); + + if (avail < size + 5) { + return(NULL); + } + + ut_d(const byte* orig_ptr = ptr); + + if (first_v_col) { + /* write the version marker */ + mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1); + + ptr += 1; + } + + old_ptr = ptr; + + ptr += 2; + + ptr += mach_write_compressed(ptr, n_idx); + + for (const auto& v_index : vcol->v_indexes) { + ptr += mach_write_compressed( + /* FIXME: index->id is 64 bits! */ + ptr, uint32_t(v_index.index->id)); + + ptr += mach_write_compressed(ptr, v_index.nth_field); + } + + ut_ad(orig_ptr + size == ptr); + + mach_write_to_2(old_ptr, ulint(ptr - old_ptr)); + + return(ptr); +} + +/** Read virtual column index from undo log, and verify the column is still +indexed, and return its position +@param[in] table the table +@param[in] ptr undo log pointer +@param[out] col_pos the column number or FIL_NULL + if the column is not indexed any more +@return remaining part of undo log record after reading these values */ +static +const byte* +trx_undo_read_v_idx_low( + const dict_table_t* table, + const byte* ptr, + uint32_t* col_pos) +{ + ulint len = mach_read_from_2(ptr); + const byte* old_ptr = ptr; + + *col_pos = FIL_NULL; + + ptr += 2; + + ulint num_idx = mach_read_next_compressed(&ptr); + + ut_ad(num_idx > 0); + + dict_index_t* clust_index = dict_table_get_first_index(table); + + for (ulint i = 0; i < num_idx; i++) { + index_id_t id = mach_read_next_compressed(&ptr); + ulint pos = mach_read_next_compressed(&ptr); + dict_index_t* index = dict_table_get_next_index(clust_index); + + while (index != NULL) { + /* Return if we find a matching index. 
+ TODO: in the future, it might be worth to add + checks on other indexes */ + if (index->id == id) { + const dict_col_t* col = dict_index_get_nth_col( + index, pos); + ut_ad(col->is_virtual()); + const dict_v_col_t* vcol = reinterpret_cast< + const dict_v_col_t*>(col); + *col_pos = vcol->v_pos; + return(old_ptr + len); + } + + index = dict_table_get_next_index(index); + } + } + + return(old_ptr + len); +} + +/** Read virtual column index from undo log or online log if the log +contains such info, and in the undo log case, verify the column is +still indexed, and output its position +@param[in] table the table +@param[in] ptr undo log pointer +@param[in] first_v_col if this is the first virtual column, which + has the version marker +@param[in,out] is_undo_log this function is used to parse both undo log, + and online log for virtual columns. So + check to see if this is undo log. When + first_v_col is true, is_undo_log is output, + when first_v_col is false, is_undo_log is input +@param[out] field_no the column number, or FIL_NULL if not indexed +@return remaining part of undo log record after reading these values */ +const byte* +trx_undo_read_v_idx( + const dict_table_t* table, + const byte* ptr, + bool first_v_col, + bool* is_undo_log, + uint32_t* field_no) +{ + /* Version marker only put on the first virtual column */ + if (first_v_col) { + /* Undo log has the virtual undo log marker */ + *is_undo_log = (mach_read_from_1(ptr) + == VIRTUAL_COL_UNDO_FORMAT_1); + + if (*is_undo_log) { + ptr += 1; + } + } + + if (*is_undo_log) { + ptr = trx_undo_read_v_idx_low(table, ptr, field_no); + } else { + *field_no -= REC_MAX_N_FIELDS; + } + + return(ptr); +} + +/** Reports in the undo log of an insert of virtual columns. +@param[in] undo_block undo log page +@param[in] table the table +@param[in] row dtuple contains the virtual columns +@param[in,out] ptr log ptr +@return true if write goes well, false if out of space */ +static +bool +trx_undo_report_insert_virtual( + buf_block_t* undo_block, + dict_table_t* table, + const dtuple_t* row, + byte** ptr) +{ + byte* start = *ptr; + bool first_v_col = true; + + if (trx_undo_left(undo_block, *ptr) < 2) { + return(false); + } + + /* Reserve 2 bytes to write the number + of bytes the stored fields take in this + undo record */ + *ptr += 2; + + for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table); + col_no++) { + const dict_v_col_t* col + = dict_table_get_nth_v_col(table, col_no); + + if (col->m_col.ord_part) { + + /* make sure enought space to write the length */ + if (trx_undo_left(undo_block, *ptr) < 5) { + return(false); + } + + ulint pos = col_no; + pos += REC_MAX_N_FIELDS; + *ptr += mach_write_compressed(*ptr, pos); + + *ptr = trx_undo_log_v_idx(undo_block, table, + col_no, *ptr, first_v_col); + first_v_col = false; + + if (*ptr == NULL) { + return(false); + } + + const dfield_t* vfield = dtuple_get_nth_v_field( + row, col->v_pos); + switch (ulint flen = vfield->len) { + case 0: case UNIV_SQL_NULL: + if (trx_undo_left(undo_block, *ptr) < 5) { + return(false); + } + + *ptr += mach_write_compressed(*ptr, flen); + break; + default: + ulint max_len + = dict_max_v_field_len_store_undo( + table, col_no); + + if (flen > max_len) { + flen = max_len; + } + + if (trx_undo_left(undo_block, *ptr) + < flen + 5) { + return(false); + } + *ptr += mach_write_compressed(*ptr, flen); + + memcpy(*ptr, vfield->data, flen); + *ptr += flen; + } + } + } + + /* Always mark the end of the log with 2 bytes length field */ + mach_write_to_2(start, ulint(*ptr - 
start)); + + return(true); +} + +/** Reports in the undo log of an insert of a clustered index record. +@param undo_block undo log page +@param trx transaction +@param index clustered index +@param clust_entry index entry which will be inserted to the + clustered index +@param mtr mini-transaction +@param write_empty write empty table undo log record +@return offset of the inserted entry on the page if succeed, 0 if fail */ +static +uint16_t +trx_undo_page_report_insert( + buf_block_t* undo_block, + trx_t* trx, + dict_index_t* index, + const dtuple_t* clust_entry, + mtr_t* mtr, + bool write_empty) +{ + ut_ad(index->is_primary()); + /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes + TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote + TRX_UNDO_INSERT == 1 into insert_undo pages, + or TRX_UNDO_UPDATE == 2 into update_undo pages. */ + ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + undo_block->page.frame) <= 2); + + uint16_t first_free = mach_read_from_2(my_assume_aligned<2> + (TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE + + undo_block->page.frame)); + byte* ptr = undo_block->page.frame + first_free; + + if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) { + /* Not enough space for writing the general parameters */ + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + *ptr++ = TRX_UNDO_INSERT_REC; + ptr += mach_u64_write_much_compressed(ptr, trx->undo_no); + ptr += mach_u64_write_much_compressed(ptr, index->table->id); + + if (write_empty) { + /* Table is in bulk operation */ + undo_block->page.frame[first_free + 2] = TRX_UNDO_EMPTY; + goto done; + } + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the record + to be inserted in the clustered index */ + if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) { + ut_ad(clust_entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(undo_block->page.frame[first_free + 2] + == TRX_UNDO_INSERT_REC); + undo_block->page.frame[first_free + 2] + = TRX_UNDO_INSERT_METADATA; + goto done; + } + + for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) { + + const dfield_t* field = dtuple_get_nth_field(clust_entry, i); + ulint flen = dfield_get_len(field); + + if (trx_undo_left(undo_block, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + switch (flen) { + case 0: case UNIV_SQL_NULL: + break; + default: + if (trx_undo_left(undo_block, ptr) < flen) { + + return(0); + } + + memcpy(ptr, dfield_get_data(field), flen); + ptr += flen; + } + } + + if (index->table->n_v_cols) { + if (!trx_undo_report_insert_virtual( + undo_block, index->table, clust_entry, &ptr)) { + return(0); + } + } + +done: + return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr)); +} + +/**********************************************************************//** +Reads from an undo log record the general parameters. +@return remaining part of undo log record after reading these values */ +const byte* +trx_undo_rec_get_pars( +/*==================*/ + const trx_undo_rec_t* undo_rec, /*!< in: undo log record */ + ulint* type, /*!< out: undo record type: + TRX_UNDO_INSERT_REC, ... 
*/ + ulint* cmpl_info, /*!< out: compiler info, relevant only + for update type records */ + bool* updated_extern, /*!< out: true if we updated an + externally stored fild */ + undo_no_t* undo_no, /*!< out: undo log record number */ + table_id_t* table_id) /*!< out: table id */ +{ + ulint type_cmpl; + + type_cmpl = undo_rec[2]; + const byte *ptr = undo_rec + 3; + + *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN); + type_cmpl &= ~TRX_UNDO_UPD_EXTERN; + *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); + ut_ad(*type >= TRX_UNDO_RENAME_TABLE); + ut_ad(*type <= TRX_UNDO_EMPTY); + *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; + + *undo_no = mach_read_next_much_compressed(&ptr); + *table_id = mach_read_next_much_compressed(&ptr); + ut_ad(*table_id); + + return ptr; +} + +/** Read from an undo log record a non-virtual column value. +@param ptr pointer to remaining part of the undo record +@param field stored field +@param len length of the field, or UNIV_SQL_NULL +@param orig_len original length of the locally stored part +of an externally stored column, or 0 +@return remaining part of undo log record after reading these values */ +const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field, + uint32_t *len, uint32_t *orig_len) +{ + *len = mach_read_next_compressed(&ptr); + *orig_len = 0; + + switch (*len) { + case UNIV_SQL_NULL: + *field = NULL; + break; + case UNIV_EXTERN_STORAGE_FIELD: + *orig_len = mach_read_next_compressed(&ptr); + *len = mach_read_next_compressed(&ptr); + *field = ptr; + ptr += *len & ~SPATIAL_STATUS_MASK; + + ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(*len > *orig_len); + /* @see dtuple_convert_big_rec() */ + ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE); + + /* we do not have access to index->table here + ut_ad(dict_table_has_atomic_blobs(index->table) + || *len >= col->max_prefix + + BTR_EXTERN_FIELD_REF_SIZE); + */ + + *len += UNIV_EXTERN_STORAGE_FIELD; + break; + default: + *field = ptr; + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += (*len - UNIV_EXTERN_STORAGE_FIELD) + & ~SPATIAL_STATUS_MASK; + } else { + ptr += *len; + } + } + + return ptr; +} + +/*******************************************************************//** +Builds a row reference from an undo log record. +@return pointer to remaining part of undo record */ +const byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + const byte* ptr, /*!< in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t**ref, /*!< out, own: row reference */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ +{ + ut_ad(index->is_primary()); + + const ulint ref_len = dict_index_get_n_unique(index); + + dtuple_t* tuple = dtuple_create(heap, ref_len); + *ref = tuple; + + dict_index_copy_types(tuple, index, ref_len); + + for (ulint i = 0; i < ref_len; i++) { + const byte* field; + uint32_t len, orig_len; + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield_set_data(dfield, field, len); + } + + return ptr; +} + +/** Skip a row reference from an undo log record. 
+@param ptr part of an update undo log record +@param index clustered index +@return pointer to remaining part of undo record */ +static const byte *trx_undo_rec_skip_row_ref(const byte *ptr, + const dict_index_t *index) +{ + ut_ad(index->is_primary()); + + ulint ref_len = dict_index_get_n_unique(index); + + for (ulint i = 0; i < ref_len; i++) { + const byte* field; + uint32_t len, orig_len; + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + } + + return(ptr); +} + +/** Fetch a prefix of an externally stored column, for writing to the undo +log of an update or delete marking of a clustered index record. +@param[out] ext_buf buffer to hold the prefix data and BLOB pointer +@param[in] prefix_len prefix size to store in the undo log +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] field an externally stored column +@param[in,out] len input: length of field; output: used length of +ext_buf +@return ext_buf */ +static +byte* +trx_undo_page_fetch_ext( + byte* ext_buf, + ulint prefix_len, + ulint zip_size, + const byte* field, + ulint* len) +{ + /* Fetch the BLOB. */ + ulint ext_len = btr_copy_externally_stored_field_prefix( + ext_buf, prefix_len, zip_size, field, *len); + /* BLOBs should always be nonempty. */ + ut_a(ext_len); + /* Append the BLOB pointer to the prefix. */ + memcpy(ext_buf + ext_len, + field + *len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE; + return(ext_buf); +} + +/** Writes to the undo log a prefix of an externally stored column. +@param[out] ptr undo log position, at least 15 bytes must be +available +@param[out] ext_buf a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT() + size, or NULL when should not fetch a longer + prefix +@param[in] prefix_len prefix size to store in the undo log +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] field the locally stored part of the externally +stored column +@param[in,out] len length of field, in bytes +@param[in] spatial_status whether the column is used by spatial index or + regular index +@return undo log position */ +static +byte* +trx_undo_page_report_modify_ext( + byte* ptr, + byte* ext_buf, + ulint prefix_len, + ulint zip_size, + const byte** field, + ulint* len, + spatial_status_t spatial_status) +{ + ulint spatial_len= 0; + + switch (spatial_status) { + case SPATIAL_UNKNOWN: + case SPATIAL_NONE: + break; + + case SPATIAL_MIXED: + case SPATIAL_ONLY: + spatial_len = DATA_MBR_LEN; + break; + } + + /* Encode spatial status into length. */ + spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT; + + if (spatial_status == SPATIAL_ONLY) { + /* If the column is only used by gis index, log its + MBR is enough.*/ + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD + + spatial_len); + + return(ptr); + } + + if (ext_buf) { + ut_a(prefix_len > 0); + + /* If an ordering column is externally stored, we will + have to store a longer prefix of the field. In this + case, write to the log a marker followed by the + original length and the real length of the field. 
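+ That is, the undo log gets the UNIV_EXTERN_STORAGE_FIELD marker, the
+ original (locally stored) length of the field, and then the length of
+ the fetched prefix (including the BLOB pointer) plus any spatial
+ length.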
*/ + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD); + + ptr += mach_write_compressed(ptr, *len); + + *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, + zip_size, *field, len); + + ptr += mach_write_compressed(ptr, *len + spatial_len); + } else { + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD + + *len + spatial_len); + } + + return(ptr); +} + +/** Get MBR from a Geometry column stored externally +@param[out] mbr MBR to fill +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] field field contain the geometry data +@param[in,out] len length of field, in bytes +*/ +static +void +trx_undo_get_mbr_from_ext( +/*======================*/ + double* mbr, + ulint zip_size, + const byte* field, + ulint* len) +{ + uchar* dptr = NULL; + ulint dlen; + mem_heap_t* heap = mem_heap_create(100); + + dptr = btr_copy_externally_stored_field( + &dlen, field, zip_size, *len, heap); + + if (dlen <= GEO_DATA_HEADER_SIZE) { + for (uint i = 0; i < SPDIMS; ++i) { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + } else { + rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE, + static_cast<uint>(dlen + - GEO_DATA_HEADER_SIZE), SPDIMS, mbr); + } + + mem_heap_free(heap); +} + +/**********************************************************************//** +Reports in the undo log of an update or delete marking of a clustered index +record. +@return byte offset of the inserted undo log entry on the page if +succeed, 0 if fail */ +static +uint16_t +trx_undo_page_report_modify( +/*========================*/ + buf_block_t* undo_block, /*!< in: undo log page */ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: clustered index where update or + delete marking is done */ + const rec_t* rec, /*!< in: clustered index record which + has NOT yet been modified */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector which tells the + columns to be updated; in the case of + a delete, this should be set to NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const dtuple_t* row, /*!< in: clustered index row contains + virtual column info */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(index->is_primary()); + ut_ad(rec_offs_validate(rec, index, offsets)); + /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes + TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote + TRX_UNDO_INSERT == 1 into insert_undo pages, + or TRX_UNDO_UPDATE == 2 into update_undo pages. 
*/ + ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + + undo_block->page.frame) <= 2); + + byte* ptr_to_first_free = my_assume_aligned<2>( + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->page.frame); + + const uint16_t first_free = mach_read_from_2(ptr_to_first_free); + byte *ptr = undo_block->page.frame + first_free; + + if (trx_undo_left(undo_block, ptr) < 50) { + /* NOTE: the value 50 must be big enough so that the general + fields written below fit on the undo log page */ + return 0; + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + dict_table_t* table = index->table; + const byte* field; + ulint flen; + ulint col_no; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + trx_id_t trx_id; + ibool ignore_prefix = FALSE; + byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE]; + bool first_v_col = true; + + /* Store first some general parameters to the undo log */ + + if (!update) { + ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table))); + type_cmpl = TRX_UNDO_DEL_MARK_REC; + } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing update_undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, offsets)); + + type_cmpl = TRX_UNDO_UPD_DEL_REC; + /* We are about to update a delete marked record. + We don't typically need the prefix in this case unless + the delete marking is done by the same transaction + (which we check below). */ + ignore_prefix = TRUE; + } else { + type_cmpl = TRX_UNDO_UPD_EXIST_REC; + } + + type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT; + type_cmpl_ptr = ptr; + + *ptr++ = (byte) type_cmpl; + ptr += mach_u64_write_much_compressed(ptr, trx->undo_no); + + ptr += mach_u64_write_much_compressed(ptr, table->id); + + /*----------------------------------------*/ + /* Store the state of the info bits */ + + *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); + + /* Store the values of the system columns */ + field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen); + ut_ad(flen == DATA_TRX_ID_LEN); + + trx_id = trx_read_trx_id(field); + + /* If it is an update of a delete marked record, then we are + allowed to ignore blob prefixes if the delete marking was done + by some other trx as it must have committed by now for us to + allow an over-write. */ + if (trx_id == trx->id) { + ignore_prefix = false; + } + ptr += mach_u64_write_compressed(ptr, trx_id); + + field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen); + ut_ad(flen == DATA_ROLL_PTR_LEN); + ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN)); + + ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field)); + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the + record which will be modified in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + /* The ordering columns must not be instant added columns. */ + ut_ad(!rec_offs_nth_default(offsets, i)); + field = rec_get_nth_field(rec, offsets, i, &flen); + + /* The ordering columns must not be stored externally. 
*/ + ut_ad(!rec_offs_nth_extern(offsets, i)); + ut_ad(dict_index_get_nth_col(index, i)->ord_part); + + if (trx_undo_left(undo_block, ptr) < 5) { + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_block, ptr) < flen) { + return(0); + } + + memcpy(ptr, field, flen); + ptr += flen; + } + } + + /*----------------------------------------*/ + /* Save to the undo log the old values of the columns to be updated. */ + + if (update) { + if (trx_undo_left(undo_block, ptr) < 5) { + return(0); + } + + ulint n_updated = upd_get_n_fields(update); + + /* If this is an online update while an inplace alter table + is in progress and the table has virtual column, we will + need to double check if there are any non-indexed columns + being registered in update vector in case they will be indexed + in new table */ + if (dict_index_is_online_ddl(index) && table->n_v_cols > 0) { + for (i = 0; i < upd_get_n_fields(update); i++) { + upd_field_t* fld = upd_get_nth_field( + update, i); + ulint pos = fld->field_no; + + /* These columns must not have an index + on them */ + if (upd_fld_is_virtual_col(fld) + && dict_table_get_nth_v_col( + table, pos)->v_indexes.empty()) { + n_updated--; + } + } + } + + i = 0; + + if (UNIV_UNLIKELY(update->is_alter_metadata())) { + ut_ad(update->n_fields >= 1); + ut_ad(!upd_fld_is_virtual_col(&update->fields[0])); + ut_ad(update->fields[0].field_no + == index->first_user_field()); + ut_ad(!dfield_is_ext(&update->fields[0].new_val)); + ut_ad(!dfield_is_null(&update->fields[0].new_val)); + /* The instant ADD COLUMN metadata record does not + contain the BLOB. Do not write anything for it. */ + i = !rec_is_alter_metadata(rec, *index); + n_updated -= i; + } + + ptr += mach_write_compressed(ptr, n_updated); + + for (; i < upd_get_n_fields(update); i++) { + if (trx_undo_left(undo_block, ptr) < 5) { + return 0; + } + + upd_field_t* fld = upd_get_nth_field(update, i); + + bool is_virtual = upd_fld_is_virtual_col(fld); + ulint max_v_log_len = 0; + + ulint pos = fld->field_no; + const dict_col_t* col = NULL; + + if (is_virtual) { + /* Skip the non-indexed column, during + an online alter table */ + if (dict_index_is_online_ddl(index) + && dict_table_get_nth_v_col( + table, pos)->v_indexes.empty()) { + continue; + } + + /* add REC_MAX_N_FIELDS to mark this + is a virtual col */ + ptr += mach_write_compressed( + ptr, pos + REC_MAX_N_FIELDS); + + if (trx_undo_left(undo_block, ptr) < 15) { + return 0; + } + + ut_ad(fld->field_no < table->n_v_def); + + ptr = trx_undo_log_v_idx(undo_block, table, + fld->field_no, ptr, + first_v_col); + if (ptr == NULL) { + return(0); + } + first_v_col = false; + + max_v_log_len + = dict_max_v_field_len_store_undo( + table, fld->field_no); + + field = static_cast<byte*>( + fld->old_v_val->data); + flen = fld->old_v_val->len; + + /* Only log sufficient bytes for index + record update */ + if (flen != UNIV_SQL_NULL) { + flen = ut_min( + flen, max_v_log_len); + } + + goto store_len; + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(pos >= index->first_user_field()); + ut_ad(rec_is_metadata(rec, *index)); + + if (rec_is_alter_metadata(rec, *index)) { + ut_ad(update->is_alter_metadata()); + + field = rec_offs_n_fields(offsets) + > pos + && !rec_offs_nth_default( + offsets, pos) + ? 
rec_get_nth_field( + rec, offsets, + pos, &flen) + : index->instant_field_value( + pos - 1, &flen); + + if (pos == index->first_user_field()) { + ut_ad(rec_offs_nth_extern( + offsets, pos)); + ut_ad(flen == FIELD_REF_SIZE); + goto write_field; + } + col = dict_index_get_nth_col(index, + pos - 1); + } else if (!update->is_alter_metadata()) { + goto get_field; + } else { + /* We are converting an ADD COLUMN + metadata record to an ALTER TABLE + metadata record, with BLOB. Subtract + the missing metadata BLOB field. */ + ut_ad(pos > index->first_user_field()); + --pos; + goto get_field; + } + } else { +get_field: + col = dict_index_get_nth_col(index, pos); + field = rec_get_nth_cfield( + rec, index, offsets, pos, &flen); + } +write_field: + /* Write field number to undo log */ + ptr += mach_write_compressed(ptr, pos); + + if (trx_undo_left(undo_block, ptr) < 15) { + return 0; + } + + if (rec_offs_n_fields(offsets) > pos + && rec_offs_nth_extern(offsets, pos)) { + ut_ad(col || pos == index->first_user_field()); + ut_ad(col || update->is_alter_metadata()); + ut_ad(col + || rec_is_alter_metadata(rec, *index)); + ulint prefix_len = col + ? dict_max_field_len_store_undo( + table, col) + : 0; + + ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE + <= sizeof ext_buf); + + ptr = trx_undo_page_report_modify_ext( + ptr, + col + && col->ord_part + && !ignore_prefix + && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + ? ext_buf : NULL, prefix_len, + table->space->zip_size(), + &field, &flen, SPATIAL_UNKNOWN); + + *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; + } else { +store_len: + ptr += mach_write_compressed(ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_block, ptr) < flen) { + return(0); + } + + memcpy(ptr, field, flen); + ptr += flen; + } + + /* Also record the new value for virtual column */ + if (is_virtual) { + field = static_cast<byte*>(fld->new_val.data); + flen = fld->new_val.len; + if (flen != UNIV_SQL_NULL) { + flen = ut_min( + flen, max_v_log_len); + } + + if (trx_undo_left(undo_block, ptr) < 15) { + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_block, ptr) + < flen) { + return(0); + } + + memcpy(ptr, field, flen); + ptr += flen; + } + } + } + } + + /* Reset the first_v_col, so to put the virtual column undo + version marker again, when we log all the indexed columns */ + first_v_col = true; + + /*----------------------------------------*/ + /* In the case of a delete marking, and also in the case of an update + where any ordering field of any index changes, store the values of all + columns which occur as ordering fields in any index. This info is used + in the purge of old versions where we use it to build and search the + delete marked index records, to look if we can remove them from the + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. Starting from 5.2, we no longer + store REC_MAX_INDEX_COL_LEN first bytes to the undo log record, + but we can construct the column prefix fields in the index by + fetching the first page of the BLOB that is pointed to by the + clustered index. This works also in crash recovery, because all pages + (including BLOBs) are recovered before anything is rolled back. 
*/ + + if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + byte* old_ptr = ptr; + double mbr[SPDIMS * 2]; + mem_heap_t* row_heap = NULL; + + if (trx_undo_left(undo_block, ptr) < 5) { + return(0); + } + + /* Reserve 2 bytes to write the number of bytes the stored + fields take in this undo record */ + + ptr += 2; + + for (col_no = 0; col_no < dict_table_get_n_cols(table); + col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + + if (!col->ord_part) { + continue; + } + + const ulint pos = dict_index_get_nth_col_pos( + index, col_no, NULL); + /* All non-virtual columns must be present in + the clustered index. */ + ut_ad(pos != ULINT_UNDEFINED); + + const bool is_ext = rec_offs_nth_extern(offsets, pos); + const spatial_status_t spatial_status = is_ext + ? dict_col_get_spatial_status(col) + : SPATIAL_NONE; + + switch (spatial_status) { + case SPATIAL_UNKNOWN: + ut_ad(0); + /* fall through */ + case SPATIAL_MIXED: + case SPATIAL_ONLY: + /* Externally stored spatially indexed + columns will be (redundantly) logged + again, because we did not write the + MBR yet, that is, the previous call to + trx_undo_page_report_modify_ext() + was with SPATIAL_UNKNOWN. */ + break; + case SPATIAL_NONE: + if (!update) { + /* This is a DELETE operation. */ + break; + } + /* Avoid redundantly logging indexed + columns that were updated. */ + + for (i = 0; i < update->n_fields; i++) { + const ulint field_no + = upd_get_nth_field(update, i) + ->field_no; + if (field_no >= index->n_fields + || dict_index_get_nth_field( + index, field_no)->col + == col) { + goto already_logged; + } + } + } + + if (true) { + /* Write field number to undo log */ + if (trx_undo_left(undo_block, ptr) < 5 + 15) { + return(0); + } + + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_cfield( + rec, index, offsets, pos, &flen); + + if (is_ext) { + const dict_col_t* col = + dict_index_get_nth_col( + index, pos); + ulint prefix_len = + dict_max_field_len_store_undo( + table, col); + + ut_a(prefix_len < sizeof ext_buf); + const ulint zip_size + = table->space->zip_size(); + + /* If there is a spatial index on it, + log its MBR */ + if (spatial_status != SPATIAL_NONE) { + ut_ad(DATA_GEOMETRY_MTYPE( + col->mtype)); + + trx_undo_get_mbr_from_ext( + mbr, zip_size, + field, &flen); + } + + ptr = trx_undo_page_report_modify_ext( + ptr, + flen < REC_ANTELOPE_MAX_INDEX_COL_LEN + && !ignore_prefix + ? ext_buf : NULL, prefix_len, + zip_size, + &field, &flen, + spatial_status); + } else { + ptr += mach_write_compressed( + ptr, flen); + } + + if (flen != UNIV_SQL_NULL + && spatial_status != SPATIAL_ONLY) { + if (trx_undo_left(undo_block, ptr) + < flen) { + return(0); + } + + memcpy(ptr, field, flen); + ptr += flen; + } + + if (spatial_status != SPATIAL_NONE) { + if (trx_undo_left(undo_block, ptr) + < DATA_MBR_LEN) { + return(0); + } + + for (int i = 0; i < SPDIMS * 2; + i++) { + mach_double_write( + ptr, mbr[i]); + ptr += sizeof(double); + } + } + } + +already_logged: + continue; + } + + for (col_no = 0; col_no < dict_table_get_n_v_cols(table); + col_no++) { + const dict_v_col_t* col + = dict_table_get_nth_v_col(table, col_no); + + if (col->m_col.ord_part) { + ulint pos = col_no; + ulint max_v_log_len + = dict_max_v_field_len_store_undo( + table, pos); + + /* Write field number to undo log. 
+ Make sure there is enought space in log */ + if (trx_undo_left(undo_block, ptr) < 5) { + return(0); + } + + pos += REC_MAX_N_FIELDS; + ptr += mach_write_compressed(ptr, pos); + + ut_ad(col_no < table->n_v_def); + ptr = trx_undo_log_v_idx(undo_block, table, + col_no, ptr, + first_v_col); + first_v_col = false; + + if (!ptr) { + return(0); + } + + const dfield_t* vfield = NULL; + + if (update) { + ut_ad(!row); + if (update->old_vrow == NULL) { + flen = UNIV_SQL_NULL; + } else { + vfield = dtuple_get_nth_v_field( + update->old_vrow, + col->v_pos); + } + } else if (row) { + vfield = dtuple_get_nth_v_field( + row, col->v_pos); + } else { + ut_ad(0); + } + + if (vfield) { + field = static_cast<byte*>(vfield->data); + flen = vfield->len; + } else { + ut_ad(flen == UNIV_SQL_NULL); + } + + if (flen != UNIV_SQL_NULL) { + flen = ut_min( + flen, max_v_log_len); + } + + ptr += mach_write_compressed(ptr, flen); + + switch (flen) { + case 0: case UNIV_SQL_NULL: + break; + default: + if (trx_undo_left(undo_block, ptr) + < flen) { + return(0); + } + + memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + mach_write_to_2(old_ptr, ulint(ptr - old_ptr)); + + if (row_heap) { + mem_heap_free(row_heap); + } + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + if (trx_undo_left(undo_block, ptr) < 2) { + return(0); + } + + mach_write_to_2(ptr, first_free); + const uint16_t new_free = static_cast<uint16_t>( + ptr + 2 - undo_block->page.frame); + mach_write_to_2(undo_block->page.frame + first_free, new_free); + + mach_write_to_2(ptr_to_first_free, new_free); + + const byte* start = &undo_block->page.frame[first_free + 2]; + mtr->undo_append(*undo_block, start, ptr - start); + return(first_free); +} + +/**********************************************************************//** +Reads from an undo log update record the system field values of the old +version. +@return remaining part of undo log record after reading these values */ +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + const byte* ptr, /*!< in: remaining part of undo + log record after reading + general parameters */ + trx_id_t* trx_id, /*!< out: trx id */ + roll_ptr_t* roll_ptr, /*!< out: roll ptr */ + byte* info_bits) /*!< out: info bits state */ +{ + /* Read the state of the info bits */ + *info_bits = *ptr++; + + /* Read the values of the system columns */ + + *trx_id = mach_u64_read_next_compressed(&ptr); + *roll_ptr = mach_u64_read_next_compressed(&ptr); + + return(const_cast<byte*>(ptr)); +} + +/*******************************************************************//** +Builds an update vector based on a remaining part of an undo log record. +@return remaining part of the record, NULL if an error detected, which +means that the record is corrupted */ +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + const byte* ptr, /*!< in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! 
*/ + dict_index_t* index, /*!< in: clustered index */ + ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + trx_id_t trx_id, /*!< in: transaction id from this undo record */ + roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */ + byte info_bits,/*!< in: info bits from this undo record */ + mem_heap_t* heap, /*!< in: memory heap from which the memory + needed is allocated */ + upd_t** upd) /*!< out, own: update vector */ +{ + upd_field_t* upd_field; + upd_t* update; + ulint n_fields; + byte* buf; + bool first_v_col = true; + bool is_undo_log = true; + ulint n_skip_field = 0; + + ut_a(dict_index_is_clust(index)); + + if (type != TRX_UNDO_DEL_MARK_REC) { + n_fields = mach_read_next_compressed(&ptr); + } else { + n_fields = 0; + } + + *upd = update = upd_create(n_fields + 2, heap); + + update->info_bits = info_bits; + + /* Store first trx id and roll ptr to update vector */ + + upd_field = upd_get_nth_field(update, n_fields); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN)); + + mach_write_to_6(buf, trx_id); + + upd_field_set_field_no(upd_field, index->db_trx_id(), index); + dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); + + upd_field = upd_get_nth_field(update, n_fields + 1); + + buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN)); + + trx_write_roll_ptr(buf, roll_ptr); + + upd_field_set_field_no(upd_field, index->db_roll_ptr(), index); + dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); + + /* Store then the updated ordinary columns to the update vector */ + + for (ulint i = 0; i < n_fields; i++) { + const byte* field; + uint32_t len, orig_len; + + upd_field = upd_get_nth_field(update, i); + uint32_t field_no = mach_read_next_compressed(&ptr); + + const bool is_virtual = (field_no >= REC_MAX_N_FIELDS); + + if (is_virtual) { + /* If new version, we need to check index list to figure + out the correct virtual column position */ + ptr = trx_undo_read_v_idx( + index->table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col = false; + /* This column could be dropped or no longer indexed */ + if (field_no >= index->n_fields) { + /* Mark this is no longer needed */ + upd_field->field_no = REC_MAX_N_FIELDS; + + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + n_skip_field++; + continue; + } + + upd_field_set_v_field_no( + upd_field, static_cast<uint16_t>(field_no), + index); + } else if (UNIV_UNLIKELY((update->info_bits + & ~REC_INFO_DELETED_FLAG) + == REC_INFO_MIN_REC_FLAG)) { + ut_ad(type == TRX_UNDO_UPD_EXIST_REC); + const uint32_t uf = index->first_user_field(); + ut_ad(field_no >= uf); + + if (update->info_bits != REC_INFO_MIN_REC_FLAG) { + /* Generic instant ALTER TABLE */ + if (field_no == uf) { + upd_field->new_val.type + .metadata_blob_init(); + } else if (field_no >= index->n_fields) { + /* This is reachable during + purge if the table was emptied + and converted to the canonical + format on a later ALTER TABLE. + In this case, + row_purge_upd_exist_or_extern() + would only be interested in + freeing any BLOBs that were + updated, that is, the metadata + BLOB above. Other BLOBs in + the metadata record are never + updated; they are for the + initial DEFAULT values of the + instantly added columns, and + they will never change. 
+ + Note: if the table becomes + empty during ROLLBACK or is + empty during subsequent ALTER + TABLE, and btr_page_empty() is + called to re-create the root + page without the metadata + record, in that case we should + only free the latest version + of BLOBs in the record, + which purge would never touch. */ + field_no = REC_MAX_N_FIELDS; + n_skip_field++; + } else { + dict_col_copy_type( + dict_index_get_nth_col( + index, field_no - 1), + &upd_field->new_val.type); + } + } else { + /* Instant ADD COLUMN...LAST */ + dict_col_copy_type( + dict_index_get_nth_col(index, + field_no), + &upd_field->new_val.type); + } + upd_field->field_no = field_no + & dict_index_t::MAX_N_FIELDS; + } else if (field_no < index->n_fields) { + upd_field_set_field_no(upd_field, + static_cast<uint16_t>(field_no), + index); + } else { + ib::error() << "Trying to access update undo rec" + " field " << field_no + << " in index " << index->name + << " of table " << index->table->name + << " but index has only " + << dict_index_get_n_fields(index) + << " fields " << BUG_REPORT_MSG + << ". Run also CHECK TABLE " + << index->table->name << "." + " n_fields = " << n_fields << ", i = " << i; + + ut_ad(0); + *upd = NULL; + return(NULL); + } + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + upd_field->orig_len = static_cast<uint16_t>(orig_len); + + if (len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else if (len < UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_data(&upd_field->new_val, field, len); + } else { + len -= UNIV_EXTERN_STORAGE_FIELD; + + dfield_set_data(&upd_field->new_val, field, len); + dfield_set_ext(&upd_field->new_val); + } + + ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG + | REC_INFO_MIN_REC_FLAG) + || field_no != index->first_user_field() + || (upd_field->new_val.ext + && upd_field->new_val.len == FIELD_REF_SIZE)); + + if (is_virtual) { + upd_field->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc( + heap, sizeof *upd_field->old_v_val)); + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + if (len == UNIV_SQL_NULL) { + dfield_set_null(upd_field->old_v_val); + } else if (len < UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_data( + upd_field->old_v_val, field, len); + } else { + ut_ad(0); + } + } + } + + /* We may have to skip dropped indexed virtual columns. + Also, we may have to trim the update vector of a metadata record + if dict_index_t::clear_instant_alter() was invoked on the table + later, and the number of fields no longer matches. */ + + if (n_skip_field) { + upd_field_t* d = upd_get_nth_field(update, 0); + const upd_field_t* const end = d + n_fields + 2; + + for (const upd_field_t* s = d; s != end; s++) { + if (s->field_no != REC_MAX_N_FIELDS) { + *d++ = *s; + } + } + + ut_ad(d + n_skip_field == end); + update->n_fields = d - upd_get_nth_field(update, 0); + } + + return(const_cast<byte*>(ptr)); +} + +/** Report a RENAME TABLE operation. 
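The n_skip_field compaction at the end of trx_undo_update_rec_get_update() above shifts the surviving upd_field_t entries down over the ones marked with REC_MAX_N_FIELDS while preserving their order. That is the standard in-place remove_if idiom; a minimal sketch with a stand-in type (Field, SKIP and trim_skipped are invented names, not InnoDB identifiers):

#include <algorithm>
#include <cassert>
#include <vector>

// Stand-in for upd_field_t: SKIP marks entries that must be dropped,
// mirroring field_no == REC_MAX_N_FIELDS in the function above.
struct Field { unsigned field_no; };
constexpr unsigned SKIP = ~0U;

// Compact the vector in place, preserving the relative order of the
// surviving fields, and shrink the logical size accordingly.
void trim_skipped(std::vector<Field>& v)
{
    auto end = std::remove_if(v.begin(), v.end(),
                              [](const Field& f) { return f.field_no == SKIP; });
    v.erase(end, v.end());
}

int main()
{
    std::vector<Field> v{{1}, {SKIP}, {3}, {SKIP}, {5}};
    trim_skipped(v);
    assert(v.size() == 3 && v[0].field_no == 1 && v[2].field_no == 5);
}
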
+@param[in,out] trx transaction +@param[in] table table that is being renamed +@param[in,out] block undo page +@param[in,out] mtr mini-transaction +@return byte offset of the undo log record +@retval 0 in case of failure */ +static +uint16_t +trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table, + buf_block_t* block, mtr_t* mtr) +{ + byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE + + block->page.frame); + const uint16_t first_free = mach_read_from_2(ptr_first_free); + ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END); + byte* const start = block->page.frame + first_free; + size_t len = strlen(table->name.m_name); + const size_t fixed = 2 + 1 + 11 + 11 + 2; + ut_ad(len <= NAME_CHAR_LEN * 5 * 2 + 1); + /* The -10 is used in trx_undo_left() */ + compile_time_assert(NAME_CHAR_LEN * 5 * 2 + fixed + + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE + < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END); + + if (trx_undo_left(block, start) < fixed + len) { + ut_ad(first_free > TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_HDR_SIZE); + return 0; + } + + byte* ptr = start + 2; + *ptr++ = TRX_UNDO_RENAME_TABLE; + ptr += mach_u64_write_much_compressed(ptr, trx->undo_no); + ptr += mach_u64_write_much_compressed(ptr, table->id); + memcpy(ptr, table->name.m_name, len); + ptr += len; + mach_write_to_2(ptr, first_free); + mach_write_to_2(ptr_first_free, ptr + 2 - block->page.frame); + memcpy(start, ptr_first_free, 2); + mtr->undo_append(*block, start + 2, ptr - start - 2); + return first_free; +} + +/** Report a RENAME TABLE operation. +@param[in,out] trx transaction +@param[in] table table that is being renamed +@return DB_SUCCESS or error code */ +dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table) +{ + ut_ad(!trx->read_only); + ut_ad(trx->id); + ut_ad(!table->is_temporary()); + + mtr_t mtr; + dberr_t err; + mtr.start(); + if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) { + trx_undo_t* undo = trx->rsegs.m_redo.undo; + ut_ad(err == DB_SUCCESS); + ut_ad(undo); + for (ut_d(int loop_count = 0);;) { + ut_ad(loop_count++ < 2); + ut_ad(undo->last_page_no + == block->page.id().page_no()); + + if (uint16_t offset = trx_undo_page_report_rename( + trx, table, block, &mtr)) { + undo->top_page_no = undo->last_page_no; + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no++; + undo->guess_block = block; + ut_ad(!undo->empty()); + + err = DB_SUCCESS; + break; + } else { + mtr.commit(); + mtr.start(); + block = trx_undo_add_page(undo, &mtr, &err); + if (!block) { + break; + } + } + } + } + + mtr.commit(); + return err; +} + +TRANSACTIONAL_TARGET ATTRIBUTE_NOINLINE +/** @return whether the transaction holds an exclusive lock on a table */ +static bool trx_has_lock_x(const trx_t &trx, dict_table_t& table) +{ + if (table.is_temporary()) + return true; + + uint32_t n; + +#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC + if (xbegin()) + { + if (table.lock_mutex_is_locked()) + xabort(); + n= table.n_lock_x_or_s; + xend(); + } + else +#endif + { + table.lock_mutex_lock(); + n= table.n_lock_x_or_s; + table.lock_mutex_unlock(); + } + + /* This thread is executing trx. No other thread can modify our table locks + (only record locks might be created, in an implicit-to-explicit conversion). + Hence, no mutex is needed here. 
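trx_has_lock_x() above tries a hardware memory transaction first, aborts it if the table's lock mutex is currently held, and otherwise falls back to taking the mutex. A generic sketch of that lock-elision pattern with the Intel RTM intrinsics; the counter and flag names are invented, and the lowercase xbegin()/xend()/xabort() in the function above are MariaDB's own wrappers rather than the raw intrinsics:

#include <mutex>
#ifdef __RTM__
# include <immintrin.h>
#endif

// Shared state normally guarded by a mutex; the transactional path reads it
// without taking the mutex and aborts if the lock is currently held.
static std::mutex counter_mutex;
static unsigned counter;
static bool counter_mutex_locked;   // invented stand-in for lock_mutex_is_locked()

unsigned read_counter_elided()
{
#ifdef __RTM__
    if (_xbegin() == _XBEGIN_STARTED) {
        if (counter_mutex_locked)
            _xabort(0x01);          // a writer holds the mutex: retry via fallback
        unsigned n = counter;
        _xend();
        return n;
    }
#endif
    // Fallback path: taken when RTM is unavailable or the transaction aborted.
    std::lock_guard<std::mutex> g(counter_mutex);
    return counter;
}
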
*/ + if (n) + for (const lock_t *lock : trx.lock.table_locks) + if (lock && lock->type_mode == (LOCK_X | LOCK_TABLE)) + return true; + + return false; +} + +/***********************************************************************//** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. +@return DB_SUCCESS or error code */ +dberr_t +trx_undo_report_row_operation( +/*==========================*/ + que_thr_t* thr, /*!< in: query thread */ + dict_index_t* index, /*!< in: clustered index */ + const dtuple_t* clust_entry, /*!< in: in the case of an insert, + index entry to insert into the + clustered index; in updates, + may contain a clustered index + record tuple that also contains + virtual columns of the table; + otherwise, NULL */ + const upd_t* update, /*!< in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /*!< in: compiler info on secondary + index updates */ + const rec_t* rec, /*!< in: case of an update or delete + marking, the record in the clustered + index; NULL if insert */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the + undo log record */ +{ + trx_t* trx; +#ifdef UNIV_DEBUG + int loop_count = 0; +#endif /* UNIV_DEBUG */ + + ut_a(dict_index_is_clust(index)); + ut_ad(!update || rec); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!srv_read_only_mode); + + trx = thr_get_trx(thr); + /* This function must not be invoked during rollback + (of a TRX_STATE_PREPARE transaction or otherwise). */ + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(!trx->in_rollback); + + /* We must determine if this is the first time when this + transaction modifies this table. */ + auto m = trx->mod_tables.emplace(index->table, trx->undo_no); + ut_ad(m.first->second.valid(trx->undo_no)); + + if (m.second && index->table->is_active_ddl()) { + trx->apply_online_log= true; + } + + bool bulk = !rec; + + if (!bulk) { + /* An UPDATE or DELETE must not be covered by an + earlier start_bulk_insert(). */ + ut_ad(!m.first->second.is_bulk_insert()); + } else if (m.first->second.is_bulk_insert()) { + /* Above, the emplace() tried to insert an object with + !is_bulk_insert(). Only an explicit start_bulk_insert() + (below) can set the flag. */ + ut_ad(!m.second); + /* We already wrote a TRX_UNDO_EMPTY record. */ + ut_ad(thr->run_node); + ut_ad(que_node_get_type(thr->run_node) == QUE_NODE_INSERT); + ut_ad(trx->bulk_insert); + return DB_SUCCESS; + } else if (m.second && trx->bulk_insert + && trx_has_lock_x(*trx, *index->table)) { + m.first->second.start_bulk_insert(); + } else { + bulk = false; + } + + mtr_t mtr; + mtr.start(); + trx_undo_t** pundo; + trx_rseg_t* rseg; + const bool is_temp = index->table->is_temporary(); + + if (is_temp) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + + rseg = trx->get_temp_rseg(); + pundo = &trx->rsegs.m_noredo.undo; + } else { + ut_ad(!trx->read_only); + ut_ad(trx->id); + pundo = &trx->rsegs.m_redo.undo; + rseg = trx->rsegs.m_redo.rseg; + } + + dberr_t err; + buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo, + &err, &mtr); + trx_undo_t* undo = *pundo; + ut_ad((err == DB_SUCCESS) == (undo_block != NULL)); + if (UNIV_UNLIKELY(undo_block == NULL)) { +err_exit: + mtr.commit(); + return err; + } + + ut_ad(undo != NULL); + + do { + uint16_t offset = !rec + ? 
trx_undo_page_report_insert( + undo_block, trx, index, clust_entry, &mtr, + bulk) + : trx_undo_page_report_modify( + undo_block, trx, index, rec, offsets, update, + cmpl_info, clust_entry, &mtr); + + if (UNIV_UNLIKELY(offset == 0)) { + const uint16_t first_free = mach_read_from_2( + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->page.frame); + memset(undo_block->page.frame + first_free, 0, + (srv_page_size - FIL_PAGE_DATA_END) + - first_free); + + if (first_free + == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) { + /* The record did not fit on an empty + undo page. Discard the freshly allocated + page and return an error. */ + + /* When we remove a page from an undo + log, this is analogous to a + pessimistic insert in a B-tree, and we + must reserve the counterpart of the + tree latch, which is the rseg + mutex. We must commit the mini-transaction + first, because it may be holding lower-level + latches, such as SYNC_FSP_PAGE. */ + + mtr.commit(); + mtr.start(); + if (is_temp) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } + + rseg->latch.wr_lock(SRW_LOCK_CALL); + err = trx_undo_free_last_page(undo, &mtr); + rseg->latch.wr_unlock(); + + if (m.second) { + /* We are not going to modify + this table after all. */ + trx->mod_tables.erase(m.first); + } + + if (err == DB_SUCCESS) { + err = DB_UNDO_RECORD_TOO_BIG; + } + goto err_exit; + } else { + /* Write log for clearing the unused + tail of the undo page. It might + contain some garbage from a previously + written record, and mtr_t::write() + will optimize away writes of unchanged + bytes. Failure to write this caused a + recovery failure when we avoided + reading the undo log page from the + data file and initialized it based on + redo log records (which included the + write of the previous garbage). */ + mtr.memset(*undo_block, first_free, + srv_page_size - first_free + - FIL_PAGE_DATA_END, 0); + } + + mtr.commit(); + } else { + /* Success */ + undo->top_page_no = undo_block->page.id().page_no(); + mtr.commit(); + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no++; + undo->guess_block = undo_block; + ut_ad(!undo->empty()); + + if (!is_temp) { + trx_mod_table_time_t& time = m.first->second; + ut_ad(time.valid(undo->top_undo_no)); + + if (!time.is_versioned() + && index->table->versioned_by_id() + && (!rec /* INSERT */ + || (update + && update->affects_versioned()))) { + time.set_versioned(undo->top_undo_no); + } + } + + if (!bulk) { + *roll_ptr = trx_undo_build_roll_ptr( + !rec, trx_sys.rseg_id(rseg, !is_temp), + undo->top_page_no, offset); + } + + return(DB_SUCCESS); + } + + ut_ad(undo_block->page.id().page_no() == undo->last_page_no); + + /* We have to extend the undo log by one page */ + + ut_ad(++loop_count < 2); + mtr.start(); + + if (is_temp) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } + + undo_block = trx_undo_add_page(undo, &mtr, &err); + + DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure", + undo_block = NULL;); + } while (UNIV_LIKELY(undo_block != NULL)); + + if (err != DB_OUT_OF_FILE_SPACE) { + goto err_exit; + } + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + DB_OUT_OF_FILE_SPACE, + //ER_INNODB_UNDO_LOG_FULL, + "No more space left over in %s tablespace for allocating UNDO" + " log pages. Please add new data file to the tablespace or" + " check if filesystem is full or enable auto-extension for" + " the tablespace", + undo->rseg->space == fil_system.sys_space + ? "system" : is_temp ? 
"temporary" : "undo"); + + goto err_exit; +} + +/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ + +/** Copy an undo record to heap. +@param[in] roll_ptr roll pointer to a record that exists +@param[in,out] heap memory heap where copied */ +static +trx_undo_rec_t* +trx_undo_get_undo_rec_low( + roll_ptr_t roll_ptr, + mem_heap_t* heap) +{ + ulint rseg_id; + uint32_t page_no; + uint16_t offset; + bool is_insert; + mtr_t mtr; + + trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset); + ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO); + ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id]; + ut_ad(rseg->is_persistent()); + + mtr.start(); + + const buf_block_t* undo_page= + buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr); + + trx_undo_rec_t *undo_rec= undo_page + ? trx_undo_rec_copy(undo_page->page.frame + offset, heap) + : nullptr; + + mtr.commit(); + return undo_rec; +} + +/** Copy an undo record to heap, to check if a secondary index record +can be safely purged. +@param trx_id DB_TRX_ID corresponding to roll_ptr +@param name table name +@param roll_ptr DB_ROLL_PTR pointing to the undo log record +@param heap memory heap for allocation +@return copy of the record +@retval nullptr if the version is visible to purge_sys.view */ +static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id, + const table_name_t &name, + roll_ptr_t roll_ptr, + mem_heap_t* heap) +{ + { + purge_sys_t::view_guard check; + if (!check.view().changes_visible(trx_id)) + return trx_undo_get_undo_rec_low(roll_ptr, heap); + } + return nullptr; +} + +/** Copy an undo record to heap. +@param trx_id DB_TRX_ID corresponding to roll_ptr +@param name table name +@param roll_ptr DB_ROLL_PTR pointing to the undo log record +@param heap memory heap for allocation +@return copy of the record +@retval nullptr if the undo log is not available */ +static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id, + const table_name_t &name, + roll_ptr_t roll_ptr, + mem_heap_t *heap) +{ + { + purge_sys_t::end_view_guard check; + if (!check.view().changes_visible(trx_id)) + return trx_undo_get_undo_rec_low(roll_ptr, heap); + } + return nullptr; +} + +/** Build a previous version of a clustered index record. The caller +must hold a latch on the index page of the clustered index record. +@param rec version of a clustered index record +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param heap memory heap from which the memory needed is + allocated +@param old_vers previous version or NULL if rec is the + first inserted version, or if history data + has been deleted (an error), or if the purge + could have removed the version + though it has not yet done so +@param v_heap memory heap used to create vrow + dtuple if it is not yet created. This heap + diffs from "heap" above in that it could be + prebuilt->old_vers_heap for selection +@param v_row virtual column info, if any +@param v_status status determine if it is going into this + function by purge thread or not. 
+ And if we read "after image" of undo log +@param undo_block undo log block which was cached during + online dml apply or nullptr +@return error code +@retval DB_SUCCESS if previous version was successfully built, +or if it was an insert or the undo record refers to the table before rebuild +@retval DB_MISSING_HISTORY if the history is missing */ +TRANSACTIONAL_TARGET +dberr_t +trx_undo_prev_version_build( + const rec_t *rec, + dict_index_t *index, + rec_offs *offsets, + mem_heap_t *heap, + rec_t **old_vers, + mem_heap_t *v_heap, + dtuple_t **vrow, + ulint v_status) +{ + dtuple_t* entry; + trx_id_t rec_trx_id; + ulint type; + undo_no_t undo_no; + table_id_t table_id; + trx_id_t trx_id; + roll_ptr_t roll_ptr; + upd_t* update; + byte info_bits; + ulint cmpl_info; + bool dummy_extern; + byte* buf; + + ut_ad(!index->table->is_temporary()); + ut_ad(rec_offs_validate(rec, index, offsets)); + + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + + *old_vers = NULL; + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + /* The record rec is the first inserted version */ + return DB_SUCCESS; + } + + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + + ut_ad(!index->table->skip_alter_undo); + + trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY + ? trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name, + roll_ptr, heap) + : trx_undo_get_undo_rec(rec_trx_id, index->table->name, + roll_ptr, heap); + if (!undo_rec) { + return DB_MISSING_HISTORY; + } + + const byte *ptr = + trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + + if (table_id != index->table->id) { + /* The table should have been rebuilt, but purge has + not yet removed the undo log records for the + now-dropped old table (table_id). */ + return DB_SUCCESS; + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + /* (a) If a clustered index record version is such that the + trx id stamp in it is bigger than purge_sys.view, then the + BLOBs in that version are known to exist (the purge has not + progressed that far); + + (b) if the version is the first version such that trx id in it + is less than purge_sys.view, and it is not delete-marked, + then the BLOBs in that version are known to exist (the purge + cannot have purged the BLOBs referenced by that version + yet). + + This function does not fetch any BLOBs. The callers might, by + possibly invoking row_ext_create() via row_build(). However, + they should have all needed information in the *old_vers + returned by this function. This is because *old_vers is based + on the transaction undo log records. The function + trx_undo_page_fetch_ext() will write BLOB prefixes to the + transaction undo log that are at least as long as the longest + possible column prefix in a secondary index. Thus, secondary + index entries for *old_vers can be constructed without + dereferencing any BLOB pointers. */ + + ptr = trx_undo_rec_skip_row_ref(ptr, index); + + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, + heap, &update); + ut_a(ptr); + + if (row_upd_changes_field_size_or_external(index, offsets, update)) { + /* We should confirm the existence of disowned external data, + if the previous version record is delete marked. If the trx_id + of the previous record is seen by purge view, we should treat + it as missing history, because the disowned external data + might be purged already. 
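Both trx_undo_get_undo_rec_low() above and the roll_ptr handling at the start of trx_undo_prev_version_build() rely on the packed DB_ROLL_PTR format: 7 bytes holding an insert flag, a rollback segment id, an undo page number and a byte offset within that page. A self-contained sketch of packing and unpacking that layout; the helper names are invented, but the bit widths match what trx_undo_build_roll_ptr() uses:

#include <cassert>
#include <cstdint>

// 56-bit roll pointer: insert flag (1 bit) | rseg id (7) | page no (32) | offset (16).
uint64_t build_roll_ptr(bool is_insert, unsigned rseg_id,
                        uint32_t page_no, uint16_t offset)
{
    assert(rseg_id < 128);
    return uint64_t(is_insert) << 55 | uint64_t(rseg_id) << 48
         | uint64_t(page_no) << 16 | offset;
}

void decode_roll_ptr(uint64_t roll_ptr, bool* is_insert, unsigned* rseg_id,
                     uint32_t* page_no, uint16_t* offset)
{
    *is_insert = (roll_ptr >> 55) & 1;
    *rseg_id   = unsigned(roll_ptr >> 48) & 0x7f;
    *page_no   = uint32_t(roll_ptr >> 16);
    *offset    = uint16_t(roll_ptr);
}

int main()
{
    uint64_t p = build_roll_ptr(false, 3, 42, 0x1234);
    bool ins; unsigned rseg; uint32_t page; uint16_t off;
    decode_roll_ptr(p, &ins, &rseg, &page, &off);
    assert(!ins && rseg == 3 && page == 42 && off == 0x1234);
}
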
+ + The inherited external data (BLOBs) can be freed (purged) + after trx_id was committed, provided that no view was started + before trx_id. If the purge view can see the committed + delete-marked record by trx_id, no transactions need to access + the BLOB. */ + + if (update->info_bits & REC_INFO_DELETED_FLAG + && purge_sys.is_purgeable(trx_id)) { + return DB_SUCCESS; + } + + /* We have to set the appropriate extern storage bits in the + old version of the record: the extern bits in rec for those + fields that update does NOT update, as well as the bits for + those fields that update updates to become externally stored + fields. Store the info: */ + + entry = row_rec_to_index_entry(rec, index, offsets, heap); + /* The page containing the clustered index record + corresponding to entry is latched in mtr. Thus the + following call is safe. */ + if (!row_upd_index_replace_new_col_vals(entry, *index, update, + heap)) { + return (v_status & TRX_UNDO_PREV_IN_PURGE) + ? DB_MISSING_HISTORY : DB_CORRUPTION; + } + + /* Get number of externally stored columns in updated record */ + const ulint n_ext = index->is_primary() + ? dtuple_get_n_ext(entry) : 0; + + buf = static_cast<byte*>(mem_heap_alloc( + heap, rec_get_converted_size(index, entry, n_ext))); + + *old_vers = rec_convert_dtuple_to_rec(buf, index, + entry, n_ext); + } else { + buf = static_cast<byte*>(mem_heap_alloc( + heap, rec_offs_size(offsets))); + + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, true, offsets); + rec_set_bit_field_1(*old_vers, update->info_bits, + rec_offs_comp(offsets) + ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf)) { + /* There are no virtual columns in + a clustered index record. */ + continue; + } + const ulint n = uf->field_no; + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + ut_ad(!index->table->not_redundant()); + ulint l = rec_get_1byte_offs_flag(*old_vers) + ? (n + 1) : (n + 1) * 2; + byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES + - l; + *b= byte(*b | REC_1BYTE_SQL_NULL_MASK); + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + continue; + } + + ulint len; + memcpy(rec_get_nth_field(*old_vers, offsets, n, &len), + uf->new_val.data, uf->new_val.len); + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + ut_ad(uf->new_val.len + == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(*old_vers) + ? 
(n + 1) : (n + 1) * 2; + *(*old_vers - REC_N_OLD_EXTRA_BYTES - l) + &= byte(~REC_1BYTE_SQL_NULL_MASK); + } + } + } + + /* Set the old value (which is the after image of an update) in the + update vector to dtuple vrow */ + if (v_status & TRX_UNDO_GET_OLD_V_VALUE) { + row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update, + false, NULL, NULL); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_dbg); + ut_a(!rec_offs_any_null_extern( + *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg, + index->n_core_fields, + ULINT_UNDEFINED, &heap))); +#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + + if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + if (!(*vrow)) { + *vrow = dtuple_create_with_vcol( + v_heap ? v_heap : heap, + dict_table_get_n_cols(index->table), + dict_table_get_n_v_cols(index->table)); + dtuple_init_v_fld(*vrow); + } + + ut_ad(index->table->n_v_cols); + trx_undo_read_v_cols(index->table, ptr, *vrow, + v_status & TRX_UNDO_PREV_IN_PURGE); + } + + return DB_SUCCESS; +} + +/** Read virtual column value from undo log +@param[in] table the table +@param[in] ptr undo log pointer +@param[in,out] row the dtuple to fill +@param[in] in_purge whether this is called by purge */ +void +trx_undo_read_v_cols( + const dict_table_t* table, + const byte* ptr, + dtuple_t* row, + bool in_purge) +{ + const byte* end_ptr; + bool first_v_col = true; + bool is_undo_log = true; + + end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + while (ptr < end_ptr) { + dfield_t* dfield; + const byte* field; + uint32_t field_no, len, orig_len; + + field_no = mach_read_next_compressed( + const_cast<const byte**>(&ptr)); + + const bool is_virtual = (field_no >= REC_MAX_N_FIELDS); + + if (is_virtual) { + ptr = trx_undo_read_v_idx( + table, ptr, first_v_col, &is_undo_log, + &field_no); + first_v_col = false; + } + + ptr = trx_undo_rec_get_col_val( + ptr, &field, &len, &orig_len); + + /* The virtual column is no longer indexed or does not exist. + This needs to put after trx_undo_rec_get_col_val() so the + undo ptr advances */ + if (field_no == FIL_NULL) { + ut_ad(is_virtual); + continue; + } + + if (is_virtual) { + dict_v_col_t* vcol = dict_table_get_nth_v_col( + table, field_no); + + dfield = dtuple_get_nth_v_field(row, vcol->v_pos); + + if (!in_purge + || dfield_get_type(dfield)->mtype == DATA_MISSING) { + dict_col_copy_type( + &vcol->m_col, + dfield_get_type(dfield)); + dfield_set_data(dfield, field, len); + } + } + } + + ut_ad(ptr == end_ptr); +} diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc new file mode 100644 index 00000000..59c9a319 --- /dev/null +++ b/storage/innobase/trx/trx0roll.cc @@ -0,0 +1,927 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0roll.cc +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0roll.h" + +#include <my_service_manager.h> +#include <mysql/service_wsrep.h> + +#include "fsp0fsp.h" +#include "lock0lock.h" +#include "mach0data.h" +#include "pars0pars.h" +#include "que0que.h" +#include "row0mysql.h" +#include "row0undo.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "trx0rec.h" +#include "trx0rseg.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "trx0undo.h" + +#ifdef UNIV_PFS_THREAD +mysql_pfs_key_t trx_rollback_clean_thread_key; +#endif + +/** true if trx_rollback_all_recovered() thread is active */ +bool trx_rollback_is_active; + +/** In crash recovery, the current trx to be rolled back; NULL otherwise */ +const trx_t* trx_roll_crash_recv_trx; + +/** Finish transaction rollback. +@return whether the rollback was completed normally +@retval false if the rollback was aborted by shutdown */ +inline bool trx_t::rollback_finish() +{ + mod_tables.clear(); + apply_online_log= false; + if (UNIV_LIKELY(error_state == DB_SUCCESS)) + { + commit(); + return true; + } + + ut_a(error_state == DB_INTERRUPTED); + ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE); + ut_a(!srv_undo_sources); + ut_ad(srv_fast_shutdown); + ut_d(in_rollback= false); + if (trx_undo_t *&undo= rsegs.m_redo.undo) + { + UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo); + ut_free(undo); + undo= nullptr; + } + if (trx_undo_t *&undo= rsegs.m_noredo.undo) + { + UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo); + ut_free(undo); + undo= nullptr; + } + commit_low(); + commit_cleanup(); + return false; +} + +/** Roll back an active transaction. */ +inline void trx_t::rollback_low(trx_savept_t *savept) +{ + mem_heap_t *heap= mem_heap_create(512); + roll_node_t *roll_node= roll_node_create(heap); + roll_node->savept= savept; + + ut_ad(!in_rollback); +#ifdef UNIV_DEBUG + { + const auto s= state; + ut_ad(s == TRX_STATE_ACTIVE || + s == TRX_STATE_PREPARED || + s == TRX_STATE_PREPARED_RECOVERED); + if (savept) + { + ut_ad(s == TRX_STATE_ACTIVE); + ut_ad(mysql_thd); + ut_ad(!is_recovered); + } + } +#endif + + error_state = DB_SUCCESS; + + if (has_logged()) + { + ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg); + que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap, + nullptr); + ut_a(thr == que_fork_start_command(static_cast<que_fork_t*> + (que_node_get_parent(thr)))); + que_run_threads(thr); + que_run_threads(roll_node->undo_thr); + + /* Free the memory reserved by the undo graph. */ + que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent)); + } + + if (!savept) + { + rollback_finish(); + MONITOR_INC(MONITOR_TRX_ROLLBACK); + } + else + { + /* There must not be partial rollback if transaction was chosen as deadlock + victim. Galera transaction abort can be invoked during partial rollback. 
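The partial-rollback branch below walks trx_t::mod_tables and erases every entry whose changes are entirely undone by the rollback, using an iterator copy so that erasing does not invalidate the loop. A standalone sketch of that erase-while-iterating idiom; the map key/value types are simplified stand-ins (a table name and its first undo number) rather than dict_table_t* and trx_mod_table_time_t, and the rollback semantics are reduced to a comparison against the savepoint limit:

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// first_undo_no plays the role of trx_mod_table_time_t: the undo number of
// the first modification of that table within the transaction.
using ModTables = std::map<std::string, uint64_t /* first_undo_no */>;

// Drop every table whose first modification is at or after the savepoint,
// i.e. whose changes are entirely undone by the partial rollback.
void rollback_mod_tables(ModTables& mod_tables, uint64_t limit)
{
    for (auto i = mod_tables.begin(); i != mod_tables.end(); ) {
        auto j = i++;                 // j stays valid even if it is erased
        if (j->second >= limit)
            mod_tables.erase(j);
    }
}

int main()
{
    ModTables m{{"t1", 1}, {"t2", 7}, {"t3", 9}};
    rollback_mod_tables(m, 7);        // savepoint was taken at undo_no 7
    assert(m.size() == 1 && m.count("t1"));
}
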
*/ + ut_ad(!(lock.was_chosen_as_deadlock_victim & 1)); + ut_a(error_state == DB_SUCCESS); + const undo_no_t limit= savept->least_undo_no; + apply_online_log= false; + for (trx_mod_tables_t::iterator i= mod_tables.begin(); + i != mod_tables.end(); ) + { + trx_mod_tables_t::iterator j= i++; + ut_ad(j->second.valid()); + if (j->second.rollback(limit)) + mod_tables.erase(j); + else if (!apply_online_log) + apply_online_log= j->first->is_active_ddl(); + } + MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT); + } + + mem_heap_free(heap); +} + +/** Initiate rollback. +@param savept savepoint +@return error code or DB_SUCCESS */ +dberr_t trx_t::rollback(trx_savept_t *savept) +{ + ut_ad(!mutex_is_owner()); + if (state == TRX_STATE_NOT_STARTED) + { + error_state= DB_SUCCESS; + return DB_SUCCESS; + } + ut_ad(state == TRX_STATE_ACTIVE); +#ifdef WITH_WSREP + if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd)) + wsrep_handle_SR_rollback(nullptr, mysql_thd); +#endif /* WITH_WSREP */ + rollback_low(savept); + return error_state; +} + +/*******************************************************************//** +Rollback a transaction used in MySQL. +@return error code or DB_SUCCESS */ +static +dberr_t +trx_rollback_for_mysql_low( +/*=======================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "rollback"; + + /* If we are doing the XA recovery of prepared transactions, + then the transaction object does not have an InnoDB session + object, and we set a dummy session that we use for all MySQL + transactions. */ + + trx->rollback_low(); + + trx->op_info = ""; + + return(trx->error_state); +} + +/** Rollback a transaction used in MySQL +@param[in, out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t trx_rollback_for_mysql(trx_t* trx) +{ + /* We are reading trx->state without holding trx->mutex + here, because the rollback should be invoked for a running + active MySQL transaction (or recovered prepared transaction) + that is associated with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx->will_lock = false; + ut_ad(trx->mysql_thd); + /* Galera transaction abort can be invoked from MDL acquision + code, so trx->lock.was_chosen_as_deadlock_victim can be set + even if trx->state is TRX_STATE_NOT_STARTED. */ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); +#ifdef WITH_WSREP + trx->wsrep= false; + trx->lock.was_chosen_as_deadlock_victim= false; +#endif + return(DB_SUCCESS); + + case TRX_STATE_ACTIVE: + ut_ad(trx->mysql_thd); + ut_ad(!trx->is_recovered); + ut_ad(!trx->is_autocommit_non_locking() || trx->read_only); + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + ut_ad(!trx->is_autocommit_non_locking()); + if (trx->rsegs.m_redo.undo) { + /* The XA ROLLBACK of a XA PREPARE transaction + will consist of multiple mini-transactions. + + As the very first step of XA ROLLBACK, we must + change the undo log state back from + TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order + to ensure that recovery will complete the + rollback. + + Failure to perform this step could cause a + situation where we would roll back part of + a XA PREPARE transaction, the server would be + killed, and finally, the transaction would be + recovered in XA PREPARE state, with some of + the actions already having been rolled back. 
*/ + ut_ad(trx->rsegs.m_redo.undo->rseg + == trx->rsegs.m_redo.rseg); + mtr_t mtr; + mtr.start(); + if (trx_undo_t* undo = trx->rsegs.m_redo.undo) { + trx_undo_set_state_at_prepare(trx, undo, true, + &mtr); + } + /* Write the redo log for the XA ROLLBACK + state change to the global buffer. It is + not necessary to flush the redo log. If + a durable log write of a later mini-transaction + takes place for whatever reason, then this state + change will be durable as well. */ + mtr.commit(); + ut_ad(mtr.commit_lsn() > 0); + } + return(trx_rollback_for_mysql_low(trx)); + + case TRX_STATE_COMMITTED_IN_MEMORY: + ut_ad(!trx->is_autocommit_non_locking()); + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Rollback the latest SQL statement for MySQL. +@return error code or DB_SUCCESS */ +dberr_t +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + dberr_t err; + + /* We are reading trx->state without holding trx->mutex + here, because the statement rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->mysql_thd); + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + return(DB_SUCCESS); + + case TRX_STATE_ACTIVE: + ut_ad(trx->mysql_thd); + ut_ad(!trx->is_recovered); + ut_ad(!trx->is_autocommit_non_locking() || trx->read_only); + + trx->op_info = "rollback of SQL statement"; + + err = trx->rollback(&trx->last_sql_stat_start); + + if (trx->fts_trx != NULL) { + fts_savepoint_rollback_last_stmt(trx); + fts_savepoint_laststmt_refresh(trx); + } + + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + trx->end_bulk_insert(); + + trx->op_info = ""; + + return(err); + + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The statement rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. */ + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Search for a savepoint using name. +@return savepoint if found else NULL */ +static +trx_named_savept_t* +trx_savepoint_find( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + const char* name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; + + for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + savep != NULL; + savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) { + if (!strcmp(savep->name, name)) { + return(savep); + } + } + + return(NULL); +} + +/*******************************************************************//** +Frees a single savepoint struct. */ +static +void +trx_roll_savepoint_free( +/*=====================*/ + trx_t* trx, /*!< in: transaction handle */ + trx_named_savept_t* savep) /*!< in: savepoint to free */ +{ + UT_LIST_REMOVE(trx->trx_savepoints, savep); + + ut_free(savep->name); + ut_free(savep); +} + +/** Discard all savepoints starting from a particular savepoint. +@param savept first savepoint to discard */ +void trx_t::savepoints_discard(trx_named_savept_t *savept) +{ + while (savept) + { + auto next= UT_LIST_GET_NEXT(trx_savepoints, savept); + trx_roll_savepoint_free(this, savept); + savept= next; + } +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. 
Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +trx_rollback_to_savepoint_for_mysql_low( +/*====================================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_named_savept_t* savep, /*!< in/out: savepoint */ + int64_t* mysql_binlog_cache_pos) + /*!< out: the MySQL binlog + cache position corresponding + to this savepoint; MySQL needs + this information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + dberr_t err; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(trx->mysql_thd); + + /* Free all savepoints strictly later than savep. */ + + trx->savepoints_discard(UT_LIST_GET_NEXT(trx_savepoints, savep)); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx->rollback(&savep->savept); + + /* Store the current undo_no of the transaction so that + we know where to roll back if we have to roll back the + next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + return(err); +} + +/*******************************************************************//** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +dberr_t +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + + /* We are reading trx->state without holding trx->mutex + here, because the savepoint rollback should be invoked for a + running active MySQL transaction that is associated with the + current thread. */ + ut_ad(trx->mysql_thd); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep == NULL) { + return(DB_NO_SAVEPOINT); + } + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + ib::error() << "Transaction has a savepoint " + << savep->name + << " though it is not started"; + return(DB_ERROR); + + case TRX_STATE_ACTIVE: + + return(trx_rollback_to_savepoint_for_mysql_low( + trx, savep, mysql_binlog_cache_pos)); + + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The savepoint rollback is only allowed on an ACTIVE + transaction, not a PREPARED or COMMITTED one. 
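A savepoint here is essentially an undo-number watermark: least_undo_no records trx->undo_no at the moment the savepoint was set, rolling back to it undoes every undo record numbered at or above that watermark, and savepoints created later are discarded while the savepoint itself is kept. A minimal sketch of that bookkeeping with invented Trx and Savepoint stand-ins; the actual undo replay is performed by the query graph and is not modelled here:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct Savepoint { std::string name; uint64_t least_undo_no; };

struct Trx {
    uint64_t undo_no = 0;                 // number of undo records written so far
    std::vector<Savepoint> savepoints;    // in creation order

    void set_savepoint(const std::string& name)
    {
        savepoints.push_back({name, undo_no});
    }

    // Undo everything at or above the watermark and drop later savepoints.
    void rollback_to(const std::string& name)
    {
        for (std::size_t i = 0; i < savepoints.size(); i++) {
            if (savepoints[i].name != name)
                continue;
            undo_no = savepoints[i].least_undo_no;   // records >= this are undone
            savepoints.resize(i + 1);                // keep this one, drop later ones
            return;
        }
    }
};

int main()
{
    Trx trx;
    trx.undo_no = 5;
    trx.set_savepoint("a");
    trx.undo_no = 9;
    trx.set_savepoint("b");
    trx.undo_no = 12;
    trx.rollback_to("a");
    assert(trx.undo_no == 5 && trx.savepoints.size() == 1);
}
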
*/ + break; + } + + ut_error; + return(DB_CORRUPTION); +} + +/*******************************************************************//** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. +@return always DB_SUCCESS */ +dberr_t +trx_savepoint_for_mysql( +/*====================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name, /*!< in: savepoint name */ + int64_t binlog_cache_pos) /*!< in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + trx_start_if_not_started_xa(trx, false); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx->trx_savepoints, savep); + + ut_free(savep->name); + ut_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = static_cast<trx_named_savept_t*>( + ut_malloc_nokey(sizeof(*savep))); + + savep->name = mem_strdup(savepoint_name); + + savep->savept.least_undo_no = trx->undo_no; + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx->trx_savepoints, savep); + + trx->end_bulk_insert(); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Releases only the named savepoint. Savepoints which were set after this +savepoint are left as is. +@return if no savepoint of the name found then DB_NO_SAVEPOINT, +otherwise DB_SUCCESS */ +dberr_t +trx_release_savepoint_for_mysql( +/*============================*/ + trx_t* trx, /*!< in: transaction handle */ + const char* savepoint_name) /*!< in: savepoint name */ +{ + trx_named_savept_t* savep; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true) + || trx_state_eq(trx, TRX_STATE_PREPARED, true)); + ut_ad(trx->mysql_thd); + + savep = trx_savepoint_find(trx, savepoint_name); + + if (savep != NULL) { + trx_roll_savepoint_free(trx, savep); + } + + return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT); +} + +/*******************************************************************//** +Roll back an active transaction. 
*/ +static +void +trx_rollback_active( +/*================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + roll_node_t* roll_node; + const trx_id_t trx_id = trx->id; + + ut_ad(trx_id); + + heap = mem_heap_create(512); + + fork = que_fork_create(heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap, NULL); + + roll_node = roll_node_create(heap); + + thr->child = roll_node; + roll_node->common.parent = thr; + + trx->graph = fork; + + ut_a(thr == que_fork_start_command(fork)); + + trx_roll_crash_recv_trx = trx; + + const bool dictionary_locked = trx->dict_operation; + + if (dictionary_locked) { + row_mysql_lock_data_dictionary(trx); + } + + que_run_threads(thr); + ut_a(roll_node->undo_thr != NULL); + + que_run_threads(roll_node->undo_thr); + + que_graph_free( + static_cast<que_t*>(roll_node->undo_thr->common.parent)); + + if (UNIV_UNLIKELY(!trx->rollback_finish())) { + ut_ad(!dictionary_locked); + } else { + ib::info() << "Rolled back recovered transaction " << trx_id; + } + + if (dictionary_locked) { + row_mysql_unlock_data_dictionary(trx); + } + + mem_heap_free(heap); + + trx_roll_crash_recv_trx = NULL; +} + + +struct trx_roll_count_callback_arg +{ + uint32_t n_trx; + uint64_t n_rows; + trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {} +}; + + +static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element, + trx_roll_count_callback_arg *arg) +{ + element->mutex.wr_lock(); + if (trx_t *trx= element->trx) + { + if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE)) + { + arg->n_trx++; + arg->n_rows+= trx->undo_no; + } + } + element->mutex.wr_unlock(); + return 0; +} + +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress() +{ + time_t now = time(NULL); + mysql_mutex_lock(&recv_sys.mutex); + bool report = recv_sys.report(now); + mysql_mutex_unlock(&recv_sys.mutex); + + if (report) { + trx_roll_count_callback_arg arg; + + /* Get number of recovered active transactions and number of + rows they modified. Numbers must be accurate, because only this + thread is allowed to touch recovered transactions. */ + trx_sys.rw_trx_hash.iterate_no_dups( + trx_roll_count_callback, &arg); + + if (arg.n_rows > 0) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "To roll back: " UINT32PF " transactions, " + UINT64PF " rows", arg.n_trx, arg.n_rows); + } + + ib::info() << "To roll back: " << arg.n_trx + << " transactions, " << arg.n_rows << " rows"; + + } +} + + +static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element, + std::vector<trx_t*> *trx_list) +{ + element->mutex.wr_lock(); + if (trx_t *trx= element->trx) + { + trx->mutex_lock(); + if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered) + trx_list->push_back(trx); + trx->mutex_unlock(); + } + element->mutex.wr_unlock(); + return 0; +} + +/** + Rollback any incomplete transactions which were encountered in crash recovery. + + If the transaction already was committed, then we clean up a possible insert + undo log. If the transaction was not yet committed, then we roll it back. + + Note: For XA recovered transactions, we rely on MySQL to + do rollback. They will be in TRX_STATE_PREPARED state. If the server + is shutdown and they are still lingering in trx_sys_t::trx_list + then the shutdown will hang. 
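trx_rollback_recovered() below first collects the recovered ACTIVE transactions into a plain vector, holding only the per-element mutexes of trx_sys.rw_trx_hash while doing so, and then rolls them back one by one without any hash latch held. A simplified sketch of that snapshot-then-process pattern; it uses a single mutex and invented names in place of the lock-free transaction hash, so it illustrates the shape of the code rather than its exact synchronisation:

#include <cassert>
#include <mutex>
#include <vector>

struct Trx { bool recovered_active; };

static std::mutex registry_mutex;
static std::vector<Trx*> registry;          // stand-in for trx_sys.rw_trx_hash

// Phase 1: snapshot the matching transactions while holding the mutex.
// Phase 2: roll each one back with the mutex released, because rollback can
// take a long time and must not block other users of the registry.
void rollback_recovered()
{
    std::vector<Trx*> victims;
    {
        std::lock_guard<std::mutex> g(registry_mutex);
        for (Trx* t : registry)
            if (t->recovered_active)
                victims.push_back(t);
    }
    for (Trx* t : victims)
        t->recovered_active = false;        // stand-in for trx_rollback_active()
}

int main()
{
    Trx a{true}, b{false};
    registry = {&a, &b};
    rollback_recovered();
    assert(!a.recovered_active && !b.recovered_active);
}
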
+ + @param[in] all true=roll back all recovered active transactions; + false=roll back any incomplete dictionary transaction +*/ + +void trx_rollback_recovered(bool all) +{ + std::vector<trx_t*> trx_list; + + ut_a(srv_force_recovery < + ulong(all ? SRV_FORCE_NO_TRX_UNDO : SRV_FORCE_NO_DDL_UNDO)); + + /* + Collect list of recovered ACTIVE transaction ids first. Once collected, no + other thread is allowed to modify or remove these transactions from + rw_trx_hash. + */ + trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback, + &trx_list); + + while (!trx_list.empty()) + { + trx_t *trx= trx_list.back(); + trx_list.pop_back(); + + ut_ad(trx); + ut_d(trx->mutex_lock()); + ut_ad(trx->is_recovered); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_d(trx->mutex_unlock()); + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources && + srv_fast_shutdown) + goto discard; + + if (all || trx->dict_operation || trx->has_stats_table_lock()) + { + trx_rollback_active(trx); + if (trx->error_state != DB_SUCCESS) + { + ut_ad(trx->error_state == DB_INTERRUPTED); + trx->error_state= DB_SUCCESS; + ut_ad(!srv_undo_sources); + ut_ad(srv_fast_shutdown); +discard: + /* Note: before kill_server() invoked innobase_end() via + unireg_end(), it invoked close_connections(), which should initiate + the rollback of any user transactions via THD::cleanup() in the + connection threads, and wait for all THD::cleanup() to complete. + So, no active user transactions should exist at this point. + + srv_undo_sources=false was cleared early in innobase_end(). + + Generally, the server guarantees that all connections using + InnoDB must be disconnected by the time we are reaching this code, + be it during shutdown or UNINSTALL PLUGIN. + + Because there is no possible race condition with any + concurrent user transaction, we do not have to invoke + trx->commit_state() or wait for !trx->is_referenced() + before trx_sys.deregister_rw(trx). */ + trx_sys.deregister_rw(trx); + trx_free_at_shutdown(trx); + } + else + trx->free(); + } + } +} + +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. */ +void trx_rollback_all_recovered(void*) +{ + ut_ad(!srv_read_only_mode); + + if (trx_sys.rw_trx_hash.size()) { + ib::info() << "Starting in background the rollback of" + " recovered transactions"; + trx_rollback_recovered(true); + ib::info() << "Rollback of non-prepared transactions" + " completed"; + } + + trx_rollback_is_active = false; +} + +/****************************************************************//** +Builds an undo 'query' graph for a transaction. The actual rollback is +performed by executing this query graph like a query subprocedure call. +The reply about the completion of the rollback will be sent by this +graph. 
+@return own: the query graph */ +static +que_t* +trx_roll_graph_build( +/*=================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + + ut_ad(trx->mutex_is_owner()); + heap = mem_heap_create(512); + fork = que_fork_create(heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap, NULL); + + thr->child = row_undo_node_create(trx, thr, heap); + + return(fork); +} + +/*********************************************************************//** +Starts a rollback operation, creates the UNDO graph that will do the +actual undo operation. +@return query graph thread that will perform the UNDO operations. */ +static +que_thr_t* +trx_rollback_start( +/*===============*/ + trx_t* trx, /*!< in: transaction */ + undo_no_t roll_limit) /*!< in: rollback to undo no (for + partial undo), 0 if we are rolling back + the entire transaction */ +{ + /* Initialize the rollback field in the transaction */ + + ut_ad(trx->mutex_is_owner()); + ut_ad(!trx->roll_limit); + ut_ad(!trx->in_rollback); + + trx->roll_limit = roll_limit; + trx->in_rollback = true; + + ut_a(trx->roll_limit <= trx->undo_no); + + trx->pages_undone = 0; + + /* Build a 'query' graph which will perform the undo operations */ + + que_t* roll_graph = trx_roll_graph_build(trx); + + trx->graph = roll_graph; + + return(que_fork_start_command(roll_graph)); +} + +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + roll_node_t* node; + + node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node))); + + node->state = ROLL_NODE_SEND; + + node->common.type = QUE_NODE_ROLLBACK; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + roll_node_t* node; + + node = static_cast<roll_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = ROLL_NODE_SEND; + } + + if (node->state == ROLL_NODE_SEND) { + trx_t* trx; + ib_id_t roll_limit; + + trx = thr_get_trx(thr); + + node->state = ROLL_NODE_WAIT; + + ut_a(node->undo_thr == NULL); + + roll_limit = node->savept ? node->savept->least_undo_no : 0; + + trx->mutex_lock(); + + trx_commit_or_rollback_prepare(trx); + + node->undo_thr = trx_rollback_start(trx, roll_limit); + + trx->mutex_unlock(); + } else { + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc new file mode 100644 index 00000000..760c4e70 --- /dev/null +++ b/storage/innobase/trx/trx0rseg.cc @@ -0,0 +1,713 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0rseg.cc +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rseg.h" +#include "trx0undo.h" +#include "fut0lst.h" +#include "srv0srv.h" +#include "trx0purge.h" +#include "srv0mon.h" + +#ifdef WITH_WSREP +# include <mysql/service_wsrep.h> + +/** The offset to WSREP XID headers, after TRX_RSEG */ +# define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512 + +/** WSREP XID format (1 if present and valid, 0 if not present) */ +# define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO +/** WSREP XID GTRID length */ +# define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4 +/** WSREP XID bqual length */ +# define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8 +/** WSREP XID data (XIDDATASIZE bytes) */ +# define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12 + +# ifdef UNIV_DEBUG +/** The latest known WSREP XID sequence number */ +static long long wsrep_seqno = -1; +# endif /* UNIV_DEBUG */ +/** The latest known WSREP XID UUID */ +static unsigned char wsrep_uuid[16]; + +/** Write the WSREP XID information into rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini transaction */ +static void +trx_rseg_write_wsrep_checkpoint( + buf_block_t* rseg_header, + const XID* xid, + mtr_t* mtr) +{ + DBUG_ASSERT(xid->gtrid_length >= 0); + DBUG_ASSERT(xid->bqual_length >= 0); + DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE); + + mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header, + TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT + + rseg_header->page.frame, + uint32_t(xid->formatID)); + + mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header, + TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN + + rseg_header->page.frame, + uint32_t(xid->gtrid_length)); + + mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header, + TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN + + rseg_header->page.frame, + uint32_t(xid->bqual_length)); + + const ulint xid_length = static_cast<ulint>(xid->gtrid_length + + xid->bqual_length); + mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header, + TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + + rseg_header->page.frame, + xid->data, xid_length); + if (xid_length < XIDDATASIZE + && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + + rseg_header->page.frame, field_ref_zero, + XIDDATASIZE - xid_length)) { + mtr->memset(rseg_header, + TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length, + XIDDATASIZE - xid_length, 0); + } +} + +/** Update the WSREP XID information in rollback segment header. 
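+In debug builds, this also asserts that the WSREP XID sequence number is
+monotonically increasing as long as the cluster UUID stays the same.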
+@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + buf_block_t* rseg_header, + const XID* xid, + mtr_t* mtr) +{ + ut_ad(wsrep_is_wsrep_xid(xid)); + +#ifdef UNIV_DEBUG + /* Check that seqno is monotonically increasing */ + long long xid_seqno = wsrep_xid_seqno(xid); + const byte* xid_uuid = wsrep_xid_uuid(xid); + + if (xid_seqno != -1 + && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) { + ut_ad(xid_seqno > wsrep_seqno); + } else { + memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid); + } + wsrep_seqno = xid_seqno; +#endif /* UNIV_DEBUG */ + trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr); +} + +static dberr_t trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr) +{ + dberr_t err; + buf_block_t *rseg_header = trx_sys.rseg_array[0].get(mtr, &err); + + if (UNIV_UNLIKELY(!rseg_header)) + return err; + + /* We must make check against wsrep_uuid here, the + trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with xid + contents in debug mode and the memcmp() will never give nonzero + result. */ + const bool must_clear_rsegs= + memcmp(wsrep_uuid, wsrep_xid_uuid(xid), sizeof wsrep_uuid); + + if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rseg_header->page.frame))) + trx_rseg_format_upgrade(rseg_header, mtr); + + trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr); + + if (must_clear_rsegs) + /* Because the UUID part of the WSREP XID differed from + current_xid_uuid, the WSREP group UUID was changed, and we must + reset the XID in all rollback segment headers. */ + for (ulint rseg_id= 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) + if (buf_block_t* block= trx_sys.rseg_array[rseg_id].get(mtr, &err)) + mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO, + TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - + TRX_RSEG_WSREP_XID_INFO, 0); + return err; +} + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid) +{ + mtr_t mtr; + mtr.start(); + trx_rseg_update_wsrep_checkpoint(xid, &mtr); + mtr.commit(); +} + +/** Read the WSREP XID information in rollback segment header. +@param[in] rseg_header Rollback segment header +@param[out] xid Transaction XID +@return whether the WSREP XID was present */ +static +bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid) +{ + int formatID = static_cast<int>( + mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT + + rseg_header->page.frame)); + if (formatID == 0) { + return false; + } + + xid.formatID = formatID; + xid.gtrid_length = static_cast<int>( + mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN + + rseg_header->page.frame)); + + xid.bqual_length = static_cast<int>( + mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN + + rseg_header->page.frame)); + + memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + + rseg_header->page.frame, XIDDATASIZE); + + return true; +} + +/** Read the WSREP XID from the TRX_SYS page (in case of upgrade). 
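+A valid XID is found there only if the data files were written by a version
+that stored the WSREP XID in the TRX_SYS page rather than in the rollback
+segment headers.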
+@param[in] page TRX_SYS page +@param[out] xid WSREP XID (if present) +@return whether the WSREP XID is present */ +static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid) +{ + if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD + + page) + != TRX_SYS_WSREP_XID_MAGIC_N) { + return false; + } + + xid.formatID = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_FORMAT + page)); + xid.gtrid_length = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_GTRID_LEN + page)); + xid.bqual_length = static_cast<int>( + mach_read_from_4( + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_BQUAL_LEN + page)); + memcpy(xid.data, + TRX_SYS + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE); + return true; +} + +/** Recover the latest WSREP checkpoint XID. +@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid) +{ + mtr_t mtr; + long long max_xid_seqno = -1; + bool found = false; + + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; + rseg_id++, mtr.commit()) { + mtr.start(); + const buf_block_t* sys = trx_sysf_get(&mtr, false); + if (UNIV_UNLIKELY(!sys)) { + continue; + } + const uint32_t page_no = trx_sysf_rseg_get_page_no( + sys, rseg_id); + + if (page_no == FIL_NULL) { + continue; + } + + const buf_block_t* rseg_header = buf_page_get_gen( + page_id_t(trx_sysf_rseg_get_space(sys, rseg_id), + page_no), + 0, RW_S_LATCH, nullptr, BUF_GET, &mtr); + + if (!rseg_header) { + continue; + } + + if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + + rseg_header->page.frame)) { + continue; + } + + XID tmp_xid; + long long tmp_seqno = 0; + if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid) + && (tmp_seqno = wsrep_xid_seqno(&tmp_xid)) + > max_xid_seqno) { + found = true; + max_xid_seqno = tmp_seqno; + xid = tmp_xid; + memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid), + sizeof wsrep_uuid); + } + } + + return found; +} +#endif /* WITH_WSREP */ + +buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const +{ + if (!space) + { + if (err) *err= DB_TABLESPACE_NOT_FOUND; + return nullptr; + } + return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr, + BUF_GET, mtr, err); +} + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr) +{ + mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0); + /* Clear also possible garbage at the end of the page. Old + InnoDB versions did not initialize unused parts of pages. */ + mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8, + srv_page_size + - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8), + 0); +} + +/** Create a rollback segment header. 
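+The history list of the new header is initialized as empty and all undo log
+slots are reset to unused (0xff).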
+@param[in,out] space system, undo, or temporary tablespace +@param[in] rseg_id rollback segment identifier +@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID +@param[in,out] mtr mini-transaction +@param[out] err error code +@return the created rollback segment +@retval nullptr on failure */ +buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id, + trx_id_t max_trx_id, mtr_t *mtr, + dberr_t *err) +{ + ut_ad(mtr->memo_contains(*space)); + buf_block_t *block= + fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr, err); + if (block) + { + ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG + + block->page.frame)); + ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG + + block->page.frame)); + ut_ad(0 == mach_read_from_4(TRX_RSEG_MAX_TRX_ID + TRX_RSEG + + block->page.frame)); + + /* Initialize the history list */ + flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr); + + mtr->write<8,mtr_t::MAYBE_NOP>(*block, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + + block->page.frame, max_trx_id); + + /* Reset the undo log slots */ + mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG, TRX_RSEG_N_SLOTS * 4, + 0xff); + } + return block; +} + +void trx_rseg_t::destroy() +{ + latch.destroy(); + + /* There can't be any active transactions. */ + ut_a(!UT_LIST_GET_LEN(undo_list)); + + for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo; + undo= next) + { + next= UT_LIST_GET_NEXT(undo_list, undo); + UT_LIST_REMOVE(undo_cached, undo); + ut_free(undo); + } +} + +void trx_rseg_t::init(fil_space_t *space, uint32_t page) +{ + latch.SRW_LOCK_INIT(trx_rseg_latch_key); + ut_ad(!this->space); + this->space= space; + page_no= page; + last_page_no= FIL_NULL; + curr_size= 1; + + UT_LIST_INIT(undo_list, &trx_undo_t::undo_list); + UT_LIST_INIT(undo_cached, &trx_undo_t::undo_list); +} + +void trx_rseg_t::reinit(uint32_t page) +{ + ut_ad(is_persistent()); + ut_ad(page_no == page); + ut_a(!UT_LIST_GET_LEN(undo_list)); + ut_ad(!history_size || UT_LIST_GET_FIRST(undo_cached)); + + history_size= 0; + page_no= page; + + for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo; + undo= next) + { + next= UT_LIST_GET_NEXT(undo_list, undo); + UT_LIST_REMOVE(undo_cached, undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + ut_free(undo); + } + + ut_ad(!is_referenced()); + clear_needs_purge(); + last_commit_and_offset= 0; + last_page_no= FIL_NULL; + curr_size= 1; +} + +/** Read the undo log lists. +@param[in,out] rseg rollback segment +@param[in,out] max_trx_id maximum observed transaction identifier +@param[in] rseg_header rollback segment header +@return error code */ +static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id, + const buf_block_t *rseg_header) +{ + ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); + + for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++) + { + uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i); + if (page_no != FIL_NULL) + { + const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no, + max_trx_id); + if (!undo) + return DB_CORRUPTION; + rseg->curr_size+= undo->size; + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + } + } + + return DB_SUCCESS; +} + +/** Restore the state of a persistent rollback segment. 
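+Reads the rollback segment header page and updates the in-memory rollback
+segment object, the recovered binlog and WSREP checkpoint information, and
+the purge queue accordingly.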
+@param[in,out] rseg persistent rollback segment +@param[in,out] max_trx_id maximum observed transaction identifier +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id, + mtr_t *mtr) +{ + if (!rseg->space) + return DB_TABLESPACE_NOT_FOUND; + dberr_t err; + const buf_block_t *rseg_hdr= + buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr, + &err); + if (!rseg_hdr) + return err; + + if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->page.frame)) + { + trx_id_t id= mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID + + rseg_hdr->page.frame); + + if (id > max_trx_id) + max_trx_id= id; + + const byte *binlog_name= + TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame; + if (*binlog_name) + { + lsn_t lsn= mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + rseg_hdr->page.frame)); + static_assert(TRX_RSEG_BINLOG_NAME_LEN == + sizeof trx_sys.recovered_binlog_filename, "compatibility"); + if (lsn > trx_sys.recovered_binlog_lsn) + { + trx_sys.recovered_binlog_lsn= lsn; + trx_sys.recovered_binlog_offset= + mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET + + rseg_hdr->page.frame); + memcpy(trx_sys.recovered_binlog_filename, binlog_name, + TRX_RSEG_BINLOG_NAME_LEN); + } + +#ifdef WITH_WSREP + trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid); +#endif + } + } + + if (srv_operation == SRV_OPERATION_RESTORE) + /* mariabackup --prepare only deals with + the redo log and the data files, not with + transactions or the data dictionary. */ + return DB_SUCCESS; + + /* Initialize the undo log lists according to the rseg header */ + + rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rseg_hdr->page.frame) + 1; + err= trx_undo_lists_init(rseg, max_trx_id, rseg_hdr); + if (err != DB_SUCCESS); + else if (auto len= flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY + + rseg_hdr->page.frame)) + { + rseg->history_size+= len; + + fil_addr_t node_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY + + rseg_hdr->page.frame); + node_addr.boffset= static_cast<uint16_t>(node_addr.boffset - + TRX_UNDO_HISTORY_NODE); + rseg->last_page_no= node_addr.page; + + const buf_block_t* block= + buf_page_get_gen(page_id_t(rseg->space->id, node_addr.page), + 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err); + if (!block) + return err; + + trx_id_t id= mach_read_from_8(block->page.frame + node_addr.boffset + + TRX_UNDO_TRX_ID); + if (id > max_trx_id) + max_trx_id= id; + id= mach_read_from_8(block->page.frame + node_addr.boffset + + TRX_UNDO_TRX_NO); + if (id > max_trx_id) + max_trx_id= id; + + rseg->set_last_commit(node_addr.boffset, id); + unsigned purge= mach_read_from_2(block->page.frame + node_addr.boffset + + TRX_UNDO_NEEDS_PURGE); + ut_ad(purge <= 1); + if (purge != 0) + rseg->set_needs_purge(); + + if (rseg->last_page_no != FIL_NULL) + /* There is no need to cover this operation by the purge + mutex because we are still bootstrapping. */ + purge_sys.purge_queue.push(*rseg); + } + + return err; +} + +/** Read binlog metadata from the TRX_SYS page, in case we are upgrading +from MySQL or a MariaDB version older than 10.3.5. 
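+@param[in]	page	TRX_SYS page frame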
*/ +static void trx_rseg_init_binlog_info(const page_t* page) +{ + if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD + + page) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + memcpy(trx_sys.recovered_binlog_filename, + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME + + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN); + trx_sys.recovered_binlog_offset = mach_read_from_8( + TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET + + TRX_SYS + page); + } + +#ifdef WITH_WSREP + trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid); +#endif +} + +/** Initialize or recover the rollback segments at startup. */ +dberr_t trx_rseg_array_init() +{ + trx_id_t max_trx_id = 0; + + *trx_sys.recovered_binlog_filename = '\0'; + trx_sys.recovered_binlog_offset = 0; +#ifdef WITH_WSREP + trx_sys.recovered_wsrep_xid.null(); + XID wsrep_sys_xid; + wsrep_sys_xid.null(); + bool wsrep_xid_in_rseg_found = false; +#endif + mtr_t mtr; + dberr_t err = DB_SUCCESS; + + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + mtr.start(); + if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) { + if (rseg_id == 0) { + /* In case this is an upgrade from + before MariaDB 10.3.5, fetch the base + information from the TRX_SYS page. */ + max_trx_id = mach_read_from_8( + TRX_SYS + TRX_SYS_TRX_ID_STORE + + sys->page.frame); + trx_rseg_init_binlog_info(sys->page.frame); +#ifdef WITH_WSREP + wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid); +#endif + } + + const uint32_t page_no = trx_sysf_rseg_get_page_no( + sys, rseg_id); + if (page_no != FIL_NULL) { + trx_rseg_t& rseg = trx_sys.rseg_array[rseg_id]; + rseg.init(fil_space_get( + trx_sysf_rseg_get_space( + sys, rseg_id)), + page_no); + ut_ad(rseg.is_persistent()); + if ((err = trx_rseg_mem_restore( + &rseg, max_trx_id, &mtr)) + != DB_SUCCESS) { + mtr.commit(); + break; + } +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null() && + !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) { + wsrep_xid_in_rseg_found = true; + ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid), + wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid), + sizeof wsrep_uuid) + || wsrep_xid_seqno( + &wsrep_sys_xid) + <= wsrep_xid_seqno( + &trx_sys.recovered_wsrep_xid)); + } +#endif + } + } + + mtr.commit(); + } + + if (err != DB_SUCCESS) { + for (auto& rseg : trx_sys.rseg_array) { + while (auto u = UT_LIST_GET_FIRST(rseg.undo_list)) { + UT_LIST_REMOVE(rseg.undo_list, u); + ut_free(u); + } + } + return err; + } + +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null()) { + /* Upgrade from a version prior to 10.3.5, + where WSREP XID was stored in TRX_SYS page. + If no rollback segment has a WSREP XID set, + we must copy the XID found in TRX_SYS page + to rollback segments. */ + mtr.start(); + + if (!wsrep_xid_in_rseg_found) { + trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr); + } + + /* Finally, clear WSREP XID in TRX_SYS page. */ + mtr.memset(trx_sysf_get(&mtr), + TRX_SYS + TRX_SYS_WSREP_XID_INFO, + TRX_SYS_WSREP_XID_LEN, 0); + mtr.commit(); + } +#endif + + trx_sys.init_max_trx_id(max_trx_id + 1); + return DB_SUCCESS; +} + +/** Create the temporary rollback segments. 
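+@param[in,out]	mtr	mini-transaction
+@return error code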
*/ +dberr_t trx_temp_rseg_create(mtr_t *mtr) +{ + for (ulong i= 0; i < array_elements(trx_sys.temp_rsegs); i++) + { + mtr->start(); + mtr->set_log_mode(MTR_LOG_NO_REDO); + mtr->x_lock_space(fil_system.temp_space); + dberr_t err; + buf_block_t *rblock= + trx_rseg_header_create(fil_system.temp_space, i, 0, mtr, &err); + if (UNIV_UNLIKELY(!rblock)) + { + mtr->commit(); + return err; + } + trx_sys.temp_rsegs[i].init(fil_system.temp_space, + rblock->page.id().page_no()); + mtr->commit(); + } + return DB_SUCCESS; +} + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] trx committing transaction +@param[in,out] mtr mini-transaction */ +void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx, + mtr_t *mtr) +{ + DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset); + + const size_t len = strlen(trx->mysql_log_file_name) + 1; + + ut_ad(len > 1); + + if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) { + return; + } + + mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header, + TRX_RSEG + TRX_RSEG_BINLOG_OFFSET + + rseg_header->page.frame, + trx->mysql_log_offset); + + void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->page.frame; + + if (memcmp(trx->mysql_log_file_name, name, len)) { + mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len); + } +} diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc new file mode 100644 index 00000000..d344f3a0 --- /dev/null +++ b/storage/innobase/trx/trx0sys.cc @@ -0,0 +1,357 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0sys.cc +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" +#include "mysqld.h" +#include "sql_error.h" + +#include "fsp0fsp.h" +#include "mtr0log.h" +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "log0log.h" +#include "log0recv.h" +#include "os0file.h" + +/** The transaction system */ +trx_sys_t trx_sys; + +#ifdef UNIV_DEBUG +/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ +uint trx_rseg_n_slots_debug = 0; +#endif + +/** Display the MySQL binlog offset info if it is present in the trx +system header. 
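+Nothing is printed if no binlog position has been recovered.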
*/ +void +trx_sys_print_mysql_binlog_offset() +{ + if (!*trx_sys.recovered_binlog_filename) { + return; + } + + ib::info() << "Last binlog file '" + << trx_sys.recovered_binlog_filename + << "', position " + << trx_sys.recovered_binlog_offset; +} + +/** Find an available rollback segment. +@param[in] sys_header +@return an unallocated rollback segment slot in the TRX_SYS header +@retval ULINT_UNDEFINED if not found */ +ulint +trx_sys_rseg_find_free(const buf_block_t* sys_header) +{ + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + if (trx_sysf_rseg_get_page_no(sys_header, rseg_id) + == FIL_NULL) { + return rseg_id; + } + } + + return(ULINT_UNDEFINED); +} + +/** Count the number of initialized persistent rollback segment slots. */ +static +void +trx_sysf_get_n_rseg_slots() +{ + mtr_t mtr; + mtr.start(); + + srv_available_undo_logs = 0; + if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) { + for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { + srv_available_undo_logs + += trx_sysf_rseg_get_page_no(sys_header, + rseg_id) + != FIL_NULL; + } + } + + mtr.commit(); +} + +/** Initialize the transaction system when creating the database. */ +dberr_t trx_sys_create_sys_pages(mtr_t *mtr) +{ + mtr->start(); + mtr->x_lock_space(fil_system.sys_space); + static_assert(TRX_SYS_SPACE == 0, "compatibility"); + + /* Create the trx sys file block in a new allocated file segment */ + dberr_t err; + buf_block_t *block= fseg_create(fil_system.sys_space, + TRX_SYS + TRX_SYS_FSEG_HEADER, mtr, &err); + if (UNIV_UNLIKELY(!block)) + { + error: + mtr->commit(); + return err; + } + ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO)); + + mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_TYPE_TRX_SYS); + + /* Reset the rollback segment slots. Old versions of InnoDB + (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect + that the whole array is initialized. 
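+	Unused slots are filled with 0xff bytes, so that their page numbers
+	read as FIL_NULL.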
*/ + static_assert(256 >= TRX_SYS_N_RSEGS, ""); + static_assert(TRX_SYS + TRX_SYS_RSEGS + 256 * TRX_SYS_RSEG_SLOT_SIZE <= + UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END, ""); + mtr->write<4>(*block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + block->page.frame, FSP_FIRST_RSEG_PAGE_NO); + mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SLOT_SIZE, + 255 * TRX_SYS_RSEG_SLOT_SIZE, 0xff); + + buf_block_t *r= trx_rseg_header_create(fil_system.sys_space, 0, 0, + mtr, &err); + if (UNIV_UNLIKELY(!r)) + goto error; + ut_a(r->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO)); + mtr->commit(); + + return trx_lists_init_at_db_start(); +} + +void trx_sys_t::create() +{ + ut_ad(this == &trx_sys); + ut_ad(!is_initialised()); + m_initialised= true; + trx_list.create(); + rw_trx_hash.init(); +} + +size_t trx_sys_t::history_size() +{ + ut_ad(is_initialised()); + size_t size= 0; + for (auto &rseg : rseg_array) + { + rseg.latch.rd_lock(SRW_LOCK_CALL); + size+= rseg.history_size; + } + for (auto &rseg : rseg_array) + rseg.latch.rd_unlock(); + return size; +} + +bool trx_sys_t::history_exceeds(size_t threshold) +{ + ut_ad(is_initialised()); + size_t size= 0; + bool exceeds= false; + size_t i; + for (i= 0; i < array_elements(rseg_array); i++) + { + rseg_array[i].latch.rd_lock(SRW_LOCK_CALL); + size+= rseg_array[i].history_size; + if (size > threshold) + { + exceeds= true; + i++; + break; + } + } + while (i) + rseg_array[--i].latch.rd_unlock(); + return exceeds; +} + +TPOOL_SUPPRESS_TSAN bool trx_sys_t::history_exists() +{ + ut_ad(is_initialised()); + for (auto &rseg : rseg_array) + if (rseg.history_size) + return true; + return false; +} + +TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const +{ + ut_ad(is_initialised()); + size_t size= 0; + for (auto &rseg : rseg_array) + size+= rseg.history_size; + return size; +} + +/** Create a persistent rollback segment. +@param space_id system or undo tablespace id +@return pointer to new rollback segment +@retval nullptr on failure */ +static trx_rseg_t *trx_rseg_create(ulint space_id) +{ + trx_rseg_t *rseg= nullptr; + mtr_t mtr; + + mtr.start(); + + if (fil_space_t *space= mtr.x_lock_space(space_id)) + { + ut_ad(space->purpose == FIL_TYPE_TABLESPACE); + if (buf_block_t *sys_header= trx_sysf_get(&mtr)) + { + ulint rseg_id= trx_sys_rseg_find_free(sys_header); + dberr_t err; + if (buf_block_t *rblock= rseg_id == ULINT_UNDEFINED + ? nullptr : trx_rseg_header_create(space, rseg_id, 0, &mtr, &err)) + { + rseg= &trx_sys.rseg_array[rseg_id]; + rseg->init(space, rblock->page.id().page_no()); + ut_ad(rseg->is_persistent()); + mtr.write<4,mtr_t::MAYBE_NOP> + (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame, + space_id); + mtr.write<4,mtr_t::MAYBE_NOP> + (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + + rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame, + rseg->page_no); + } + } + } + + mtr.commit(); + return rseg; +} + +/** Create the rollback segments. +@return whether the creation succeeded */ +bool trx_sys_create_rsegs() +{ + /* srv_available_undo_logs reflects the number of persistent + rollback segments that have been initialized in the + transaction system header page. */ + ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES); + + if (high_level_read_only) { + srv_available_undo_logs = 0; + return(true); + } + + /* This is executed in single-threaded mode therefore it is not + necessary to use the same mtr in trx_rseg_create(). 
n_used cannot + change while the function is executing. */ + trx_sysf_get_n_rseg_slots(); + + ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS); + + /* The first persistent rollback segment is always initialized + in the system tablespace. */ + ut_a(srv_available_undo_logs > 0); + + for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS; + i++, srv_available_undo_logs++) { + /* Tablespace 0 is the system tablespace. + Dedicated undo log tablespaces start from 1. */ + ulint space = srv_undo_tablespaces > 0 + ? (i % srv_undo_tablespaces) + + srv_undo_space_id_start + : TRX_SYS_SPACE; + + if (!trx_rseg_create(space)) { + ib::error() << "Unable to allocate the" + " requested innodb_undo_logs"; + return(false); + } + + /* Increase the number of active undo + tablespace in case new rollback segment + assigned to new undo tablespace. */ + if (space > srv_undo_tablespaces_active) { + srv_undo_tablespaces_active++; + + ut_ad(srv_undo_tablespaces_active == space); + } + } + + ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS); + + ib::info info; + info << srv_available_undo_logs; + if (srv_undo_tablespaces_active) { + info << " rollback segments in " << srv_undo_tablespaces_active + << " undo tablespaces are active."; + } else { + info << " rollback segments are active."; + } + + return(true); +} + +/** Close the transaction system on shutdown */ +void +trx_sys_t::close() +{ + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + if (!is_initialised()) { + return; + } + + if (size_t size = view_count()) { + ib::error() << "All read views were not closed before" + " shutdown: " << size << " read views open"; + } + + rw_trx_hash.destroy(); + + /* There can't be any active transactions. */ + + for (ulint i = 0; i < array_elements(temp_rsegs); ++i) { + temp_rsegs[i].destroy(); + } + for (ulint i = 0; i < array_elements(rseg_array); ++i) { + rseg_array[i].destroy(); + } + + ut_a(trx_list.empty()); + trx_list.close(); + m_initialised = false; +} + +/** @return total number of active (non-prepared) transactions */ +ulint trx_sys_t::any_active_transactions() +{ + uint32_t total_trx= 0; + + trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) { + if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY || + (trx.state == TRX_STATE_ACTIVE && trx.id)) + total_trx++; + }); + + return total_trx; +} diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc new file mode 100644 index 00000000..f9a152e2 --- /dev/null +++ b/storage/innobase/trx/trx0trx.cc @@ -0,0 +1,2180 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0trx.cc +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" + +#ifdef WITH_WSREP +#include <mysql/service_wsrep.h> +#endif + +#include <mysql/service_thd_error_context.h> + +#include "btr0sea.h" +#include "lock0lock.h" +#include "log0log.h" +#include "que0que.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "trx0xa.h" +#include "ut0pool.h" +#include "ut0vec.h" +#include "log.h" + +#include <set> +#include <new> + +/** The bit pattern corresponding to TRX_ID_MAX */ +const byte trx_id_max_bytes[8] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** The bit pattern corresponding to max timestamp */ +const byte timestamp_max_bytes[7] = { + 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f +}; + + +static const ulint MAX_DETAILED_ERROR_LEN = 256; + +/*************************************************************//** +Set detailed error message for the transaction. */ +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg) /*!< in: detailed error message */ +{ + strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1); + trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0'; +} + +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file) /*!< in: file to read message from */ +{ + os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN); +} + +/********************************************************************//** +Initialize transaction object. +@param trx trx to initialize */ +static +void +trx_init( +/*=====*/ + trx_t* trx) +{ + trx->state = TRX_STATE_NOT_STARTED; + + trx->is_recovered = false; + + trx->op_info = ""; + + trx->active_commit_ordered = false; + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->check_foreigns = true; + + trx->check_unique_secondary = true; + + trx->lock.n_rec_locks = 0; + + trx->dict_operation = false; + + trx->error_state = DB_SUCCESS; + + trx->error_key_num = ULINT_UNDEFINED; + + trx->undo_no = 0; + + trx->rsegs.m_redo.rseg = NULL; + + trx->rsegs.m_noredo.rseg = NULL; + + trx->read_only = false; + + trx->auto_commit = false; + + trx->will_lock = false; + + trx->bulk_insert = false; + + trx->apply_online_log = false; + + ut_d(trx->start_file = 0); + + ut_d(trx->start_line = 0); + + trx->magic_n = TRX_MAGIC_N; + + trx->last_sql_stat_start.least_undo_no = 0; + + ut_ad(!trx->read_view.is_open()); + + trx->lock.rec_cached = 0; + + trx->lock.table_cached = 0; +#ifdef WITH_WSREP + ut_ad(!trx->wsrep); +#endif /* WITH_WSREP */ +} + +/** For managing the life-cycle of the trx_t instance that we get +from the pool. */ +struct TrxFactory { + + /** Initializes a transaction object. 
It must be explicitly started + with trx_start_if_not_started() before using it. The default isolation + level is TRX_ISO_REPEATABLE_READ. + @param trx Transaction instance to initialise */ + static void init(trx_t* trx) + { + /* Explicitly call the constructor of the already + allocated object. trx_t objects are allocated by + ut_zalloc_nokey() in Pool::Pool() which would not call + the constructors of the trx_t members. */ + new(&trx->mod_tables) trx_mod_tables_t(); + + new(&trx->lock.table_locks) lock_list(); + + new(&trx->read_view) ReadView(); + + trx->rw_trx_hash_pins = 0; + trx_init(trx); + + trx->dict_operation_lock_mode = false; + + trx->detailed_error = reinterpret_cast<char*>( + ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN)); + + trx->lock.lock_heap = mem_heap_create_typed( + 1024, MEM_HEAP_FOR_LOCK_HEAP); + pthread_cond_init(&trx->lock.cond, nullptr); + + UT_LIST_INIT(trx->lock.trx_locks, &lock_t::trx_locks); + UT_LIST_INIT(trx->lock.evicted_tables, + &dict_table_t::table_LRU); + + UT_LIST_INIT( + trx->trx_savepoints, + &trx_named_savept_t::trx_savepoints); + + trx->mutex_init(); + } + + /** Release resources held by the transaction object. + @param trx the transaction for which to release resources */ + static void destroy(trx_t* trx) + { +#ifdef __SANITIZE_ADDRESS__ + /* Unpoison the memory for AddressSanitizer */ + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); +#elif !__has_feature(memory_sanitizer) + /* In Valgrind, we cannot cancel MEM_NOACCESS() without + changing the state of the V bits (which indicate + which bits are initialized). + We will declare the contents as initialized. + We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */ + MEM_MAKE_DEFINED(trx, sizeof *trx); +#endif + + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_ad(!trx->mysql_thd); + + ut_a(trx->lock.wait_lock == NULL); + ut_a(trx->lock.wait_thr == NULL); + ut_a(!trx->dict_operation_lock_mode); + + if (trx->lock.lock_heap != NULL) { + mem_heap_free(trx->lock.lock_heap); + trx->lock.lock_heap = NULL; + } + + pthread_cond_destroy(&trx->lock.cond); + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); + + ut_free(trx->detailed_error); + + trx->mutex_destroy(); + + trx->mod_tables.~trx_mod_tables_t(); + + ut_ad(!trx->read_view.is_open()); + + trx->lock.table_locks.~lock_list(); + + trx->read_view.~ReadView(); + } +}; + +/** The lock strategy for TrxPool */ +class TrxPoolLock +{ + mysql_mutex_t mutex; + +public: + /** Create the mutex */ + void create() + { + mysql_mutex_init(trx_pool_mutex_key, &mutex, nullptr); + } + + /** Acquire the mutex */ + void enter() { mysql_mutex_lock(&mutex); } + + /** Release the mutex */ + void exit() { mysql_mutex_unlock(&mutex); } + + /** Free the mutex */ + void destroy() { mysql_mutex_destroy(&mutex); } +}; + +/** The lock strategy for the TrxPoolManager */ +class TrxPoolManagerLock +{ + mysql_mutex_t mutex; + +public: + /** Create the mutex */ + void create() + { + mysql_mutex_init(trx_pool_manager_mutex_key, &mutex, nullptr); + } + + /** Acquire the mutex */ + void enter() { mysql_mutex_lock(&mutex); } + + /** Release the mutex */ + void exit() { mysql_mutex_unlock(&mutex); } + + /** Free the mutex */ + void destroy() { mysql_mutex_destroy(&mutex); } +}; + +/** Use explicit mutexes for the trx_t pool and its manager. 
*/ +typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t; +typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t; + +/** The trx_t pool manager */ +static trx_pools_t* trx_pools; + +/** Size of on trx_t pool in bytes. */ +static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4; + +/** Create the trx_t pool */ +void +trx_pool_init() +{ + trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE)); + + ut_a(trx_pools != 0); +} + +/** Destroy the trx_t pool */ +void +trx_pool_close() +{ + UT_DELETE(trx_pools); + + trx_pools = 0; +} + +/** @return an allocated transaction */ +trx_t *trx_create() +{ + trx_t* trx = trx_pools->get(); + +#ifdef __SANITIZE_ADDRESS__ + /* Unpoison the memory for AddressSanitizer. + It may have been poisoned in trx_t::free().*/ + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); +#elif !__has_feature(memory_sanitizer) + /* In Valgrind, we cannot cancel MEM_NOACCESS() without + changing the state of the V bits (which indicate + which bits are initialized). + We will declare the contents as initialized. + We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */ + MEM_MAKE_DEFINED(trx, sizeof *trx); +#endif + + trx->assert_freed(); + + mem_heap_t* heap; + ib_alloc_t* alloc; + + /* We just got trx from pool, it should be non locking */ + ut_ad(!trx->will_lock); + ut_ad(!trx->rw_trx_hash_pins); + + DBUG_LOG("trx", "Create: " << trx); + + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8); + + alloc = ib_heap_allocator_create(heap); + + trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4); + + ut_ad(trx->mod_tables.empty()); + ut_ad(trx->lock.n_rec_locks == 0); + ut_ad(trx->lock.table_cached == 0); + ut_ad(trx->lock.rec_cached == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); + + trx_sys.register_trx(trx); + + return(trx); +} + +/** Free the memory to trx_pools */ +void trx_t::free() +{ +#ifdef HAVE_MEM_CHECK + if (xid.is_null()) + MEM_MAKE_DEFINED(&xid, sizeof xid); + else + MEM_MAKE_DEFINED(&xid.data[xid.gtrid_length + xid.bqual_length], + sizeof xid.data - (xid.gtrid_length + xid.bqual_length)); +#endif + MEM_CHECK_DEFINED(this, sizeof *this); + + ut_ad(!n_mysql_tables_in_use); + ut_ad(!mysql_log_file_name); + ut_ad(!mysql_n_tables_locked); + ut_ad(!will_lock); + ut_ad(error_state == DB_SUCCESS); + ut_ad(magic_n == TRX_MAGIC_N); + ut_ad(!read_only); + ut_ad(!lock.wait_lock); + + dict_operation= false; + trx_sys.deregister_trx(this); + assert_freed(); + trx_sys.rw_trx_hash.put_pins(this); + + mysql_thd= nullptr; + + // FIXME: We need to avoid this heap free/alloc for each commit. + if (autoinc_locks) + { + ut_ad(ib_vector_is_empty(autoinc_locks)); + /* We allocated a dedicated heap for the vector. 
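+  Freeing the vector below releases that heap as well.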
*/ + ib_vector_free(autoinc_locks); + autoinc_locks= NULL; + } + + MEM_NOACCESS(&skip_lock_inheritance_and_n_ref, + sizeof skip_lock_inheritance_and_n_ref); + /* do not poison mutex */ + MEM_NOACCESS(&id, sizeof id); + MEM_NOACCESS(&state, sizeof state); + MEM_NOACCESS(&is_recovered, sizeof is_recovered); +#ifdef WITH_WSREP + MEM_NOACCESS(&wsrep, sizeof wsrep); +#endif + read_view.mem_noaccess(); + MEM_NOACCESS(&lock, sizeof lock); + MEM_NOACCESS(&op_info, sizeof op_info); + MEM_NOACCESS(&isolation_level, sizeof isolation_level); + MEM_NOACCESS(&check_foreigns, sizeof check_foreigns); + MEM_NOACCESS(&is_registered, sizeof is_registered); + MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered); + MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary); + MEM_NOACCESS(&flush_log_later, sizeof flush_log_later); + MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later); + MEM_NOACCESS(&duplicates, sizeof duplicates); + MEM_NOACCESS(&dict_operation, sizeof dict_operation); + MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode); + MEM_NOACCESS(&start_time, sizeof start_time); + MEM_NOACCESS(&start_time_micro, sizeof start_time_micro); + MEM_NOACCESS(&commit_lsn, sizeof commit_lsn); + MEM_NOACCESS(&mysql_thd, sizeof mysql_thd); + MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name); + MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset); + MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use); + MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked); + MEM_NOACCESS(&error_state, sizeof error_state); + MEM_NOACCESS(&error_info, sizeof error_info); + MEM_NOACCESS(&error_key_num, sizeof error_key_num); + MEM_NOACCESS(&graph, sizeof graph); + MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints); + MEM_NOACCESS(&undo_no, sizeof undo_no); + MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start); + MEM_NOACCESS(&rsegs, sizeof rsegs); + MEM_NOACCESS(&roll_limit, sizeof roll_limit); + MEM_NOACCESS(&in_rollback, sizeof in_rollback); + MEM_NOACCESS(&pages_undone, sizeof pages_undone); + MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows); + MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks); + MEM_NOACCESS(&read_only, sizeof read_only); + MEM_NOACCESS(&auto_commit, sizeof auto_commit); + MEM_NOACCESS(&will_lock, sizeof will_lock); + MEM_NOACCESS(&fts_trx, sizeof fts_trx); + MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id); + MEM_NOACCESS(&flush_tables, sizeof flush_tables); +#ifdef UNIV_DEBUG + MEM_NOACCESS(&start_line, sizeof start_line); + MEM_NOACCESS(&start_file, sizeof start_file); +#endif /* UNIV_DEBUG */ + MEM_NOACCESS(&xid, sizeof xid); + MEM_NOACCESS(&mod_tables, sizeof mod_tables); + MEM_NOACCESS(&detailed_error, sizeof detailed_error); + MEM_NOACCESS(&magic_n, sizeof magic_n); + MEM_NOACCESS(&apply_online_log, sizeof apply_online_log); + trx_pools->mem_free(this); +} + +/** Transition to committed state, to release implicit locks. */ +TRANSACTIONAL_INLINE inline void trx_t::commit_state() +{ + ut_ad(state == TRX_STATE_PREPARED + || state == TRX_STATE_PREPARED_RECOVERED + || state == TRX_STATE_ACTIVE); + /* This makes the transaction committed in memory and makes its + changes to data visible to other transactions. NOTE that there is a + small discrepancy from the strict formal visibility rules here: a + user of the database can see modifications made by another + transaction T even before the necessary redo log segment has been + flushed to the disk. 
If the database happens to crash before the + flush, the user has seen modifications from T which will never be a + committed transaction. However, any transaction T2 which sees the + modifications of the committing transaction T, and which also itself + makes modifications to the database, will get an lsn larger than the + committing transaction T. In the case where the log flush fails, and + T never gets committed, also T2 will never get committed. */ + TMTrxGuard tg{*this}; + state= TRX_STATE_COMMITTED_IN_MEMORY; + ut_ad(id || !is_referenced()); +} + +/** Release any explicit locks of a committing transaction. */ +inline void trx_t::release_locks() +{ + DEBUG_SYNC_C("trx_t_release_locks_enter"); + DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY); + DBUG_ASSERT(!is_referenced()); + + if (UT_LIST_GET_LEN(lock.trx_locks)) + { + lock_release(this); + ut_ad(!lock.n_rec_locks); + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(ib_vector_is_empty(autoinc_locks)); + mem_heap_empty(lock.lock_heap); + } + + lock.table_locks.clear(); + reset_skip_lock_inheritance(); + id= 0; + while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables)) + { + UT_LIST_REMOVE(lock.evicted_tables, table); + dict_mem_table_free(table); + } + DEBUG_SYNC_C("after_trx_committed_in_memory"); +} + +/** At shutdown, frees a transaction object. */ +TRANSACTIONAL_TARGET void trx_free_at_shutdown(trx_t *trx) +{ + ut_ad(trx->is_recovered); + ut_a(trx_state_eq(trx, TRX_STATE_PREPARED) + || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) + || (trx_state_eq(trx, TRX_STATE_ACTIVE) + && (!srv_was_started + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT + || srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + || (!srv_is_being_started + && !srv_undo_sources && srv_fast_shutdown)))); + ut_a(trx->magic_n == TRX_MAGIC_N); + + ut_d(trx->apply_online_log = false); + trx->commit_state(); + trx->release_locks(); + trx->mod_tables.clear(); + trx_undo_free_at_shutdown(trx); + + ut_a(!trx->read_only); + + DBUG_LOG("trx", "Free prepared: " << trx); + trx->state = TRX_STATE_NOT_STARTED; + ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks)); + trx->free(); +} + + +/** + Disconnect a prepared transaction from MySQL + @param[in,out] trx transaction +*/ +void trx_disconnect_prepared(trx_t *trx) +{ + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->mysql_thd); + ut_ad(!trx->mysql_log_file_name); + trx->read_view.close(); + trx_sys.trx_list.freeze(); + trx->is_recovered= true; + trx->mysql_thd= NULL; + trx_sys.trx_list.unfreeze(); + /* todo/fixme: suggest to do it at innodb prepare */ + trx->will_lock= false; + trx_sys.rw_trx_hash.put_pins(trx); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Resurrect the table locks for a resurrected transaction. 
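+@param[in,out]	trx	resurrected transaction
+@param[in]	undo	undo log from which the modified table IDs are read
+@return DB_SUCCESS or error code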
*/ +static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo) +{ + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(undo.rseg == trx->rsegs.m_redo.rseg); + + if (undo.empty()) + return DB_SUCCESS; + + mtr_t mtr; + std::map<table_id_t, bool> tables; + mtr.start(); + + dberr_t err; + if (buf_block_t *block= + buf_page_get_gen(page_id_t(trx->rsegs.m_redo.rseg->space->id, + undo.top_page_no), 0, RW_S_LATCH, nullptr, + BUF_GET, &mtr, &err)) + { + buf_block_t *undo_block= block; + const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset; + + do + { + ulint type; + undo_no_t undo_no; + table_id_t table_id; + ulint cmpl_info; + bool updated_extern; + + if (undo_block != block) + { + mtr.memo_release(undo_block, MTR_MEMO_PAGE_S_FIX); + undo_block= block; + } + trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &updated_extern, &undo_no, &table_id); + tables.emplace(table_id, type == TRX_UNDO_EMPTY); + undo_rec= trx_undo_get_prev_rec(block, page_offset(undo_rec), + undo.hdr_page_no, undo.hdr_offset, + true, &mtr); + } + while (undo_rec); + } + + mtr.commit(); + + if (err != DB_SUCCESS) + return err; + + for (auto p : tables) + { + if (dict_table_t *table= + dict_table_open_on_id(p.first, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) + { + if (!table->is_readable()) + { + dict_sys.lock(SRW_LOCK_CALL); + table->release(); + dict_sys.remove(table); + dict_sys.unlock(); + continue; + } + + if (trx->state == TRX_STATE_PREPARED) + trx->mod_tables.emplace(table, 0); + + lock_table_resurrect(table, trx, p.second ? LOCK_X : LOCK_IX); + + DBUG_LOG("ib_trx", + "resurrect " << ib::hex(trx->id) << " lock on " << table->name); + table->release(); + } + } + + return DB_SUCCESS; +} + + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** + Resurrect the transactions that were doing inserts/updates the time of the + crash, they need to be undone. +*/ +static dberr_t trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, + time_t start_time, ulonglong start_time_micro, + uint64_t *rows_to_undo) +{ + trx_state_t state; + /* + This is single-threaded startup code, we do not need the + protection of trx->mutex here. + */ + switch (undo->state) + { + case TRX_UNDO_ACTIVE: + state= TRX_STATE_ACTIVE; + break; + case TRX_UNDO_PREPARED: + /* + Prepared transactions are left in the prepared state + waiting for a commit or abort decision from MySQL + */ + state= TRX_STATE_PREPARED; + sql_print_information("InnoDB: Transaction " TRX_ID_FMT + " was in the XA prepared state.", undo->trx_id); + break; + default: + return DB_SUCCESS; + } + + trx_t *trx= trx_create(); + trx->state= state; + ut_d(trx->start_file= __FILE__); + ut_d(trx->start_line= __LINE__); + + trx->rsegs.m_redo.undo= undo; + trx->undo_no= undo->top_undo_no + 1; + trx->rsegs.m_redo.rseg= rseg; + /* + For transactions with active data will not have rseg size = 1 + or will not qualify for purge limit criteria. So it is safe to increment + this trx_ref_count w/o mutex protection. + */ + trx->rsegs.m_redo.rseg->acquire(); + trx->xid= undo->xid; + trx->id= undo->trx_id; + trx->is_recovered= true; + trx->start_time= start_time; + trx->start_time_micro= start_time_micro; + trx->dict_operation= undo->dict_operation; + + trx_sys.rw_trx_hash.insert(trx); + trx_sys.rw_trx_hash.put_pins(trx); + if (trx_state_eq(trx, TRX_STATE_ACTIVE)) + *rows_to_undo+= trx->undo_no; + return trx_resurrect_table_locks(trx, *undo); +} + + +/** Initialize (resurrect) transactions at startup. 
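+@return DB_SUCCESS or error code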
*/ +dberr_t trx_lists_init_at_db_start() +{ + ut_a(srv_is_being_started); + ut_ad(!srv_was_started); + + if (srv_operation == SRV_OPERATION_RESTORE) { + /* mariabackup --prepare only deals with + the redo log and the data files, not with + transactions or the data dictionary. */ + return trx_rseg_array_init(); + } + + if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + return DB_SUCCESS; + } + + purge_sys.create(); + dberr_t err = trx_rseg_array_init(); + + if (err != DB_SUCCESS) { +corrupted: + ib::info() << "Retry with innodb_force_recovery=5"; + return err; + } + + /* Look from the rollback segments if there exist undo logs for + transactions. */ + const time_t start_time = time(NULL); + const ulonglong start_time_micro= microsecond_interval_timer(); + uint64_t rows_to_undo = 0; + + for (auto& rseg : trx_sys.rseg_array) { + trx_undo_t* undo; + + /* Some rollback segment may be unavailable, + especially if the server was previously run with a + non-default value of innodb_undo_logs. */ + if (!rseg.space) { + continue; + } + /* Resurrect other transactions. */ + for (undo = UT_LIST_GET_FIRST(rseg.undo_list); + undo != NULL; + undo = UT_LIST_GET_NEXT(undo_list, undo)) { + trx_t *trx = trx_sys.find(0, undo->trx_id, false); + if (!trx) { + err = trx_resurrect(undo, &rseg, start_time, + start_time_micro, + &rows_to_undo); + } else { + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->start_time == start_time); + ut_ad(trx->is_recovered); + ut_ad(trx->rsegs.m_redo.rseg == &rseg); + ut_ad(rseg.is_referenced()); + + trx->rsegs.m_redo.undo = undo; + if (undo->top_undo_no >= trx->undo_no) { + if (trx_state_eq(trx, + TRX_STATE_ACTIVE)) { + rows_to_undo -= trx->undo_no; + rows_to_undo += + undo->top_undo_no + 1; + } + + trx->undo_no = undo->top_undo_no + 1; + } + err = trx_resurrect_table_locks(trx, *undo); + } + + if (err != DB_SUCCESS) { + goto corrupted; + } + } + } + + if (const auto size = trx_sys.rw_trx_hash.size()) { + ib::info() << size + << " transaction(s) which must be rolled back or" + " cleaned up in total " << rows_to_undo + << " row operations to undo"; + ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id(); + } + + purge_sys.clone_oldest_view<true>(); + return DB_SUCCESS; +} + +/** Assign a persistent rollback segment in a round-robin fashion, +evenly distributed between 0 and innodb_undo_logs-1 +@return persistent rollback segment +@retval NULL if innodb_read_only */ +static trx_rseg_t* trx_assign_rseg_low() +{ + if (high_level_read_only) { + ut_ad(!srv_available_undo_logs); + return(NULL); + } + + ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS); + + /* The first slot is always assigned to the system tablespace. */ + ut_ad(trx_sys.rseg_array[0].space == fil_system.sys_space); + + /* Choose a rollback segment evenly distributed between 0 and + innodb_undo_logs-1 in a round-robin fashion, skipping those + undo tablespaces that are scheduled for truncation. */ + static Atomic_counter<unsigned> rseg_slot; + unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS; + ut_d(if (trx_rseg_n_slots_debug) slot = 0); + trx_rseg_t* rseg; + +#ifdef UNIV_DEBUG + ulint start_scan_slot = slot; + bool look_for_rollover = false; +#endif /* UNIV_DEBUG */ + + bool allocated = false; + + do { + for (;;) { + rseg = &trx_sys.rseg_array[slot]; + +#ifdef UNIV_DEBUG + /* Ensure that we are not revisiting the same + slot that we have already inspected. 
*/ + if (look_for_rollover) { + ut_ad(start_scan_slot != slot); + } + look_for_rollover = true; +#endif /* UNIV_DEBUG */ + + ut_d(if (!trx_rseg_n_slots_debug)) + slot = (slot + 1) % TRX_SYS_N_RSEGS; + + if (!rseg->space) { + continue; + } + + ut_ad(rseg->is_persistent()); + + if (rseg->space != fil_system.sys_space) { + if (rseg->skip_allocation() + || !srv_undo_tablespaces) { + continue; + } + } else if (const fil_space_t *space = + trx_sys.rseg_array[slot].space) { + if (space != fil_system.sys_space + && srv_undo_tablespaces > 0) { + /** If dedicated + innodb_undo_tablespaces have + been configured, try to use them + instead of the system tablespace. */ + continue; + } + } + + break; + } + + /* By now we have only selected the rseg but not marked it + allocated. By marking it allocated we are ensuring that it will + never be selected for UNDO truncate purge. */ + allocated = rseg->acquire_if_available(); + } while (!allocated); + + ut_ad(rseg->is_referenced()); + ut_ad(rseg->is_persistent()); + return(rseg); +} + +/** Assign a rollback segment for modifying temporary tables. +@return the assigned rollback segment */ +trx_rseg_t *trx_t::assign_temp_rseg() +{ + ut_ad(!rsegs.m_noredo.rseg); + ut_ad(!is_autocommit_non_locking()); + compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS)); + + /* Choose a temporary rollback segment between 0 and 127 + in a round-robin fashion. */ + static Atomic_counter<unsigned> rseg_slot; + trx_rseg_t* rseg = &trx_sys.temp_rsegs[ + rseg_slot++ & (TRX_SYS_N_RSEGS - 1)]; + ut_ad(!rseg->is_persistent()); + rsegs.m_noredo.rseg = rseg; + + if (id == 0) { + trx_sys.register_rw(this); + } + + return(rseg); +} + +/****************************************************************//** +Starts a transaction. */ +static +void +trx_start_low( +/*==========*/ + trx_t* trx, /*!< in: transaction */ + bool read_write) /*!< in: true if read-write transaction */ +{ + ut_ad(!trx->in_rollback); + ut_ad(!trx->is_recovered); + ut_ad(trx->start_line != 0); + ut_ad(trx->start_file != 0); + ut_ad(trx->roll_limit == 0); + ut_ad(trx->error_state == DB_SUCCESS); + ut_ad(trx->rsegs.m_redo.rseg == NULL); + ut_ad(trx->rsegs.m_noredo.rseg == NULL); + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + /* Check whether it is an AUTOCOMMIT SELECT */ + trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd); + + trx->read_only = srv_read_only_mode + || (!trx->dict_operation + && thd_trx_is_read_only(trx->mysql_thd)); + + if (!trx->auto_commit) { + trx->will_lock = true; + } else if (!trx->will_lock) { + trx->read_only = true; + } + +#ifdef WITH_WSREP + trx->xid.null(); +#endif /* WITH_WSREP */ + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ut_a(trx->lock.table_locks.empty()); + + /* No other thread can access this trx object through rw_trx_hash, + still it can be found through trx_sys.trx_list. Sometimes it's + possible to indirectly protect trx_t::state by freezing + trx_sys.trx_list. + + For now we update it without mutex protection, because original code + did it this way. It has to be reviewed and fixed properly. */ + trx->state = TRX_STATE_ACTIVE; + + /* By default all transactions are in the read-only list unless they + are non-locking auto-commit read only transactions or background + (internal) transactions. Note: Transactions marked explicitly as + read only can write to temporary tables, we put those on the RO + list too. 
*/ + + if (!trx->read_only + && (!trx->mysql_thd || read_write || trx->dict_operation)) { + + /* Temporary rseg is assigned only if the transaction + updates a temporary table */ + trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); + ut_ad(trx->rsegs.m_redo.rseg != 0 + || srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + + trx_sys.register_rw(trx); + } else { + if (!trx->is_autocommit_non_locking()) { + + /* If this is a read-only transaction that is writing + to a temporary table then it needs a transaction id + to write to the temporary table. */ + + if (read_write) { + ut_ad(!srv_read_only_mode); + trx_sys.register_rw(trx); + } + } else { + ut_ad(!read_write); + } + } + + trx->start_time = time(NULL); + trx->start_time_micro = trx->mysql_thd + ? thd_start_utime(trx->mysql_thd) + : microsecond_interval_timer(); + + ut_a(trx->error_state == DB_SUCCESS); +} + +/** Set the serialisation number for a persistent committed transaction. +@param[in,out] trx committed transaction with persistent changes */ +static +void +trx_serialise(trx_t* trx) +{ + trx_rseg_t *rseg = trx->rsegs.m_redo.rseg; + ut_ad(rseg); + + if (rseg->last_page_no == FIL_NULL) { + mysql_mutex_lock(&purge_sys.pq_mutex); + } + + trx_sys.assign_new_trx_no(trx); + + /* If the rollback segment is not empty then the + new trx_t::no can't be less than any trx_t::no + already in the rollback segment. User threads only + produce events when a rollback segment is empty. */ + if (rseg->last_page_no == FIL_NULL) { + purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no, + *rseg)); + mysql_mutex_unlock(&purge_sys.pq_mutex); + } +} + +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. */ +static +void +trx_write_serialisation_history( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* Change the undo log segment states from TRX_UNDO_ACTIVE to some + other state: these modifications to the file data structure define + the transaction as committed in the file based domain, at the + serialization point of the log sequence number lsn obtained below. */ + + /* We have to hold the rseg mutex because update log headers have + to be put to the history list in the (serialisation) order of the + UNDO trx number. This is required for the purge in-memory data + structures too. */ + + if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { + /* Undo log for temporary tables is discarded at transaction + commit. There is no purge for temporary tables, and also no + MVCC, because they are private to a session. */ + + mtr_t temp_mtr; + temp_mtr.start(); + temp_mtr.set_log_mode(MTR_LOG_NO_REDO); + trx_undo_set_state_at_finish(undo, &temp_mtr); + temp_mtr.commit(); + } + + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + if (!rseg) { + ut_ad(!trx->rsegs.m_redo.undo); + return; + } + + trx_undo_t*& undo = trx->rsegs.m_redo.undo; + + if (!undo) { + return; + } + + ut_ad(!trx->read_only); + ut_ad(!undo || undo->rseg == rseg); + rseg->latch.wr_lock(SRW_LOCK_CALL); + + /* Assign the transaction serialisation number and add any + undo log to the purge queue. 
*/ + trx_serialise(trx); + if (undo) { + UT_LIST_REMOVE(rseg->undo_list, undo); + trx_purge_add_undo_to_history(trx, undo, mtr); + } + + rseg->latch.wr_unlock(); + + MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); +} + +/******************************************************************** +Finalize a transaction containing updates for a FTS table. */ +static +void +trx_finalize_for_fts_table( +/*=======================*/ + fts_trx_table_t* ftt) /* in: FTS trx table */ +{ + fts_t* fts = ftt->table->fts; + fts_doc_ids_t* doc_ids = ftt->added_doc_ids; + + ut_a(fts->add_wq); + + mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg); + + ib_wqueue_add(fts->add_wq, doc_ids, heap); + + /* fts_trx_table_t no longer owns the list. */ + ftt->added_doc_ids = NULL; +} + +/******************************************************************//** +Finalize a transaction containing updates to FTS tables. */ +static +void +trx_finalize_for_fts( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + bool is_commit) /*!< in: true if the transaction was + committed, false if it was rolled back. */ +{ + if (is_commit) { + const ib_rbt_node_t* node; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(trx->fts_trx->savepoints)); + + tables = savepoint->tables; + + for (node = rbt_first(tables); + node; + node = rbt_next(tables, node)) { + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + if ((*ftt)->added_doc_ids) { + trx_finalize_for_fts_table(*ftt); + } + } + } + + fts_trx_free(trx->fts_trx); + trx->fts_trx = NULL; +} + +extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD); +extern "C" void thd_decrement_pending_ops(MYSQL_THD); + + +#include "../log/log0sync.h" + +/* + If required, initiates write and optionally flush of the log to + disk + @param lsn LSN up to which logs are to be flushed. + @param trx transaction; if trx->state is PREPARED, the function will + also wait for the flush to complete. +*/ +static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx) +{ + if (!srv_flush_log_at_trx_commit) + return; + + if (log_sys.get_flushed_lsn() > lsn) + return; + + const bool flush= srv_file_flush_method != SRV_NOSYNC && + (srv_flush_log_at_trx_commit & 1); + + if (trx->state == TRX_STATE_PREPARED) + { + /* XA, which is used with binlog as well. + Be conservative, use synchronous wait.*/ +sync: + log_write_up_to(lsn, flush); + return; + } + + completion_callback cb; + if ((cb.m_param = thd_increment_pending_ops(trx->mysql_thd))) + { + cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops; + log_write_up_to(lsn, flush, false, &cb); + } + else + goto sync; +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static +void +trx_flush_log_if_needed( +/*====================*/ + lsn_t lsn, /*!< in: lsn up to which logs are to be + flushed. */ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "flushing log"; + trx_flush_log_if_needed_low(lsn, trx); + trx->op_info = ""; +} + +/** Process tables that were modified by the committing transaction. 
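+Roughly speaking (the field definitions live elsewhere): update_time is the
+"last modified" timestamp surfaced for the table, for example through
+information_schema, and is set here to the transaction's start_time;
+query_cache_inv_trx_id records the newest transaction ID at commit so that
+earlier cached query results for the table can be recognised as stale.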
*/ +inline void trx_t::commit_tables() +{ + if (undo_no && !mod_tables.empty()) + { + const trx_id_t max_trx_id= trx_sys.get_max_trx_id(); + const auto now= start_time; + + for (const auto &p : mod_tables) + { + dict_table_t *table= p.first; + table->update_time= now; + table->query_cache_inv_trx_id= max_trx_id; + } + } +} + +/** Evict a table definition due to the rollback of ALTER TABLE. +@param table_id table identifier +@param reset_only whether to only reset dict_table_t::def_trx_id */ +void trx_t::evict_table(table_id_t table_id, bool reset_only) +{ + ut_ad(in_rollback); + + dict_table_t* table = dict_sys.find_table(table_id); + if (!table) { + return; + } + + table->def_trx_id = 0; + + if (auto ref_count = table->get_ref_count()) { + /* This must be a DDL operation that is being rolled + back in an active connection. */ + ut_a(ref_count == 1); + ut_ad(!is_recovered); + ut_ad(mysql_thd); + return; + } + + if (reset_only) { + return; + } + + /* This table should only be locked by this transaction, if at all. */ + ut_ad(UT_LIST_GET_LEN(table->locks) <= 1); + const bool locked = UT_LIST_GET_LEN(table->locks); + ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this); + dict_sys.remove(table, true, locked); + if (locked) { + UT_LIST_ADD_FIRST(lock.evicted_tables, table); + } +} + +TRANSACTIONAL_INLINE inline void trx_t::commit_in_memory(const mtr_t *mtr) +{ + /* We already detached from rseg in trx_write_serialisation_history() */ + ut_ad(!rsegs.m_redo.undo); + must_flush_log_later= false; + read_view.close(); + + if (is_autocommit_non_locking()) + { + ut_ad(id == 0); + ut_ad(read_only); + ut_ad(!will_lock); + ut_a(!is_recovered); + ut_ad(!rsegs.m_redo.rseg); + ut_ad(!rsegs.m_redo.undo); + ut_ad(mysql_thd); + ut_ad(state == TRX_STATE_ACTIVE); + + /* Note: We do not have to hold any lock_sys latch here, because + this is a non-locking transaction. */ + ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); + + /* This state change is not protected by any mutex, therefore + there is an inherent race here around state transition during + printouts. We ignore this race for the sake of efficiency. + However, the freezing of trx_sys.trx_list will protect the trx_t + instance and it cannot be removed from the trx_list and freed + without first unfreezing trx_list. */ + state= TRX_STATE_NOT_STARTED; + + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); + + DBUG_LOG("trx", "Autocommit in memory: " << this); + } + else + { +#ifdef UNIV_DEBUG + if (!UT_LIST_GET_LEN(lock.trx_locks)) + for (auto l : lock.table_locks) + ut_ad(!l); +#endif /* UNIV_DEBUG */ + commit_state(); + + if (id) + { + trx_sys.deregister_rw(this); + + /* Wait for any implicit-to-explicit lock conversions to cease, + so that there will be no race condition in lock_release(). */ + while (UNIV_UNLIKELY(is_referenced())) + LF_BACKOFF(); + } + else + ut_ad(read_only || !rsegs.m_redo.rseg); + + if (read_only || !rsegs.m_redo.rseg) + { + MONITOR_INC(MONITOR_TRX_RO_COMMIT); + } + else + { + commit_tables(); + MONITOR_INC(MONITOR_TRX_RW_COMMIT); + is_recovered= false; + } + + if (UNIV_LIKELY(!dict_operation)) + release_locks(); + } + + if (trx_rseg_t *rseg= rsegs.m_redo.rseg) + /* This is safe due to us having detached the persistent undo log. 
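+   trx_write_serialisation_history() has already removed the undo log from
+   rseg->undo_list and appended it to the history list, so the purge
+   subsystem owns those pages now; dropping our reference here only makes
+   the rollback segment eligible again for undo truncation once nothing
+   else references it.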
*/ + rseg->release(); + + if (mtr) + { + if (trx_undo_t *&undo= rsegs.m_noredo.undo) + { + ut_ad(undo->rseg == rsegs.m_noredo.rseg); + trx_undo_commit_cleanup(undo); + undo= nullptr; + } + + /* NOTE that we could possibly make a group commit more efficient + here: call std::this_thread::yield() here to allow also other trxs to come + to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if the OS + does not crash. We may also flush the log files to disk, making + the transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group commit is that a group of transactions + gather behind a trx doing a physical disk write to log files, and + when that physical write has been completed, one of those + transactions does a write which commits the whole group. Note that + this group commit will only bring benefit if there are > 2 users + in the database. Then at least 2 users can gather behind one doing + the physical log write to disk. + + If we are calling trx_t::commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the group commit + algorithm to work. Otherwise, the prepare_commit mutex would + serialize all commits and prevent a group of transactions from + gathering. */ + + commit_lsn= undo_no || !xid.is_null() ? mtr->commit_lsn() : 0; + if (!commit_lsn) + /* Nothing to be done. */; + else if (flush_log_later) + /* Do nothing yet */ + must_flush_log_later= true; + else if (srv_flush_log_at_trx_commit) + trx_flush_log_if_needed(commit_lsn, this); + } + + ut_ad(!rsegs.m_noredo.undo); + + savepoints_discard(); + + if (fts_trx) + trx_finalize_for_fts(this, undo_no != 0); + +#ifdef WITH_WSREP + /* Serialization history has been written and the transaction is + committed in memory, which makes this commit ordered. Release commit + order critical section. */ + if (wsrep) + { + wsrep= false; + wsrep_commit_ordered(mysql_thd); + } +#endif /* WITH_WSREP */ + lock.was_chosen_as_deadlock_victim= false; +} + +void trx_t::commit_cleanup() +{ + ut_ad(!dict_operation); + ut_ad(!was_dict_operation); + + mutex.wr_lock(); + state= TRX_STATE_NOT_STARTED; + mod_tables.clear(); + + assert_freed(); + trx_init(this); + mutex.wr_unlock(); + + ut_a(error_state == DB_SUCCESS); +} + +/** Commit the transaction in a mini-transaction. +@param mtr mini-transaction (if there are any persistent modifications) */ +TRANSACTIONAL_TARGET void trx_t::commit_low(mtr_t *mtr) +{ + ut_ad(!mtr || mtr->is_active()); + ut_d(bool aborted= in_rollback && error_state == DB_DEADLOCK); + ut_ad(!mtr == (aborted || !has_logged())); + ut_ad(!mtr || !aborted); + + if (fts_trx && undo_no) + { + ut_a(!is_autocommit_non_locking()); + /* MDEV-24088 FIXME: Invoke fts_commit() earlier (before possible + XA PREPARE), so that we will be able to return an error and rollback + the transaction, instead of violating consistency! + + The original claim about DB_DUPLICATE KEY was: + This is a possible scenario if there is a crash between + insert to DELETED table committing and transaction committing. 
The + fix would be able to return error from this function */ + if (ut_d(dberr_t error=) fts_commit(this)) + ut_ad(error == DB_DUPLICATE_KEY || error == DB_LOCK_WAIT_TIMEOUT); + } + +#ifdef ENABLED_DEBUG_SYNC + const bool debug_sync= mysql_thd && has_logged_persistent(); +#endif + + if (mtr) + { + if (UNIV_UNLIKELY(apply_online_log)) + apply_log(); + trx_write_serialisation_history(this, mtr); + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this log + sequence number. The transaction becomes 'durable' when we write + the log to disk, but in the logical sense the commit in the + file-based data structures (undo logs etc.) happens here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come in + exactly the same order as commit lsn's, if the transactions have + different rollback segments. To get exactly the same order we + should hold the kernel mutex up to this point, adding to the + contention of the kernel mutex. However, if a transaction T2 is + able to see modifications made by a transaction T1, T2 will always + get a bigger transaction number and a bigger commit lsn than T1. */ + + mtr->commit(); + } +#ifdef ENABLED_DEBUG_SYNC + if (debug_sync) + DEBUG_SYNC_C("before_trx_state_committed_in_memory"); +#endif + + commit_in_memory(mtr); +} + + +void trx_t::commit_persist() +{ + mtr_t *mtr= nullptr; + mtr_t local_mtr; + + if (has_logged()) + { + mtr= &local_mtr; + local_mtr.start(); + } + commit_low(mtr); +} + + +void trx_t::commit() +{ + ut_ad(!was_dict_operation); + ut_d(was_dict_operation= dict_operation); + dict_operation= false; + commit_persist(); + ut_d(was_dict_operation= false); + ut_d(for (const auto &p : mod_tables) ut_ad(!p.second.is_dropped())); + commit_cleanup(); +} + + +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx->mutex + here, because the commit or rollback should be invoked for a + running (or recovered prepared) transaction that is associated + with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, true); + /* fall through */ + + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + trx->lock.wait_thr = NULL; + return; + + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node))); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. 
+@return query thread to run next, or NULL */ +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + trx_t* trx; + + node->state = COMMIT_NODE_WAIT; + + trx = thr_get_trx(thr); + + ut_a(trx->lock.wait_thr == NULL); + + trx_commit_or_rollback_prepare(trx); + + trx->commit(); + ut_ad(trx->lock.wait_thr == NULL); + + thr = NULL; + } else { + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + return DB_SUCCESS; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + trx->op_info = "committing"; + trx->commit(); + trx->op_info = ""; + return(DB_SUCCESS); + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + ut_error; + return(DB_CORRUPTION); +} + +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + if (trx->id != 0 + || !trx->must_flush_log_later + || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) { + + return; + } + + trx_flush_log_if_needed(trx->commit_lsn, trx); + + trx->must_flush_log_later = false; +} + +/**********************************************************************//** +Marks the latest SQL statement ended. */ +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + case TRX_STATE_NOT_STARTED: + trx->undo_no = 0; + /* fall through */ + case TRX_STATE_ACTIVE: + if (trx->fts_trx != NULL) { + fts_savepoint_laststmt_refresh(trx); + } + + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + return; + } + + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + trx->end_bulk_insert(); + return; + } + + ut_error; +} + +/**********************************************************************//** +Prints info about a transaction. 
*/ +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: trx->lock.n_rec_locks */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ +{ + if (const trx_id_t id = trx->id) { + fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id); + } else { + fprintf(f, "TRANSACTION (%p)", trx); + } + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + fputs(", not started", f); + goto state_ok; + case TRX_STATE_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + goto state_ok; + } + fprintf(f, ", state %lu", (ulong) trx->state); + ut_ad(0); +state_ok: + const char* op_info = trx->op_info; + + if (*op_info) { + putc(' ', f); + fputs(op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + bool newline = true; + + if (trx->in_rollback) { /* dirty read for performance reasons */ + fputs("ROLLING BACK ", f); + } else if (trx->lock.wait_lock) { + fputs("LOCK WAIT ", f); + } else { + newline = false; + } + + if (n_trx_locks > 0 || heap_size > 400) { + newline = true; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) n_trx_locks, + (ulong) heap_size, + (ulong) n_rec_locks); + } + + if (trx->undo_no != 0) { + newline = true; + fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no); + } + + if (newline) { + putc('\n', f); + } + + if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) { + innobase_mysql_print_thd( + f, trx->mysql_thd, static_cast<uint>(max_query_len)); + } +} + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys.latch. +When possible, use trx_print() instead. */ +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + lock_sys.assert_locked(); + + trx_print_low(f, trx, max_query_len, + trx->lock.n_rec_locks, + UT_LIST_GET_LEN(trx->lock.trx_locks), + mem_heap_get_size(trx->lock.lock_heap)); +} + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys.latch. 
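+The latch is only held while sampling lock.n_rec_locks, the length of
+lock.trx_locks and the size of the lock heap; the printing itself happens
+after the latch is released, so the numbers form a consistent snapshot but
+may already be stale by the time they reach the output stream.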
*/ +TRANSACTIONAL_TARGET +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ulint n_rec_locks, n_trx_locks, heap_size; + { + TMLockMutexGuard g{SRW_LOCK_CALL}; + n_rec_locks= trx->lock.n_rec_locks; + n_trx_locks= UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size= mem_heap_get_size(trx->lock.lock_heap); + } + + trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size); +} + +/** Prepare a transaction. +@return log sequence number that makes the XA PREPARE durable +@retval 0 if no changes needed to be made durable */ +static lsn_t trx_prepare_low(trx_t *trx) +{ + ut_ad(!trx->is_recovered); + + mtr_t mtr; + + if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { + ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); + + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + trx_undo_set_state_at_prepare(trx, undo, false, &mtr); + mtr.commit(); + } + + trx_undo_t* undo = trx->rsegs.m_redo.undo; + + if (!undo) { + /* There were no changes to persistent tables. */ + return(0); + } + + ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); + + mtr.start(); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE to + TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the file-based + world, at the serialization point of lsn. */ + trx_undo_set_state_at_prepare(trx, undo, false, &mtr); + + /* Make the XA PREPARE durable. */ + mtr.commit(); + ut_ad(mtr.commit_lsn() > 0); + return(mtr.commit_lsn()); +} + +/****************************************************************//** +Prepares a transaction. */ +TRANSACTIONAL_TARGET +static +void +trx_prepare( +/*========*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Only fresh user transactions can be prepared. + Recovered transactions cannot. */ + ut_a(!trx->is_recovered); + + lsn_t lsn = trx_prepare_low(trx); + + DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE();); + + ut_a(trx->state == TRX_STATE_ACTIVE); + { + TMTrxGuard tg{*trx}; + trx->state = TRX_STATE_PREPARED; + } + + if (lsn) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + We must not be holding any mutexes or latches here. */ + + trx_flush_log_if_needed(lsn, trx); + + if (!UT_LIST_GET_LEN(trx->lock.trx_locks) + || trx->isolation_level == TRX_ISO_SERIALIZABLE) { + /* Do not release any locks at the + SERIALIZABLE isolation level. */ + } else if (!trx->mysql_thd + || thd_sql_command(trx->mysql_thd) + != SQLCOM_XA_PREPARE) { + /* Do not release locks for XA COMMIT ONE PHASE + or for internal distributed transactions + (XID::get_my_xid() would be nonzero). */ + } else { + lock_release_on_prepare(trx); + } + } +} + +/** XA PREPARE a transaction. 
+@param[in,out] trx transaction to prepare */ +void trx_prepare_for_mysql(trx_t* trx) +{ + trx_start_if_not_started_xa(trx, false); + + trx->op_info = "preparing"; + + trx_prepare(trx); + + trx->op_info = ""; +} + + +struct trx_recover_for_mysql_callback_arg +{ + XID *xid_list; + uint len; + uint count; +}; + + +static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element, + trx_recover_for_mysql_callback_arg *arg) +{ + DBUG_ASSERT(arg->len > 0); + element->mutex.wr_lock(); + if (trx_t *trx= element->trx) + { + /* + The state of a read-write transaction can only change from ACTIVE to + PREPARED while we are holding the element->mutex. But since it is + executed at startup no state change should occur. + */ + if (trx_state_eq(trx, TRX_STATE_PREPARED)) + { + ut_ad(trx->is_recovered); + ut_ad(trx->id); + if (arg->count == 0) + ib::info() << "Starting recovery for XA transactions..."; + XID& xid= arg->xid_list[arg->count]; + if (arg->count++ < arg->len) + { + trx->state= TRX_STATE_PREPARED_RECOVERED; + ib::info() << "Transaction " << trx->id + << " in prepared state after recovery"; + ib::info() << "Transaction contains changes to " << trx->undo_no + << " rows"; + xid= trx->xid; + } + } + } + element->mutex.wr_unlock(); + /* Do not terminate upon reaching arg->len; count all transactions */ + return false; +} + + +static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element, + void*) +{ + element->mutex.wr_lock(); + if (trx_t *trx= element->trx) + { + if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) + trx->state= TRX_STATE_PREPARED; + } + element->mutex.wr_unlock(); + return false; +} + + +/** + Find prepared transaction objects for recovery. + + @param[out] xid_list prepared transactions + @param[in] len number of slots in xid_list + + @return number of prepared transactions stored in xid_list +*/ + +int trx_recover_for_mysql(XID *xid_list, uint len) +{ + trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 }; + + ut_ad(xid_list); + ut_ad(len); + + /* Fill xid_list with PREPARED transactions. */ + trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg); + if (arg.count) + { + ib::info() << arg.count + << " transactions in prepared state after recovery"; + /* After returning the full list, reset the state, because + init_server_components() wants to recover the collection of + transactions twice, by first calling tc_log->open() and then + ha_recover() directly. */ + if (arg.count <= len) + trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback); + } + return int(std::min(arg.count, len)); +} + + +struct trx_get_trx_by_xid_callback_arg +{ + const XID *xid; + trx_t *trx; +}; + + +static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element, + trx_get_trx_by_xid_callback_arg *arg) +{ + my_bool found= 0; + element->mutex.wr_lock(); + if (trx_t *trx= element->trx) + { + trx->mutex_lock(); + if (trx->is_recovered && + (trx_state_eq(trx, TRX_STATE_PREPARED) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) && + arg->xid->eq(&trx->xid)) + { +#ifdef WITH_WSREP + /* The commit of a prepared recovered Galera + transaction needs a valid trx->xid for + invoking trx_sys_update_wsrep_checkpoint(). */ + if (!wsrep_is_wsrep_xid(&trx->xid)) +#endif /* WITH_WSREP */ + /* Invalidate the XID, so that subsequent calls will not find it. */ + trx->xid.null(); + arg->trx= trx; + found= 1; + } + trx->mutex_unlock(); + } + element->mutex.wr_unlock(); + return found; +} + +/** Look up an X/Open distributed transaction in XA PREPARE state. 
+@param[in] xid X/Open XA transaction identifier +@return transaction on match (the trx_t::xid will be invalidated); +note that the trx may have been committed before the caller acquires +trx_t::mutex +@retval NULL if no match */ +trx_t* trx_get_trx_by_xid(const XID* xid) +{ + trx_get_trx_by_xid_callback_arg arg= { xid, 0 }; + + if (xid) + trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg); + return arg.trx; +} + + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + bool read_write) /*!< in: true if read write transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, read_write); + return; + + case TRX_STATE_ACTIVE: + if (trx->id == 0 && read_write) { + /* If the transaction is tagged as read-only then + it can only write to temp tables and for such + transactions we don't want to move them to the + trx_sys_t::rw_trx_hash. */ + if (!trx->read_only) { + trx_set_rw_mode(trx); + } + } + return; + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_low( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + bool read_write) /*!< in: true if read write transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, read_write); + return; + + case TRX_STATE_ACTIVE: + if (read_write && trx->id == 0 && !trx->read_only) { + trx_set_rw_mode(trx); + } + return; + + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/** +Start a transaction for internal processing. +@param trx transaction +@param read_write whether writes may be performed */ +void trx_start_internal_low(trx_t *trx, bool read_write) +{ + trx->will_lock= true; + trx_start_low(trx, read_write); +} + +/** Start a transaction for a DDL operation. +@param trx transaction */ +void trx_start_for_ddl_low(trx_t *trx) +{ + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + trx->dict_operation= true; + trx_start_internal_low(trx, true); +} + +/*************************************************************//** +Set the transaction as a read-write transaction if it is not already +tagged as such. Read-only transactions that are writing to temporary +tables are assigned an ID and a rollback segment but are not added +to the trx read-write list because their updates should not be visible +to other transactions and therefore their changes can be ignored by +by MVCC. */ +void +trx_set_rw_mode( +/*============*/ + trx_t* trx) /*!< in/out: transaction that is RW */ +{ + ut_ad(trx->rsegs.m_redo.rseg == 0); + ut_ad(!trx->is_autocommit_non_locking()); + ut_ad(!trx->read_only); + ut_ad(trx->id == 0); + + if (high_level_read_only) { + return; + } + + trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); + ut_ad(trx->rsegs.m_redo.rseg != 0); + + trx_sys.register_rw(trx); + ut_ad(trx->id); + + /* So that we can see our own changes. 
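+   A read view that was opened while this transaction still had no ID
+   cannot know the ID that register_rw() just assigned, so rows written
+   from now on would look like some other transaction's uncommitted
+   changes; recording the new ID as the view's creator keeps them visible
+   to our own consistent reads.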
*/ + if (trx->read_view.is_open()) { + trx->read_view.set_creator_trx_id(trx->id); + } +} diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc new file mode 100644 index 00000000..cd21ebe1 --- /dev/null +++ b/storage/innobase/trx/trx0undo.cc @@ -0,0 +1,1581 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0undo.cc +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" +#include "fsp0fsp.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0rseg.h" +#include "log.h" + +/* How should the old versions in the history list be managed? + ---------------------------------------------------------- +If each transaction is given a whole page for its update undo log, file +space consumption can be 10 times higher than necessary. Therefore, +partly filled update undo log pages should be reusable. But then there +is no way individual pages can be ordered so that the ordering agrees +with the serialization numbers of the transactions on the pages. Thus, +the history list must be formed of undo logs, not their header pages as +it was in the old implementation. + However, on a single header page the transactions are placed in +the order of their serialization numbers. As old versions are purged, we +may free the page when the last transaction on the page has been purged. + A problem is that the purge has to go through the transactions +in the serialization order. This means that we have to look through all +rollback segments for the one that has the smallest transaction number +in its history list. + When should we do a purge? A purge is necessary when space is +running out in any of the rollback segments. Then we may have to purge +also old version which might be needed by some consistent read. How do +we trigger the start of a purge? When a transaction writes to an undo log, +it may notice that the space is running out. When a read view is closed, +it may make some history superfluous. The server can have an utility which +periodically checks if it can purge some history. + In a parallellized purge we have the problem that a query thread +can remove a delete marked clustered index record before another query +thread has processed an earlier version of the record, which cannot then +be done because the row cannot be constructed from the clustered index +record. 
To avoid this problem, we will store in the update and delete mark +undo record also the columns necessary to construct the secondary index +entries which are modified. + We can latch the stack of versions of a single clustered index record +by taking a latch on the clustered index page. As long as the latch is held, +no new versions can be added and no versions removed by undo. But, a purge +can still remove old versions from the bottom of the stack. */ + +/* How to protect rollback segments, undo logs, and history lists with + ------------------------------------------------------------------- +latches? +------- +When a transaction does its first insert or modify in the clustered index, an +undo log is assigned for it. Then we must have an x-latch to the rollback +segment header. + When the transaction performs modifications or rolls back, its +undo log is protected by undo page latches. +Only the thread that is associated with the transaction may hold multiple +undo page latches at a time. Undo pages are always private to a single +transaction. Other threads that are performing MVCC reads +or checking for implicit locks will lock at most one undo page at a time +in trx_undo_get_undo_rec_low(). + When the transaction commits, its persistent undo log is added +to the history list. If it is not suitable for reuse, its slot is reset. +In both cases, an x-latch must be acquired on the rollback segment header page. + The purge operation steps through the history list without modifying +it until a truncate operation occurs, which can remove undo logs from the end +of the list and release undo log segments. In stepping through the list, +s-latches on the undo log pages are enough, but in a truncate, x-latches must +be obtained on the rollback segment and individual pages. */ + +/********************************************************************//** +Creates and initializes an undo log memory object. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + uint32_t page_no,/*!< in: undo log header page number */ + uint16_t offset);/*!< in: undo log header byte offset on page */ + +/** Determine the start offset of undo log records of an undo log page. +@param[in] block undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset +@return start offset */ +static +uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no, + uint16_t offset) +{ + return page_no == block->page.id().page_no() + ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->page.frame) + : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; +} + +/** Get the first undo log record on a page. +@param[in] block undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to first record +@retval NULL if none exists */ +static trx_undo_rec_t* +trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no, + uint16_t offset) +{ + uint16_t start= trx_undo_page_get_start(block, page_no, offset); + return start == trx_undo_page_get_end(block, page_no, offset) + ? nullptr : block->page.frame + start; +} + +/** Get the last undo log record on a page. 
+@param[in] page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to last record +@retval NULL if none exists */ +static +trx_undo_rec_t* +trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no, + uint16_t offset) +{ + uint16_t end= trx_undo_page_get_end(block, page_no, offset); + return trx_undo_page_get_start(block, page_no, offset) == end + ? nullptr + : block->page.frame + mach_read_from_2(block->page.frame + end - 2); +} + +/** Get the previous record in an undo log from the previous page. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +static trx_undo_rec_t* +trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec, + uint32_t page_no, uint16_t offset, + bool shared, mtr_t *mtr) +{ + uint32_t prev_page_no= mach_read_from_4(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE + + FLST_PREV + FIL_ADDR_PAGE + + block->page.frame); + + if (prev_page_no == FIL_NULL) + return nullptr; + + block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no), + 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr); + + return block ? trx_undo_page_get_last_rec(block, page_no, offset) : nullptr; +} + +/** Get the previous undo log record. +@param[in] block undo log page +@param[in] rec undo log record +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to record +@retval NULL if none */ +static +trx_undo_rec_t* +trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec, + uint32_t page_no, uint16_t offset) +{ + ut_ad(block->page.frame == page_align(rec)); + return + rec == block->page.frame + trx_undo_page_get_start(block, page_no, offset) + ? nullptr + : block->page.frame + mach_read_from_2(rec - 2); +} + +/** Get the previous record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, + uint16_t offset, bool shared, mtr_t *mtr) +{ + if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block, + block->page.frame + rec, + page_no, offset)) + return prev; + + /* We have to go to the previous undo log page to look for the + previous record */ + + return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset, + shared, mtr); +} + +/** Get the next record in an undo log from the next page. 
+@param[in,out] block undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +static trx_undo_rec_t* +trx_undo_get_next_rec_from_next_page(const buf_block_t *&block, + uint32_t page_no, uint16_t offset, + ulint mode, mtr_t *mtr) +{ + if (page_no == block->page.id().page_no() && + mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG)) + return nullptr; + + uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + + FLST_NEXT + FIL_ADDR_PAGE + + block->page.frame); + if (next == FIL_NULL) + return nullptr; + + block= buf_page_get_gen(page_id_t(block->page.id().space(), next), 0, mode, + nullptr, BUF_GET_POSSIBLY_FREED, mtr); + + return block ? trx_undo_page_get_first_rec(block, page_no, offset) : nullptr; +} + +/** Get the next record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_next_rec(const buf_block_t *&block, uint16_t rec, + uint32_t page_no, uint16_t offset, mtr_t *mtr) +{ + if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no, + offset)) + return next; + + return trx_undo_get_next_rec_from_next_page(block, page_no, offset, + RW_S_LATCH, mtr); +} + +/** Get the first record in an undo log. +@param[in] space undo log header space +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH +@param[out] block undo log page +@param[in,out] mtr mini-transaction +@param[out] err error code +@return undo log record, the page latched +@retval nullptr if none */ +trx_undo_rec_t* +trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no, + uint16_t offset, ulint mode, const buf_block_t*& block, + mtr_t *mtr, dberr_t *err) +{ + block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode, + nullptr, BUF_GET, mtr, err); + if (!block) + return nullptr; + + if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset)) + return rec; + + return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode, + mtr); +} + +inline void UndorecApplier::assign_rec(const buf_block_t &block, + uint16_t offset) +{ + ut_ad(block.page.lock.have_s()); + this->offset= offset; + this->undo_rec= trx_undo_rec_copy(block.page.frame + offset, heap); +} + +inline void UndorecApplier::apply_undo_rec() +{ + if (!undo_rec) + return; + bool updated_extern= false; + undo_no_t undo_no= 0; + table_id_t table_id= 0; + undo_rec= trx_undo_rec_get_pars(undo_rec, &type, + &cmpl_info, + &updated_extern, &undo_no, &table_id); + dict_sys.freeze(SRW_LOCK_CALL); + dict_table_t *table= dict_sys.find_table(table_id); + dict_sys.unfreeze(); + + ut_ad(table); + if (!table->is_active_ddl()) + return; + + dict_index_t *index= dict_table_get_first_index(table); + const dtuple_t *undo_tuple; + switch (type) { + default: + ut_ad("invalid type" == 0); + MY_ASSERT_UNREACHABLE(); + case TRX_UNDO_INSERT_REC: + undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap); + insert: + log_insert(*undo_tuple, index); + break; + case TRX_UNDO_UPD_EXIST_REC: + case TRX_UNDO_UPD_DEL_REC: + case 
TRX_UNDO_DEL_MARK_REC: + trx_id_t trx_id; + roll_ptr_t roll_ptr; + byte info_bits; + undo_rec= trx_undo_update_rec_get_sys_cols( + undo_rec, &trx_id, &roll_ptr, &info_bits); + + undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap); + undo_rec= trx_undo_update_rec_get_update(undo_rec, index, type, trx_id, + roll_ptr, info_bits, + heap, &update); + if (type == TRX_UNDO_UPD_DEL_REC) + goto insert; + log_update(*undo_tuple, index); + } + + clear_undo_rec(); +} + +/** Apply any changes to tables for which online DDL is in progress. */ +ATTRIBUTE_COLD void trx_t::apply_log() +{ + const trx_undo_t *undo= rsegs.m_redo.undo; + if (!undo || !undo_no) + return; + page_id_t page_id{rsegs.m_redo.rseg->space->id, undo->hdr_page_no}; + page_id_t next_page_id(page_id); + mtr_t mtr; + mtr.start(); + buf_block_t *block= buf_page_get(page_id, 0, RW_S_LATCH, &mtr); + if (UNIV_UNLIKELY(!block)) + { + mtr.commit(); + return; + } + + UndorecApplier log_applier(page_id, id); + + for (;;) + { + trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_id.page_no(), + undo->hdr_offset); + while (rec) + { + log_applier.assign_rec(*block, page_offset(rec)); + mtr.commit(); + log_applier.apply_undo_rec(); + mtr.start(); + block= buf_page_get(log_applier.get_page_id(), 0, RW_S_LATCH, &mtr); + if (UNIV_UNLIKELY(!block)) + goto func_exit; + rec= trx_undo_page_get_next_rec(block, log_applier.get_offset(), + page_id.page_no(), undo->hdr_offset); + } + + uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + + FLST_NEXT + FIL_ADDR_PAGE + + block->page.frame); + if (next == FIL_NULL) + break; + next_page_id.set_page_no(next); + mtr.commit(); + mtr.start(); + block= buf_page_get_gen(next_page_id, 0, RW_S_LATCH, block, BUF_GET, &mtr); + if (UNIV_UNLIKELY(!block)) + break; + log_applier.assign_next(next_page_id); + } +func_exit: + mtr.commit(); + apply_online_log= false; +} + +/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ + +/** Initialize an undo log page. +NOTE: This corresponds to a redo log record and must not be changed! +@see mtr_t::undo_create() +@param block undo log page */ +void trx_undo_page_init(const buf_block_t &block) +{ + mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.page.frame), + FIL_PAGE_UNDO_LOG); + static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility"); + memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.page.frame, + 0, 2); + mach_write_to_2(my_assume_aligned<2> + (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame), + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame, + 2); + /* The following corresponds to flst_zero_both(), but without writing log. */ + memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV + + FIL_ADDR_PAGE + block.page.frame, 0xff, 4); + memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV + + FIL_ADDR_BYTE + block.page.frame, 0, 2); + memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT + + FIL_ADDR_PAGE + block.page.frame, 0xff, 4); + memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT + + FIL_ADDR_BYTE + block.page.frame, 0, 2); + static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 == + TRX_UNDO_PAGE_HDR_SIZE, "compatibility"); + /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. 
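+  In other words, zero everything from the byte just after the undo
+  segment header up to, but not including, the FIL_PAGE_DATA_END trailer
+  at the end of the page; the page header initialized above and the
+  segment header itself are left untouched.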
*/ + memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + + block.page.frame, 0, + srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + + FIL_PAGE_DATA_END)); +} + +/** Look for a free slot for an undo log segment. +@param rseg_header rollback segment header +@return slot index +@retval ULINT_UNDEFINED if not found */ +static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header) +{ + ulint max_slots= TRX_RSEG_N_SLOTS; + +#ifdef UNIV_DEBUG + if (trx_rseg_n_slots_debug) + max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS); +#endif + + for (ulint i= 0; i < max_slots; i++) + if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL) + return i; + + return ULINT_UNDEFINED; +} + +/** Create an undo log segment. +@param[in,out] space tablespace +@param[in,out] rseg_hdr rollback segment header (x-latched) +@param[out] id undo slot number +@param[out] err error code +@param[in,out] mtr mini-transaction +@return undo log block +@retval NULL on failure */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id, + dberr_t *err, mtr_t *mtr) +{ + buf_block_t* block; + uint32_t n_reserved; + + const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr); + + if (slot_no == ULINT_UNDEFINED) { + ib::warn() << "Cannot find a free slot for an undo log. Do" + " you have too many active transactions running" + " concurrently?"; + + *err = DB_TOO_MANY_CONCURRENT_TRXS; + return NULL; + } + + ut_ad(slot_no < TRX_RSEG_N_SLOTS); + + *err = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return NULL; + } + + /* Allocate a new file segment for the undo log */ + block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + mtr, err, true); + + space->release_free_extents(n_reserved); + + if (!block) { + return block; + } + + mtr->undo_create(*block); + trx_undo_page_init(*block); + + mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + block->page.frame, + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE); + mtr->write<2,mtr_t::MAYBE_NOP>(*block, + TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + block->page.frame, 0U); + + flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + block->page.frame, mtr); + + *err = flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, + mtr); + + *id = slot_no; + mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->page.frame, + block->page.id().page_no()); + + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); + + *err = DB_SUCCESS; + return block; +} + +/** Initialize an undo log header. +@param[in,out] undo_page undo log segment header page +@param[in] trx_id transaction identifier +@param[in,out] mtr mini-transaction +@return header byte offset on page */ +static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id, + mtr_t* mtr) +{ + /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being + repurposed after upgrading to MariaDB 10.3. 
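+  Before MariaDB 10.3 this field distinguished insert undo logs
+  (TRX_UNDO_INSERT == 1) from update undo logs (TRX_UNDO_UPDATE == 2);
+  that distinction no longer exists, so the field is simply written back
+  to 0, and the assertion below only tolerates the legacy values 0..2.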
*/ + byte *undo_type= my_assume_aligned<2> + (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->page.frame); + ut_ad(mach_read_from_2(undo_type) <= 2); + mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U); + byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + + undo_page->page.frame); + const uint16_t free= mach_read_from_2(start + 2); + static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE, + "compatibility"); + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100); + + mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE); + /* A WRITE of 2 bytes is never longer than a MEMMOVE. + So, WRITE 2+2 bytes is better than WRITE+MEMMOVE. + But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */ + memcpy_aligned<2>(start + 2, start, 2); + mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4, + start, 2); + uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + undo_page->page.frame); + alignas(4) byte buf[4]; + mach_write_to_2(buf, TRX_UNDO_ACTIVE); + mach_write_to_2(buf + 2, free); + static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility"); + static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment"); + mtr->memcpy(*undo_page, my_assume_aligned<4> + (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->page.frame), + buf, 4); + if (prev_log) + mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + + undo_page->page.frame, free); + mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID + + undo_page->page.frame, trx_id); + if (UNIV_UNLIKELY(mach_read_from_8(free + TRX_UNDO_TRX_NO + + undo_page->page.frame) != 0)) + mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0); + + /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */ + mach_write_to_2(buf, 1); + memcpy_aligned<2>(buf + 2, start, 2); + static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START, + "compatibility"); + mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE + + undo_page->page.frame, buf, 4); + /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. 
*/ + if (prev_log) + { + mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS, + TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0); + mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG + + undo_page->page.frame, prev_log); + static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE, + "compatibility"); + mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0); + static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE + + FLST_NODE_SIZE, "compatibility"); + } + else + mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS, + TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0); + return free; +} + +/** Write X/Open XA Transaction Identifier (XID) to undo log header +@param[in,out] block undo header page +@param[in] offset undo header record offset +@param[in] xid distributed transaction identifier +@param[in,out] mtr mini-transaction */ +static void trx_undo_write_xid(buf_block_t *block, uint16_t offset, + const XID &xid, mtr_t *mtr) +{ + DBUG_ASSERT(xid.gtrid_length > 0); + DBUG_ASSERT(xid.bqual_length >= 0); + DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE); + DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE); + static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE, + "gtrid and bqual don't fit xid data"); + DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + block->page.frame) == offset); + + trx_ulogf_t* log_hdr= block->page.frame + offset; + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT, + static_cast<uint32_t>(xid.formatID)); + mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN, + static_cast<uint32_t>(xid.gtrid_length)); + mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN, + static_cast<uint32_t>(xid.bqual_length)); + const ulint xid_length= static_cast<ulint>(xid.gtrid_length + + xid.bqual_length); + mtr->memcpy(*block, &block->page.frame[offset + TRX_UNDO_XA_XID], + xid.data, xid_length); + if (UNIV_LIKELY(xid_length < XIDDATASIZE)) + mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length, + XIDDATASIZE - xid_length, 0); +} + +/********************************************************************//** +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid) +{ + xid->formatID=static_cast<long>(mach_read_from_4( + log_hdr + TRX_UNDO_XA_FORMAT)); + + xid->gtrid_length=static_cast<long>(mach_read_from_4( + log_hdr + TRX_UNDO_XA_TRID_LEN)); + + xid->bqual_length=static_cast<long>(mach_read_from_4( + log_hdr + TRX_UNDO_XA_BQUAL_LEN)); + + memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); +} + +/** Allocate an undo log page. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any page latch +@param[out] err error code +@return X-latched block if success +@retval nullptr on failure */ +buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err) +{ + buf_block_t *new_block= nullptr; + uint32_t n_reserved; + + /* When we add a page to an undo log, this is analogous to + a pessimistic insert in a B-tree, and we must reserve the + counterpart of the tree latch, which is the rseg mutex. 
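+
+  Concretely, rseg->latch is held in exclusive mode for the whole
+  allocation below, which serialises concurrent transactions that grow
+  undo logs in the same rollback segment and keeps undo->size and
+  rseg->curr_size consistent with the page list that flst_add_last()
+  extends.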
*/ + + trx_rseg_t *rseg= undo->rseg; + rseg->latch.wr_lock(SRW_LOCK_CALL); + + buf_block_t *header_block= + buf_page_get_gen(page_id_t{rseg->space->id, undo->hdr_page_no}, + 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); + if (!header_block) + goto func_exit; + *err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr); + + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + goto func_exit; + + new_block= + fseg_alloc_free_page_general(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + header_block->page.frame, + undo->top_page_no + 1, FSP_UP, true, + mtr, mtr, err); + rseg->space->release_free_extents(n_reserved); + + if (!new_block) + goto func_exit; + + undo->last_page_no= new_block->page.id().page_no(); + + mtr->undo_create(*new_block); + trx_undo_page_init(*new_block); + *err= flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + new_block= nullptr; + else + { + undo->size++; + rseg->curr_size++; + } + +func_exit: + rseg->latch.wr_unlock(); + return new_block; +} + +/********************************************************************//** +Frees an undo log page that is not the header page. +@return last page number in remaining log */ +static +uint32_t +trx_undo_free_page( +/*===============*/ + trx_rseg_t* rseg, /*!< in: rollback segment */ + bool in_history, /*!< in: TRUE if the undo log is in the history + list */ + uint32_t hdr_page_no, /*!< in: header page number */ + uint32_t page_no, /*!< in: page number to free: must not be the + header page */ + mtr_t* mtr, /*!< in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ + dberr_t* err) /*!< out: error code */ +{ + ut_a(hdr_page_no != page_no); + + buf_block_t* undo_block = buf_page_get_gen(page_id_t(rseg->space->id, + page_no), + 0, RW_X_LATCH, nullptr, + BUF_GET, mtr, err); + if (UNIV_UNLIKELY(!undo_block)) { + return FIL_NULL; + } + buf_block_t* header_block = buf_page_get_gen(page_id_t(rseg->space->id, + hdr_page_no), + 0, RW_X_LATCH, nullptr, + BUF_GET, mtr, err); + if (UNIV_UNLIKELY(!header_block)) { + return FIL_NULL; + } + + *err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, + mtr); + + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return FIL_NULL; + } + + *err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + header_block->page.frame, + rseg->space, page_no, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return FIL_NULL; + } + buf_page_free(rseg->space, page_no, mtr); + + const fil_addr_t last_addr = flst_get_last( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + header_block->page.frame); + rseg->curr_size--; + + if (!in_history) { + } else if (buf_block_t* rseg_header = rseg->get(mtr, err)) { + byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE + + rseg_header->page.frame; + uint32_t hist_size = mach_read_from_4(rseg_hist_size); + ut_ad(hist_size > 0); + mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1); + } else { + return FIL_NULL; + } + + return(last_addr.page); +} + +/** Free the last undo log page. The caller must hold the rseg mutex. 
+@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any undo log page + or that has allocated the undo log page +@return error code */ +dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr) +{ + ut_ad(undo->hdr_page_no != undo->last_page_no); + ut_ad(undo->size > 0); + undo->size--; + + dberr_t err; + undo->last_page_no= trx_undo_free_page(undo->rseg, false, undo->hdr_page_no, + undo->last_page_no, mtr, &err); + return err; +} + +/** Truncate the tail of an undo log during rollback. +@param[in,out] undo undo log +@param[in] limit all undo logs after this limit will be discarded +@param[in] is_temp whether this is temporary undo log +@return error code */ +static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit, + bool is_temp) +{ + ut_ad(is_temp == !undo.rseg->is_persistent()); + + for (mtr_t mtr;;) + { + mtr.start(); + if (is_temp) + mtr.set_log_mode(MTR_LOG_NO_REDO); + + trx_undo_rec_t *trunc_here= nullptr; + undo.rseg->latch.wr_lock(SRW_LOCK_CALL); + dberr_t err; + buf_block_t *undo_block= + buf_page_get_gen(page_id_t{undo.rseg->space->id, undo.last_page_no}, + 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err); + if (UNIV_UNLIKELY(!undo_block)) + goto func_exit; + + for (trx_undo_rec_t *rec= + trx_undo_page_get_last_rec(undo_block, + undo.hdr_page_no, undo.hdr_offset); + rec; ) + { + if (trx_undo_rec_get_undo_no(rec) < limit) + goto func_exit; + /* Truncate at least this record off, maybe more */ + trunc_here= rec; + rec= trx_undo_page_get_prev_rec(undo_block, rec, + undo.hdr_page_no, undo.hdr_offset); + } + + if (undo.last_page_no != undo.hdr_page_no) + { + err= trx_undo_free_last_page(&undo, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + goto func_exit; + undo.rseg->latch.wr_unlock(); + mtr.commit(); + continue; + } + +func_exit: + undo.rseg->latch.wr_unlock(); + + if (trunc_here && err == DB_SUCCESS) + mtr.write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_block->page.frame, + ulint(trunc_here - undo_block->page.frame)); + + mtr.commit(); + return err; + } +} + +/** Try to truncate the undo logs. +@param trx transaction +@return error code */ +dberr_t trx_undo_try_truncate(const trx_t &trx) +{ + if (trx_undo_t *undo= trx.rsegs.m_redo.undo) + { + ut_ad(undo->rseg == trx.rsegs.m_redo.rseg); + if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, false)) + return err; + } + + if (trx_undo_t *undo = trx.rsegs.m_noredo.undo) + { + ut_ad(undo->rseg == trx.rsegs.m_noredo.rseg); + if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, true)) + return err; + } + + return DB_SUCCESS; +} + +/** Truncate the head of an undo log. +NOTE that only whole pages are freed; the header page is not +freed, but emptied, if all the records there are below the limit. 
+@param[in,out] rseg rollback segment +@param[in] hdr_page_no header page number +@param[in] hdr_offset header offset on the page +@param[in] limit first undo number to preserve +(everything below the limit will be truncated) +@return error code */ +dberr_t +trx_undo_truncate_start( + trx_rseg_t* rseg, + uint32_t hdr_page_no, + uint16_t hdr_offset, + undo_no_t limit) +{ + trx_undo_rec_t* rec; + trx_undo_rec_t* last_rec; + mtr_t mtr; + + if (!limit) { + return DB_SUCCESS; + } +loop: + mtr_start(&mtr); + + if (!rseg->is_persistent()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } + + dberr_t err; + const buf_block_t* undo_page; + rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset, + RW_X_LATCH, undo_page, &mtr, &err); + if (rec == NULL) { + /* Already empty */ +done: + mtr.commit(); + return err; + } + + last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no, + hdr_offset); + if (trx_undo_rec_get_undo_no(last_rec) >= limit) { + goto done; + } + + if (undo_page->page.id().page_no() == hdr_page_no) { + uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG + + undo_page->page.frame); + if (end == 0) { + end = mach_read_from_2(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE + + undo_page->page.frame); + } + + mtr.write<2>(*undo_page, undo_page->page.frame + hdr_offset + + TRX_UNDO_LOG_START, end); + } else { + trx_undo_free_page(rseg, true, hdr_page_no, + undo_page->page.id().page_no(), &mtr, &err); + if (err != DB_SUCCESS) { + goto done; + } + } + + mtr.commit(); + goto loop; +} + +/** Frees an undo log segment which is not in the history list. +@param undo temporary undo log */ +static void trx_undo_seg_free(const trx_undo_t *undo) +{ + ut_ad(undo->id < TRX_RSEG_N_SLOTS); + + trx_rseg_t *const rseg= undo->rseg; + bool finished; + mtr_t mtr; + ut_ad(rseg->space == fil_system.temp_space); + + do + { + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + finished= true; + + if (buf_block_t *block= + buf_page_get(page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), 0, + RW_X_LATCH, &mtr)) + { + fseg_header_t *file_seg= TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + + block->page.frame; + + finished= fseg_free_step(file_seg, &mtr); + + if (!finished); + else if (buf_block_t* rseg_header = rseg->get(&mtr, nullptr)) + { + static_assert(FIL_NULL == 0xffffffff, "compatibility"); + mtr.memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED); + } + } + + mtr.commit(); + } + while (!finished); +} + +/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ + +/** Read an undo log when starting up the database. 
+@param[in,out] rseg rollback segment +@param[in] id rollback segment slot +@param[in] page_no undo log segment page number +@param[in,out] max_trx_id the largest observed transaction ID +@return the undo log +@retval nullptr on error */ +trx_undo_t * +trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no, + trx_id_t &max_trx_id) +{ + mtr_t mtr; + XID xid; + + ut_ad(id < TRX_RSEG_N_SLOTS); + + mtr.start(); + const buf_block_t* block = buf_page_get( + page_id_t(rseg->space->id, page_no), 0, RW_X_LATCH, &mtr); + if (UNIV_UNLIKELY(!block)) { +corrupted: + mtr.commit(); + return nullptr; + } + + const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE + + block->page.frame); + if (UNIV_UNLIKELY(type > 2)) { +corrupted_type: + sql_print_error("InnoDB: unsupported undo header type %u", + type); + goto corrupted; + } + + uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG + + block->page.frame); + if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE || + offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) { + sql_print_error("InnoDB: invalid undo header offset %u", + offset); + goto corrupted; + } + + const trx_ulogf_t* const undo_header = block->page.frame + offset; + uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + + block->page.frame); + switch (state) { + case TRX_UNDO_ACTIVE: + case TRX_UNDO_PREPARED: + if (UNIV_LIKELY(type != 1)) { + break; + } + sql_print_error("InnoDB: upgrade from older version than" + " MariaDB 10.3 requires clean shutdown"); + goto corrupted; + default: + sql_print_error("InnoDB: unsupported undo header state %u", + state); + goto corrupted; + case TRX_UNDO_CACHED: + if (UNIV_UNLIKELY(type != 0)) { + /* This undo page was not updated by MariaDB + 10.3 or later. The TRX_UNDO_TRX_NO field may + contain garbage. */ + break; + } + goto read_trx_no; + case TRX_UNDO_TO_PURGE: + if (UNIV_UNLIKELY(type == 1)) { + goto corrupted_type; + } + read_trx_no: + trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header); + if (id >> 48) { + sql_print_error("InnoDB: corrupted TRX_NO %llx", id); + goto corrupted; + } + if (id > max_trx_id) { + max_trx_id = id; + } + } + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. 
*/ + + if (undo_header[TRX_UNDO_XID_EXISTS]) { + trx_undo_read_xid(undo_header, &xid); + } else { + xid.null(); + } + + trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + if (trx_id >> 48) { + sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id); + goto corrupted; + } + if (trx_id > max_trx_id) { + max_trx_id = trx_id; + } + + trx_undo_t* undo = trx_undo_mem_create( + rseg, id, trx_id, &xid, page_no, offset); + if (!undo) { + return undo; + } + + undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS]; + undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + block->page.frame); + + fil_addr_t last_addr = flst_get_last( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame); + + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; + + const buf_block_t* last = buf_page_get( + page_id_t(rseg->space->id, undo->last_page_no), 0, + RW_X_LATCH, &mtr); + + if (UNIV_UNLIKELY(!last)) { + ut_free(undo); + goto corrupted; + } + + if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec( + last, page_no, offset)) { + undo->top_offset = static_cast<uint16_t>( + rec - last->page.frame); + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + ut_ad(!undo->empty()); + } else { + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); + } + + undo->state = state; + + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(rseg->undo_list, undo); + } else { + UT_LIST_ADD_LAST(rseg->undo_cached, undo); + MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED); + } + + mtr.commit(); + return undo; +} + +/********************************************************************//** +Creates and initializes an undo log memory object. +@return own: the undo log memory object */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + trx_rseg_t* rseg, /*!< in: rollback segment memory object */ + ulint id, /*!< in: slot index within rseg */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open transaction identification */ + uint32_t page_no,/*!< in: undo log header page number */ + uint16_t offset) /*!< in: undo log header byte offset on page */ +{ + trx_undo_t* undo; + + ut_a(id < TRX_RSEG_N_SLOTS); + + undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo))); + + if (undo == NULL) { + + return(NULL); + } + + undo->id = id; + undo->state = TRX_UNDO_ACTIVE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->rseg = rseg; + + undo->hdr_page_no = page_no; + undo->hdr_offset = offset; + undo->last_page_no = page_no; + undo->size = 1; + + undo->top_undo_no = IB_ID_MAX; + undo->top_page_no = page_no; + undo->guess_block = NULL; + ut_ad(undo->empty()); + + return(undo); +} + +/********************************************************************//** +Initializes a cached undo log object for new use. */ +static +void +trx_undo_mem_init_for_reuse( +/*========================*/ + trx_undo_t* undo, /*!< in: undo log to init */ + trx_id_t trx_id, /*!< in: id of the trx for which the undo log + is created */ + const XID* xid, /*!< in: X/Open XA transaction identification*/ + uint16_t offset) /*!< in: undo log header byte offset on page */ +{ + ut_a(undo->id < TRX_RSEG_N_SLOTS); + + undo->state = TRX_UNDO_ACTIVE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->hdr_offset = offset; + undo->top_undo_no = IB_ID_MAX; + ut_ad(undo->empty()); +} + +/** Create an undo log. 
+@param[in,out] trx transaction +@param[in,out] rseg rollback segment +@param[out] undo undo log object +@param[out] err error code +@param[in,out] mtr mini-transaction +@return undo log block +@retval NULL on failure */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) +{ + ulint id; + buf_block_t* block = rseg->get(mtr, err); + + if (block) { + block = trx_undo_seg_create(rseg->space, block, &id, err, mtr); + } + + if (!block) { + return NULL; + } + + rseg->curr_size++; + + uint16_t offset = trx_undo_header_create(block, trx->id, mtr); + + *undo = trx_undo_mem_create(rseg, id, trx->id, &trx->xid, + block->page.id().page_no(), offset); + if (*undo == NULL) { + *err = DB_OUT_OF_MEMORY; + /* FIXME: this will not free the undo block to the file */ + return NULL; + } else if (rseg != trx->rsegs.m_redo.rseg) { + return block; + } + + if (trx->dict_operation) { + (*undo)->dict_operation = true; + mtr->write<1,mtr_t::MAYBE_NOP>(*block, + block->page.frame + offset + + TRX_UNDO_DICT_TRANS, 1U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, + block->page.frame + offset + + TRX_UNDO_TABLE_ID, 0U); + } + + *err = DB_SUCCESS; + return block; +} + +/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ + +/** Reuse a cached undo log block. +@param[in,out] trx transaction +@param[in,out] rseg rollback segment +@param[out] pundo the undo log memory object +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL if none cached */ +static +buf_block_t* +trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, + mtr_t* mtr) +{ + trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached); + if (!undo) { + return NULL; + } + + ut_ad(undo->size == 1); + ut_ad(undo->id < TRX_RSEG_N_SLOTS); + + buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id, + undo->hdr_page_no), + 0, RW_X_LATCH, mtr); + if (!block) { + return NULL; + } + + UT_LIST_REMOVE(rseg->undo_cached, undo); + MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED); + + *pundo = undo; + + uint16_t offset = trx_undo_header_create(block, trx->id, mtr); + + trx_undo_mem_init_for_reuse(undo, trx->id, &trx->xid, offset); + + if (rseg != trx->rsegs.m_redo.rseg) { + return block; + } + + if (trx->dict_operation) { + undo->dict_operation = TRUE; + mtr->write<1,mtr_t::MAYBE_NOP>(*block, + block->page.frame + offset + + TRX_UNDO_DICT_TRANS, 1U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, + block->page.frame + offset + + TRX_UNDO_TABLE_ID, 0U); + } + + return block; +} + +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. 
+@param[in,out] trx transaction +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) +{ + ut_ad(mtr->get_log_mode() == MTR_LOG_ALL); + + trx_undo_t* undo = trx->rsegs.m_redo.undo; + + if (undo) { + return buf_page_get_gen( + page_id_t(undo->rseg->space->id, undo->last_page_no), + 0, RW_X_LATCH, undo->guess_block, + BUF_GET, mtr, err); + } + + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + + rseg->latch.wr_lock(SRW_LOCK_CALL); + buf_block_t* block = trx_undo_reuse_cached( + trx, rseg, &trx->rsegs.m_redo.undo, mtr); + + if (!block) { + block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo, + err, mtr); + ut_ad(!block == (*err != DB_SUCCESS)); + if (!block) { + goto func_exit; + } + } else { + *err = DB_SUCCESS; + } + + UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo); + +func_exit: + rseg->latch.wr_unlock(); + return block; +} + +/** Assign an undo log for a transaction. +A new undo log is created or a cached undo log reused. +@param[in,out] trx transaction +@param[in] rseg rollback segment +@param[out] undo the undo log +@param[out] err error code +@param[in,out] mtr mini-transaction +@return the undo log block +@retval NULL on error */ +buf_block_t* +trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, + dberr_t* err, mtr_t* mtr) +{ + ut_d(const bool is_temp = rseg == trx->rsegs.m_noredo.rseg); + ut_ad(rseg == trx->rsegs.m_redo.rseg + || rseg == trx->rsegs.m_noredo.rseg); + ut_ad(undo == (is_temp + ? &trx->rsegs.m_noredo.undo + : &trx->rsegs.m_redo.undo)); + ut_ad(mtr->get_log_mode() + == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL)); + + if (*undo) { + return buf_page_get_gen( + page_id_t(rseg->space->id, (*undo)->last_page_no), + 0, RW_X_LATCH, (*undo)->guess_block, + BUF_GET, mtr, err); + } + + DBUG_EXECUTE_IF( + "ib_create_table_fail_too_many_trx", + *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL; + ); + + rseg->latch.wr_lock(SRW_LOCK_CALL); + + buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr); + + if (!block) { + block = trx_undo_create(trx, rseg, undo, err, mtr); + ut_ad(!block == (*err != DB_SUCCESS)); + if (!block) { + goto func_exit; + } + } else { + *err = DB_SUCCESS; + } + + UT_LIST_ADD_FIRST(rseg->undo_list, *undo); + +func_exit: + rseg->latch.wr_unlock(); + return block; +} + +/******************************************************************//** +Sets the state of the undo log segment at a transaction finish. +@return undo log segment header page, x-latched */ +buf_block_t* +trx_undo_set_state_at_finish( +/*=========================*/ + trx_undo_t* undo, /*!< in: undo log memory copy */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(undo->id < TRX_RSEG_N_SLOTS); + + buf_block_t *block= + buf_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0, + RW_X_LATCH, mtr); + /* This function is invoked during transaction commit, which is not + allowed to fail. If we get a corrupted undo header, we will crash here. */ + ut_a(block); + const uint16_t state = undo->size == 1 && + TRX_UNDO_PAGE_REUSE_LIMIT > + mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + block->page.frame) + ? TRX_UNDO_CACHED + : TRX_UNDO_TO_PURGE; + + undo->state= state; + mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->page.frame, + state); + return block; +} + +/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK. 
+@param[in,out]	trx		transaction
+@param[in,out]	undo		undo log
+@param[in]	rollback	false=XA PREPARE, true=XA ROLLBACK
+@param[in,out]	mtr		mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+                                   mtr_t *mtr)
+{
+	ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+	buf_block_t* block = buf_page_get(
+		page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+		RW_X_LATCH, mtr);
+	if (UNIV_UNLIKELY(!block)) {
+		/* In case of !rollback the undo header page
+		corruption would leave the transaction object in an
+		unexpected (active) state. */
+		ut_a(rollback);
+		return;
+	}
+
+	if (rollback) {
+		ut_ad(undo->state == TRX_UNDO_PREPARED);
+		mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+			      + block->page.frame, TRX_UNDO_ACTIVE);
+		return;
+	}
+
+	/*------------------------------*/
+	ut_ad(undo->state == TRX_UNDO_ACTIVE);
+	undo->state = TRX_UNDO_PREPARED;
+	undo->xid = trx->xid;
+	/*------------------------------*/
+
+	mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+		      + block->page.frame, undo->state);
+	uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+					   + block->page.frame);
+	mtr->write<1>(*block, block->page.frame + offset + TRX_UNDO_XID_EXISTS,
+		      1U);
+
+	trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo	temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo)
+{
+	trx_rseg_t* rseg = undo->rseg;
+	ut_ad(rseg->space == fil_system.temp_space);
+
+	rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+	UT_LIST_REMOVE(rseg->undo_list, undo);
+
+	if (undo->state == TRX_UNDO_CACHED) {
+		UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+		MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+		undo = nullptr;
+	} else {
+		ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+		/* Delete first the undo log segment in the file */
+		trx_undo_seg_free(undo);
+
+		ut_ad(rseg->curr_size > undo->size);
+		rseg->curr_size -= undo->size;
+	}
+
+	rseg->latch.wr_unlock();
+	ut_free(undo);
+}
+
+/** At shutdown, frees the undo logs of a transaction. */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+	if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+		switch (undo->state) {
+		case TRX_UNDO_PREPARED:
+			break;
+		case TRX_UNDO_CACHED:
+		case TRX_UNDO_TO_PURGE:
+			ut_ad(trx_state_eq(trx,
+					   TRX_STATE_COMMITTED_IN_MEMORY));
+			/* fall through */
+		case TRX_UNDO_ACTIVE:
+			/* trx_t::commit_state() assigns
+			trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+			ut_a(!srv_was_started
+			     || srv_read_only_mode
+			     || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+			     || srv_fast_shutdown);
+			break;
+		default:
+			ut_error;
+		}
+
+		UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+		ut_free(undo);
+		undo = NULL;
+	}
+	if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+		ut_a(undo->state == TRX_UNDO_PREPARED);
+
+		UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+		ut_free(undo);
+		undo = NULL;
+	}
+}
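/* The snippet below is a minimal, self-contained sketch of the undo log
   state transitions implemented by trx_undo_set_state_at_finish(),
   trx_undo_set_state_at_prepare() and trx_undo_commit_cleanup() above.
   It is illustrative only and is not part of trx0undo.cc: the enum and the
   helper names are invented here, and the real code additionally persists
   the state in the TRX_UNDO_STATE field of the undo segment header page
   under a mini-transaction while holding the rollback segment latch. */

#include <cassert>

enum undo_state { UNDO_ACTIVE, UNDO_PREPARED, UNDO_CACHED, UNDO_TO_PURGE };

/* Transition at transaction finish: a single-page, mostly empty undo log is
   cached for reuse, anything else is handed to purge. "small_enough" stands
   for the TRX_UNDO_PAGE_REUSE_LIMIT check on TRX_UNDO_PAGE_FREE that
   trx_undo_set_state_at_finish() performs on the header page. */
static undo_state state_at_finish(bool single_page, bool small_enough)
{
  return (single_page && small_enough) ? UNDO_CACHED : UNDO_TO_PURGE;
}

/* Transition at XA PREPARE, or back to ACTIVE at XA ROLLBACK, mirroring
   trx_undo_set_state_at_prepare(). */
static undo_state state_at_prepare(undo_state s, bool rollback)
{
  if (rollback) {
    assert(s == UNDO_PREPARED);
    return UNDO_ACTIVE;
  }
  assert(s == UNDO_ACTIVE);
  return UNDO_PREPARED;
}

int main()
{
  assert(state_at_finish(true, true) == UNDO_CACHED);
  assert(state_at_finish(false, true) == UNDO_TO_PURGE);
  assert(state_at_prepare(UNDO_ACTIVE, false) == UNDO_PREPARED);
  assert(state_at_prepare(UNDO_PREPARED, true) == UNDO_ACTIVE);
  return 0;
}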