diff options
Diffstat (limited to 'storage/innobase/trx/trx0trx.cc')
-rw-r--r-- | storage/innobase/trx/trx0trx.cc | 2300 |
1 files changed, 2300 insertions, 0 deletions
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc new file mode 100644 index 00000000..cf8fa17c --- /dev/null +++ b/storage/innobase/trx/trx0trx.cc @@ -0,0 +1,2300 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file trx/trx0trx.cc +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" + +#ifdef WITH_WSREP +#include <mysql/service_wsrep.h> +#endif + +#include <mysql/service_thd_error_context.h> + +#include "btr0sea.h" +#include "lock0lock.h" +#include "log0log.h" +#include "que0que.h" +#include "srv0mon.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "trx0xa.h" +#include "ut0pool.h" +#include "ut0vec.h" + +#include <set> +#include <new> + +/** The bit pattern corresponding to TRX_ID_MAX */ +const byte trx_id_max_bytes[8] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** The bit pattern corresponding to max timestamp */ +const byte timestamp_max_bytes[7] = { + 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f +}; + + +static const ulint MAX_DETAILED_ERROR_LEN = 256; + +/** Set of table_id */ +typedef std::set< + table_id_t, + std::less<table_id_t>, + ut_allocator<table_id_t> > table_id_set; + +/*************************************************************//** +Set detailed error message for the transaction. */ +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /*!< in: transaction struct */ + const char* msg) /*!< in: detailed error message */ +{ + strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1); + trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0'; +} + +/*************************************************************//** +Set detailed error message for the transaction from a file. Note that the +file is rewinded before reading from it. */ +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /*!< in: transaction struct */ + FILE* file) /*!< in: file to read message from */ +{ + os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN); +} + +/********************************************************************//** +Initialize transaction object. +@param trx trx to initialize */ +static +void +trx_init( +/*=====*/ + trx_t* trx) +{ + trx->state = TRX_STATE_NOT_STARTED; + + trx->is_recovered = false; + + trx->op_info = ""; + + trx->active_commit_ordered = false; + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->check_foreigns = true; + + trx->check_unique_secondary = true; + + trx->lock.n_rec_locks = 0; + + trx->dict_operation = TRX_DICT_OP_NONE; + + trx->table_id = 0; + + trx->error_state = DB_SUCCESS; + + trx->error_key_num = ULINT_UNDEFINED; + + trx->undo_no = 0; + + trx->rsegs.m_redo.rseg = NULL; + + trx->rsegs.m_noredo.rseg = NULL; + + trx->read_only = false; + + trx->auto_commit = false; + + trx->will_lock = false; + + trx->ddl = false; + + trx->internal = false; + + ut_d(trx->start_file = 0); + + ut_d(trx->start_line = 0); + + trx->magic_n = TRX_MAGIC_N; + + trx->lock.que_state = TRX_QUE_RUNNING; + + trx->last_sql_stat_start.least_undo_no = 0; + + ut_ad(!trx->read_view.is_open()); + + trx->lock.rec_cached = 0; + + trx->lock.table_cached = 0; +#ifdef WITH_WSREP + ut_ad(!trx->wsrep); + ut_ad(!trx->wsrep_UK_scan); +#endif /* WITH_WSREP */ +} + +/** For managing the life-cycle of the trx_t instance that we get +from the pool. */ +struct TrxFactory { + + /** Initializes a transaction object. It must be explicitly started + with trx_start_if_not_started() before using it. The default isolation + level is TRX_ISO_REPEATABLE_READ. + @param trx Transaction instance to initialise */ + static void init(trx_t* trx) + { + /* Explicitly call the constructor of the already + allocated object. trx_t objects are allocated by + ut_zalloc_nokey() in Pool::Pool() which would not call + the constructors of the trx_t members. */ + new(&trx->mod_tables) trx_mod_tables_t(); + + new(&trx->lock.table_locks) lock_list(); + + new(&trx->read_view) ReadView(); + + trx->rw_trx_hash_pins = 0; + trx_init(trx); + + trx->dict_operation_lock_mode = 0; + + trx->xid = UT_NEW_NOKEY(xid_t()); + + trx->detailed_error = reinterpret_cast<char*>( + ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN)); + + trx->lock.lock_heap = mem_heap_create_typed( + 1024, MEM_HEAP_FOR_LOCK_HEAP); + + lock_trx_lock_list_init(&trx->lock.trx_locks); + + UT_LIST_INIT(trx->lock.evicted_tables, + &dict_table_t::table_LRU); + + UT_LIST_INIT( + trx->trx_savepoints, + &trx_named_savept_t::trx_savepoints); + + mutex_create(LATCH_ID_TRX, &trx->mutex); + } + + /** Release resources held by the transaction object. + @param trx the transaction for which to release resources */ + static void destroy(trx_t* trx) + { +#ifdef __SANITIZE_ADDRESS__ + /* Unpoison the memory for AddressSanitizer */ + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); +#elif !__has_feature(memory_sanitizer) + /* In Valgrind, we cannot cancel MEM_NOACCESS() without + changing the state of the V bits (which indicate + which bits are initialized). + We will declare the contents as initialized. + We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */ + MEM_MAKE_DEFINED(trx, sizeof *trx); +#endif + + ut_a(trx->magic_n == TRX_MAGIC_N); + ut_ad(!trx->mysql_thd); + + ut_a(trx->lock.wait_lock == NULL); + ut_a(trx->lock.wait_thr == NULL); + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock.lock_heap != NULL) { + mem_heap_free(trx->lock.lock_heap); + trx->lock.lock_heap = NULL; + } + + ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); + + UT_DELETE(trx->xid); + ut_free(trx->detailed_error); + + mutex_free(&trx->mutex); + + trx->mod_tables.~trx_mod_tables_t(); + + ut_ad(!trx->read_view.is_open()); + + trx->lock.table_locks.~lock_list(); + + trx->read_view.~ReadView(); + } +}; + +/** The lock strategy for TrxPool */ +struct TrxPoolLock { + TrxPoolLock() { } + + /** Create the mutex */ + void create() + { + mutex_create(LATCH_ID_TRX_POOL, &m_mutex); + } + + /** Acquire the mutex */ + void enter() { mutex_enter(&m_mutex); } + + /** Release the mutex */ + void exit() { mutex_exit(&m_mutex); } + + /** Free the mutex */ + void destroy() { mutex_free(&m_mutex); } + + /** Mutex to use */ + ib_mutex_t m_mutex; +}; + +/** The lock strategy for the TrxPoolManager */ +struct TrxPoolManagerLock { + TrxPoolManagerLock() { } + + /** Create the mutex */ + void create() + { + mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex); + } + + /** Acquire the mutex */ + void enter() { mutex_enter(&m_mutex); } + + /** Release the mutex */ + void exit() { mutex_exit(&m_mutex); } + + /** Free the mutex */ + void destroy() { mutex_free(&m_mutex); } + + /** Mutex to use */ + ib_mutex_t m_mutex; +}; + +/** Use explicit mutexes for the trx_t pool and its manager. */ +typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t; +typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t; + +/** The trx_t pool manager */ +static trx_pools_t* trx_pools; + +/** Size of on trx_t pool in bytes. */ +static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4; + +/** Create the trx_t pool */ +void +trx_pool_init() +{ + trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE)); + + ut_a(trx_pools != 0); +} + +/** Destroy the trx_t pool */ +void +trx_pool_close() +{ + UT_DELETE(trx_pools); + + trx_pools = 0; +} + +/** @return an allocated transaction */ +trx_t *trx_create() +{ + trx_t* trx = trx_pools->get(); + +#ifdef __SANITIZE_ADDRESS__ + /* Unpoison the memory for AddressSanitizer. + It may have been poisoned in trx_t::free().*/ + MEM_MAKE_ADDRESSABLE(trx, sizeof *trx); +#elif !__has_feature(memory_sanitizer) + /* In Valgrind, we cannot cancel MEM_NOACCESS() without + changing the state of the V bits (which indicate + which bits are initialized). + We will declare the contents as initialized. + We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */ + MEM_MAKE_DEFINED(trx, sizeof *trx); +#endif + + trx->assert_freed(); + + mem_heap_t* heap; + ib_alloc_t* alloc; + + /* We just got trx from pool, it should be non locking */ + ut_ad(!trx->will_lock); + ut_ad(!trx->rw_trx_hash_pins); + + DBUG_LOG("trx", "Create: " << trx); + + heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8); + + alloc = ib_heap_allocator_create(heap); + + trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4); + + ut_ad(trx->mod_tables.empty()); + ut_ad(trx->lock.n_rec_locks == 0); + ut_ad(trx->lock.table_cached == 0); + ut_ad(trx->lock.rec_cached == 0); + ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0); + +#ifdef WITH_WSREP + ut_ad(!trx->wsrep_UK_scan); +#endif /* WITH_WSREP */ + + trx_sys.register_trx(trx); + + return(trx); +} + +/** Free the memory to trx_pools */ +void trx_t::free() +{ + MEM_CHECK_DEFINED(this, sizeof *this); + + ut_ad(!n_mysql_tables_in_use); + ut_ad(!mysql_log_file_name); + ut_ad(!mysql_n_tables_locked); + ut_ad(!internal); + ut_ad(!will_lock); + ut_ad(error_state == DB_SUCCESS); + ut_ad(magic_n == TRX_MAGIC_N); + ut_ad(!read_only); + ut_ad(!lock.wait_lock); + + dict_operation= TRX_DICT_OP_NONE; + trx_sys.deregister_trx(this); + assert_freed(); + trx_sys.rw_trx_hash.put_pins(this); + + mysql_thd= nullptr; + + // FIXME: We need to avoid this heap free/alloc for each commit. + if (autoinc_locks) + { + ut_ad(ib_vector_is_empty(autoinc_locks)); + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(autoinc_locks); + autoinc_locks= NULL; + } + + mod_tables.clear(); + + MEM_NOACCESS(&n_ref, sizeof n_ref); + /* do not poison mutex */ + MEM_NOACCESS(&id, sizeof id); + MEM_NOACCESS(&state, sizeof state); + MEM_NOACCESS(&is_recovered, sizeof is_recovered); +#ifdef WITH_WSREP + MEM_NOACCESS(&wsrep, sizeof wsrep); +#endif + read_view.mem_noaccess(); + MEM_NOACCESS(&lock, sizeof lock); + MEM_NOACCESS(&op_info, sizeof op_info); + MEM_NOACCESS(&isolation_level, sizeof isolation_level); + MEM_NOACCESS(&check_foreigns, sizeof check_foreigns); + MEM_NOACCESS(&is_registered, sizeof is_registered); + MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered); + MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary); + MEM_NOACCESS(&flush_log_later, sizeof flush_log_later); + MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later); + MEM_NOACCESS(&duplicates, sizeof duplicates); + MEM_NOACCESS(&dict_operation, sizeof dict_operation); + MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode); + MEM_NOACCESS(&start_time, sizeof start_time); + MEM_NOACCESS(&start_time_micro, sizeof start_time_micro); + MEM_NOACCESS(&commit_lsn, sizeof commit_lsn); + MEM_NOACCESS(&table_id, sizeof table_id); + MEM_NOACCESS(&mysql_thd, sizeof mysql_thd); + MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name); + MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset); + MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use); + MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked); + MEM_NOACCESS(&error_state, sizeof error_state); + MEM_NOACCESS(&error_info, sizeof error_info); + MEM_NOACCESS(&error_key_num, sizeof error_key_num); + MEM_NOACCESS(&graph, sizeof graph); + MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints); + MEM_NOACCESS(&undo_no, sizeof undo_no); + MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start); + MEM_NOACCESS(&rsegs, sizeof rsegs); + MEM_NOACCESS(&roll_limit, sizeof roll_limit); + MEM_NOACCESS(&in_rollback, sizeof in_rollback); + MEM_NOACCESS(&pages_undone, sizeof pages_undone); + MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows); + MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks); + MEM_NOACCESS(&read_only, sizeof read_only); + MEM_NOACCESS(&auto_commit, sizeof auto_commit); + MEM_NOACCESS(&will_lock, sizeof will_lock); + MEM_NOACCESS(&fts_trx, sizeof fts_trx); + MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id); + MEM_NOACCESS(&flush_tables, sizeof flush_tables); + MEM_NOACCESS(&ddl, sizeof ddl); + MEM_NOACCESS(&internal, sizeof internal); +#ifdef UNIV_DEBUG + MEM_NOACCESS(&start_line, sizeof start_line); + MEM_NOACCESS(&start_file, sizeof start_file); +#endif /* UNIV_DEBUG */ + MEM_NOACCESS(&xid, sizeof xid); + MEM_NOACCESS(&mod_tables, sizeof mod_tables); + MEM_NOACCESS(&detailed_error, sizeof detailed_error); +#ifdef WITH_WSREP + ut_ad(!wsrep_UK_scan); + MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan); +#endif /* WITH_WSREP */ + MEM_NOACCESS(&magic_n, sizeof magic_n); + trx_pools->mem_free(this); +} + +/** Transition to committed state, to release implicit locks. */ +inline void trx_t::commit_state() +{ + ut_ad(state == TRX_STATE_PREPARED + || state == TRX_STATE_PREPARED_RECOVERED + || state == TRX_STATE_ACTIVE); + /* This makes the transaction committed in memory and makes its + changes to data visible to other transactions. NOTE that there is a + small discrepancy from the strict formal visibility rules here: a + user of the database can see modifications made by another + transaction T even before the necessary redo log segment has been + flushed to the disk. If the database happens to crash before the + flush, the user has seen modifications from T which will never be a + committed transaction. However, any transaction T2 which sees the + modifications of the committing transaction T, and which also itself + makes modifications to the database, will get an lsn larger than the + committing transaction T. In the case where the log flush fails, and + T never gets committed, also T2 will never get committed. */ + trx_mutex_enter(this); + state= TRX_STATE_COMMITTED_IN_MEMORY; + trx_mutex_exit(this); + ut_ad(id || !is_referenced()); +} + +/** Release any explicit locks of a committing transaction. */ +inline void trx_t::release_locks() +{ + DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY); + DBUG_ASSERT(!is_referenced()); + + if (UT_LIST_GET_LEN(lock.trx_locks)) + { + lock_release(this); + lock.n_rec_locks = 0; + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(ib_vector_is_empty(autoinc_locks)); + mem_heap_empty(lock.lock_heap); + } + + lock.table_locks.clear(); +} + +/** At shutdown, frees a transaction object. */ +void +trx_free_at_shutdown(trx_t *trx) +{ + ut_ad(trx->is_recovered); + ut_a(trx_state_eq(trx, TRX_STATE_PREPARED) + || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) + || (trx_state_eq(trx, TRX_STATE_ACTIVE) + && (!srv_was_started + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT + || srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + || (!srv_is_being_started + && !srv_undo_sources && srv_fast_shutdown)))); + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx->commit_state(); + trx->release_locks(); + trx_undo_free_at_shutdown(trx); + + ut_a(!trx->read_only); + + DBUG_LOG("trx", "Free prepared: " << trx); + trx->state = TRX_STATE_NOT_STARTED; + ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks)); + trx->id = 0; + trx->free(); +} + + +/** + Disconnect a prepared transaction from MySQL + @param[in,out] trx transaction +*/ +void trx_disconnect_prepared(trx_t *trx) +{ + ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->mysql_thd); + ut_ad(!trx->mysql_log_file_name); + trx->read_view.close(); + trx->is_recovered= true; + trx->mysql_thd= NULL; + /* todo/fixme: suggest to do it at innodb prepare */ + trx->will_lock= false; + trx_sys.rw_trx_hash.put_pins(trx); +} + +/****************************************************************//** +Resurrect the table locks for a resurrected transaction. */ +static +void +trx_resurrect_table_locks( +/*======================*/ + trx_t* trx, /*!< in/out: transaction */ + const trx_undo_t* undo) /*!< in: undo log */ +{ + mtr_t mtr; + table_id_set tables; + + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(undo->rseg == trx->rsegs.m_redo.rseg); + + if (undo->empty()) { + return; + } + + mtr_start(&mtr); + + /* trx_rseg_mem_create() may have acquired an X-latch on this + page, so we cannot acquire an S-latch. */ + buf_block_t* block = trx_undo_page_get( + page_id_t(trx->rsegs.m_redo.rseg->space->id, + undo->top_page_no), &mtr); + buf_block_t* undo_block = block; + trx_undo_rec_t* undo_rec = block->frame + undo->top_offset; + + do { + ulint type; + undo_no_t undo_no; + table_id_t table_id; + ulint cmpl_info; + bool updated_extern; + + if (undo_block != block) { + mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX); + undo_block = block; + } + + trx_undo_rec_get_pars( + undo_rec, &type, &cmpl_info, + &updated_extern, &undo_no, &table_id); + tables.insert(table_id); + + undo_rec = trx_undo_get_prev_rec( + block, page_offset(undo_rec), undo->hdr_page_no, + undo->hdr_offset, false, &mtr); + } while (undo_rec); + + mtr_commit(&mtr); + + for (table_id_set::const_iterator i = tables.begin(); + i != tables.end(); i++) { + if (dict_table_t* table = dict_table_open_on_id( + *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) { + if (!table->is_readable()) { + mutex_enter(&dict_sys.mutex); + dict_table_close(table, TRUE, FALSE); + dict_sys.remove(table); + mutex_exit(&dict_sys.mutex); + continue; + } + + if (trx->state == TRX_STATE_PREPARED) { + trx->mod_tables.insert( + trx_mod_tables_t::value_type(table, + 0)); + } + lock_table_ix_resurrect(table, trx); + + DBUG_LOG("ib_trx", + "resurrect " << ib::hex(trx->id) + << " IX lock on " << table->name); + + dict_table_close(table, FALSE, FALSE); + } + } +} + + +/** + Resurrect the transactions that were doing inserts/updates the time of the + crash, they need to be undone. +*/ + +static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, + time_t start_time, ulonglong start_time_micro, + uint64_t *rows_to_undo) +{ + trx_state_t state; + /* + This is single-threaded startup code, we do not need the + protection of trx->mutex here. + */ + switch (undo->state) + { + case TRX_UNDO_ACTIVE: + state= TRX_STATE_ACTIVE; + break; + case TRX_UNDO_PREPARED: + /* + Prepared transactions are left in the prepared state + waiting for a commit or abort decision from MySQL + */ + ib::info() << "Transaction " << undo->trx_id + << " was in the XA prepared state."; + + state= TRX_STATE_PREPARED; + break; + default: + return; + } + + trx_t *trx= trx_create(); + trx->state= state; + ut_d(trx->start_file= __FILE__); + ut_d(trx->start_line= __LINE__); + + trx->rsegs.m_redo.undo= undo; + trx->undo_no= undo->top_undo_no + 1; + trx->rsegs.m_redo.rseg= rseg; + /* + For transactions with active data will not have rseg size = 1 + or will not qualify for purge limit criteria. So it is safe to increment + this trx_ref_count w/o mutex protection. + */ + ++trx->rsegs.m_redo.rseg->trx_ref_count; + *trx->xid= undo->xid; + trx->id= undo->trx_id; + trx->is_recovered= true; + trx->start_time= start_time; + trx->start_time_micro= start_time_micro; + + if (undo->dict_operation) + { + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + if (!trx->table_id) + trx->table_id= undo->table_id; + } + + trx_sys.rw_trx_hash.insert(trx); + trx_sys.rw_trx_hash.put_pins(trx); + trx_resurrect_table_locks(trx, undo); + if (trx_state_eq(trx, TRX_STATE_ACTIVE)) + *rows_to_undo+= trx->undo_no; +} + + +/** Initialize (resurrect) transactions at startup. */ +dberr_t trx_lists_init_at_db_start() +{ + ut_a(srv_is_being_started); + ut_ad(!srv_was_started); + + if (srv_operation == SRV_OPERATION_RESTORE) { + /* mariabackup --prepare only deals with + the redo log and the data files, not with + transactions or the data dictionary. */ + return trx_rseg_array_init(); + } + + if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + return DB_SUCCESS; + } + + purge_sys.create(); + if (dberr_t err = trx_rseg_array_init()) { + ib::info() << "Retry with innodb_force_recovery=5"; + return err; + } + + /* Look from the rollback segments if there exist undo logs for + transactions. */ + const time_t start_time = time(NULL); + const ulonglong start_time_micro= microsecond_interval_timer(); + uint64_t rows_to_undo = 0; + + for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) { + trx_undo_t* undo; + trx_rseg_t* rseg = trx_sys.rseg_array[i]; + + /* Some rollback segment may be unavailable, + especially if the server was previously run with a + non-default value of innodb_undo_logs. */ + if (rseg == NULL) { + continue; + } + /* Ressurrect other transactions. */ + for (undo = UT_LIST_GET_FIRST(rseg->undo_list); + undo != NULL; + undo = UT_LIST_GET_NEXT(undo_list, undo)) { + trx_t *trx = trx_sys.find(0, undo->trx_id, false); + if (!trx) { + trx_resurrect(undo, rseg, start_time, + start_time_micro, &rows_to_undo); + } else { + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) || + trx_state_eq(trx, TRX_STATE_PREPARED)); + ut_ad(trx->start_time == start_time); + ut_ad(trx->is_recovered); + ut_ad(trx->rsegs.m_redo.rseg == rseg); + ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count); + + trx->rsegs.m_redo.undo = undo; + if (undo->top_undo_no >= trx->undo_no) { + if (trx_state_eq(trx, + TRX_STATE_ACTIVE)) { + rows_to_undo -= trx->undo_no; + rows_to_undo += + undo->top_undo_no + 1; + } + + trx->undo_no = undo->top_undo_no + 1; + } + trx_resurrect_table_locks(trx, undo); + } + } + } + + if (const auto size = trx_sys.rw_trx_hash.size()) { + ib::info() << size + << " transaction(s) which must be rolled back or" + " cleaned up in total " << rows_to_undo + << " row operations to undo"; + ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id(); + } + + purge_sys.clone_oldest_view(); + return DB_SUCCESS; +} + +/** Assign a persistent rollback segment in a round-robin fashion, +evenly distributed between 0 and innodb_undo_logs-1 +@return persistent rollback segment +@retval NULL if innodb_read_only */ +static trx_rseg_t* trx_assign_rseg_low() +{ + if (high_level_read_only) { + ut_ad(!srv_available_undo_logs); + return(NULL); + } + + ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS); + + /* The first slot is always assigned to the system tablespace. */ + ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space); + + /* Choose a rollback segment evenly distributed between 0 and + innodb_undo_logs-1 in a round-robin fashion, skipping those + undo tablespaces that are scheduled for truncation. */ + static Atomic_counter<unsigned> rseg_slot; + unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS; + ut_d(if (trx_rseg_n_slots_debug) slot = 0); + trx_rseg_t* rseg; + +#ifdef UNIV_DEBUG + ulint start_scan_slot = slot; + bool look_for_rollover = false; +#endif /* UNIV_DEBUG */ + + bool allocated = false; + + do { + for (;;) { + rseg = trx_sys.rseg_array[slot]; + +#ifdef UNIV_DEBUG + /* Ensure that we are not revisiting the same + slot that we have already inspected. */ + if (look_for_rollover) { + ut_ad(start_scan_slot != slot); + } + look_for_rollover = true; +#endif /* UNIV_DEBUG */ + + ut_d(if (!trx_rseg_n_slots_debug)) + slot = (slot + 1) % TRX_SYS_N_RSEGS; + + if (rseg == NULL) { + continue; + } + + ut_ad(rseg->is_persistent()); + + if (rseg->space != fil_system.sys_space) { + if (rseg->skip_allocation + || !srv_undo_tablespaces) { + continue; + } + } else if (trx_rseg_t* next + = trx_sys.rseg_array[slot]) { + if (next->space != fil_system.sys_space + && srv_undo_tablespaces > 0) { + /** If dedicated + innodb_undo_tablespaces have + been configured, try to use them + instead of the system tablespace. */ + continue; + } + } + + break; + } + + /* By now we have only selected the rseg but not marked it + allocated. By marking it allocated we are ensuring that it will + never be selected for UNDO truncate purge. */ + mutex_enter(&rseg->mutex); + if (!rseg->skip_allocation) { + rseg->trx_ref_count++; + allocated = true; + } + mutex_exit(&rseg->mutex); + } while (!allocated); + + ut_ad(rseg->trx_ref_count > 0); + ut_ad(rseg->is_persistent()); + return(rseg); +} + +/** Assign a rollback segment for modifying temporary tables. +@return the assigned rollback segment */ +trx_rseg_t *trx_t::assign_temp_rseg() +{ + ut_ad(!rsegs.m_noredo.rseg); + ut_ad(!is_autocommit_non_locking()); + compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS)); + + /* Choose a temporary rollback segment between 0 and 127 + in a round-robin fashion. */ + static Atomic_counter<unsigned> rseg_slot; + trx_rseg_t* rseg = trx_sys.temp_rsegs[ + rseg_slot++ & (TRX_SYS_N_RSEGS - 1)]; + ut_ad(!rseg->is_persistent()); + rsegs.m_noredo.rseg = rseg; + + if (id == 0) { + trx_sys.register_rw(this); + } + + ut_ad(!rseg->is_persistent()); + return(rseg); +} + +/****************************************************************//** +Starts a transaction. */ +static +void +trx_start_low( +/*==========*/ + trx_t* trx, /*!< in: transaction */ + bool read_write) /*!< in: true if read-write transaction */ +{ + ut_ad(!trx->in_rollback); + ut_ad(!trx->is_recovered); + ut_ad(trx->start_line != 0); + ut_ad(trx->start_file != 0); + ut_ad(trx->roll_limit == 0); + ut_ad(trx->error_state == DB_SUCCESS); + ut_ad(trx->rsegs.m_redo.rseg == NULL); + ut_ad(trx->rsegs.m_noredo.rseg == NULL); + ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0); + + /* Check whether it is an AUTOCOMMIT SELECT */ + trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd); + + trx->read_only = srv_read_only_mode + || (!trx->ddl && !trx->internal + && thd_trx_is_read_only(trx->mysql_thd)); + + if (!trx->auto_commit) { + trx->will_lock = true; + } else if (!trx->will_lock) { + trx->read_only = true; + } + +#ifdef WITH_WSREP + trx->xid->null(); +#endif /* WITH_WSREP */ + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + ut_a(trx->lock.table_locks.empty()); + + /* No other thread can access this trx object through rw_trx_hash, + still it can be found through trx_sys.trx_list. Sometimes it's + possible to indirectly protect trx_t::state by freezing + trx_sys.trx_list. + + For now we update it without mutex protection, because original code + did it this way. It has to be reviewed and fixed properly. */ + trx->state = TRX_STATE_ACTIVE; + + /* By default all transactions are in the read-only list unless they + are non-locking auto-commit read only transactions or background + (internal) transactions. Note: Transactions marked explicitly as + read only can write to temporary tables, we put those on the RO + list too. */ + + if (!trx->read_only + && (trx->mysql_thd == 0 || read_write || trx->ddl)) { + + /* Temporary rseg is assigned only if the transaction + updates a temporary table */ + trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); + ut_ad(trx->rsegs.m_redo.rseg != 0 + || srv_read_only_mode + || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); + + trx_sys.register_rw(trx); + } else { + if (!trx->is_autocommit_non_locking()) { + + /* If this is a read-only transaction that is writing + to a temporary table then it needs a transaction id + to write to the temporary table. */ + + if (read_write) { + ut_ad(!srv_read_only_mode); + trx_sys.register_rw(trx); + } + } else { + ut_ad(!read_write); + } + } + + trx->start_time = time(NULL); + trx->start_time_micro = trx->mysql_thd + ? thd_query_start_micro(trx->mysql_thd) + : microsecond_interval_timer(); + + ut_a(trx->error_state == DB_SUCCESS); + + MONITOR_INC(MONITOR_TRX_ACTIVE); +} + +/** Set the serialisation number for a persistent committed transaction. +@param[in,out] trx committed transaction with persistent changes */ +static +void +trx_serialise(trx_t* trx) +{ + trx_rseg_t *rseg = trx->rsegs.m_redo.rseg; + ut_ad(rseg); + ut_ad(mutex_own(&rseg->mutex)); + + if (rseg->last_page_no == FIL_NULL) { + mutex_enter(&purge_sys.pq_mutex); + } + + trx_sys.assign_new_trx_no(trx); + + /* If the rollback segment is not empty then the + new trx_t::no can't be less than any trx_t::no + already in the rollback segment. User threads only + produce events when a rollback segment is empty. */ + if (rseg->last_page_no == FIL_NULL) { + purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no, + *rseg)); + mutex_exit(&purge_sys.pq_mutex); + } +} + +/****************************************************************//** +Assign the transaction its history serialisation number and write the +update UNDO log record to the assigned rollback segment. */ +static +void +trx_write_serialisation_history( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* Change the undo log segment states from TRX_UNDO_ACTIVE to some + other state: these modifications to the file data structure define + the transaction as committed in the file based domain, at the + serialization point of the log sequence number lsn obtained below. */ + + /* We have to hold the rseg mutex because update log headers have + to be put to the history list in the (serialisation) order of the + UNDO trx number. This is required for the purge in-memory data + structures too. */ + + if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { + /* Undo log for temporary tables is discarded at transaction + commit. There is no purge for temporary tables, and also no + MVCC, because they are private to a session. */ + + mtr_t temp_mtr; + temp_mtr.start(); + temp_mtr.set_log_mode(MTR_LOG_NO_REDO); + + mutex_enter(&trx->rsegs.m_noredo.rseg->mutex); + trx_undo_set_state_at_finish(undo, &temp_mtr); + mutex_exit(&trx->rsegs.m_noredo.rseg->mutex); + temp_mtr.commit(); + } + + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + if (!rseg) { + ut_ad(!trx->rsegs.m_redo.undo); + return; + } + + trx_undo_t*& undo = trx->rsegs.m_redo.undo; + + if (!undo) { + return; + } + + ut_ad(!trx->read_only); + ut_ad(!undo || undo->rseg == rseg); + mutex_enter(&rseg->mutex); + + /* Assign the transaction serialisation number and add any + undo log to the purge queue. */ + trx_serialise(trx); + if (undo) { + UT_LIST_REMOVE(rseg->undo_list, undo); + trx_purge_add_undo_to_history(trx, undo, mtr); + } + + mutex_exit(&rseg->mutex); + + MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); +} + +/******************************************************************** +Finalize a transaction containing updates for a FTS table. */ +static +void +trx_finalize_for_fts_table( +/*=======================*/ + fts_trx_table_t* ftt) /* in: FTS trx table */ +{ + fts_t* fts = ftt->table->fts; + fts_doc_ids_t* doc_ids = ftt->added_doc_ids; + + ut_a(fts->add_wq); + + mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg); + + ib_wqueue_add(fts->add_wq, doc_ids, heap); + + /* fts_trx_table_t no longer owns the list. */ + ftt->added_doc_ids = NULL; +} + +/******************************************************************//** +Finalize a transaction containing updates to FTS tables. */ +static +void +trx_finalize_for_fts( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + bool is_commit) /*!< in: true if the transaction was + committed, false if it was rolled back. */ +{ + if (is_commit) { + const ib_rbt_node_t* node; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast<fts_savepoint_t*>( + ib_vector_last(trx->fts_trx->savepoints)); + + tables = savepoint->tables; + + for (node = rbt_first(tables); + node; + node = rbt_next(tables, node)) { + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + if ((*ftt)->added_doc_ids) { + trx_finalize_for_fts_table(*ftt); + } + } + } + + fts_trx_free(trx->fts_trx); + trx->fts_trx = NULL; +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static +void +trx_flush_log_if_needed_low( +/*========================*/ + lsn_t lsn) /*!< in: lsn up to which logs are to be + flushed. */ +{ + bool flush = srv_file_flush_method != SRV_NOSYNC; + + switch (srv_flush_log_at_trx_commit) { + case 2: + /* Write the log but do not flush it to disk */ + flush = false; + /* fall through */ + case 1: + case 3: + /* Write the log and optionally flush it to disk */ + log_write_up_to(lsn, flush); + srv_inc_activity_count(); + return; + case 0: + /* Do nothing */ + return; + } + + ut_error; +} + +/**********************************************************************//** +If required, flushes the log to disk based on the value of +innodb_flush_log_at_trx_commit. */ +static +void +trx_flush_log_if_needed( +/*====================*/ + lsn_t lsn, /*!< in: lsn up to which logs are to be + flushed. */ + trx_t* trx) /*!< in/out: transaction */ +{ + trx->op_info = "flushing log"; + trx_flush_log_if_needed_low(lsn); + trx->op_info = ""; +} + +/**********************************************************************//** +For each table that has been modified by the given transaction: update +its dict_table_t::update_time with the current timestamp. Clear the list +of the modified tables at the end. */ +static +void +trx_update_mod_tables_timestamp( +/*============================*/ + trx_t* trx) /*!< in: transaction */ +{ + /* consider using trx->start_time if calling time() is too + expensive here */ + const time_t now = time(NULL); + + trx_mod_tables_t::const_iterator end = trx->mod_tables.end(); + + for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin(); + it != end; + ++it) { + + /* This could be executed by multiple threads concurrently + on the same table object. This is fine because time_t is + word size or less. And _purely_ _theoretically_, even if + time_t write is not atomic, likely the value of 'now' is + the same in all threads and even if it is not, getting a + "garbage" in table->update_time is justified because + protecting it with a latch here would be too performance + intrusive. */ + dict_table_t* table = it->first; + table->update_time = now; + } + + trx->mod_tables.clear(); +} + +/** Evict a table definition due to the rollback of ALTER TABLE. +@param[in] table_id table identifier */ +void trx_t::evict_table(table_id_t table_id) +{ + ut_ad(in_rollback); + + dict_table_t* table = dict_table_open_on_id( + table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + if (!table) { + return; + } + + if (!table->release()) { + /* This must be a DDL operation that is being rolled + back in an active connection. */ + ut_a(table->get_ref_count() == 1); + ut_ad(!is_recovered); + ut_ad(mysql_thd); + return; + } + + /* This table should only be locked by this transaction, if at all. */ + ut_ad(UT_LIST_GET_LEN(table->locks) <= 1); + const bool locked = UT_LIST_GET_LEN(table->locks); + ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this); + dict_sys.remove(table, true, locked); + if (locked) { + UT_LIST_ADD_FIRST(lock.evicted_tables, table); + } +} + +/** Mark a transaction committed in the main memory data structures. */ +inline void trx_t::commit_in_memory(const mtr_t *mtr) +{ + must_flush_log_later= false; + read_view.close(); + + if (is_autocommit_non_locking()) + { + ut_ad(id == 0); + ut_ad(read_only); + ut_ad(!will_lock); + ut_a(!is_recovered); + ut_ad(!rsegs.m_redo.rseg); + ut_ad(mysql_thd); + ut_ad(state == TRX_STATE_ACTIVE); + + /* Note: We are asserting without holding the lock mutex. But + that is OK because this transaction is not waiting and cannot + be rolled back and no new locks can (or should) be added + because it is flagged as a non-locking read-only transaction. */ + ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0); + + /* This state change is not protected by any mutex, therefore + there is an inherent race here around state transition during + printouts. We ignore this race for the sake of efficiency. + However, the freezing of trx_sys.trx_list will protect the trx_t + instance and it cannot be removed from the trx_list and freed + without first unfreezing trx_list. */ + state= TRX_STATE_NOT_STARTED; + + MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT); + + DBUG_LOG("trx", "Autocommit in memory: " << this); + } + else + { +#ifdef UNIV_DEBUG + if (!UT_LIST_GET_LEN(lock.trx_locks)) + for (auto l : lock.table_locks) + ut_ad(!l); +#endif /* UNIV_DEBUG */ + commit_state(); + + if (id) + { + trx_sys.deregister_rw(this); + + /* Wait for any implicit-to-explicit lock conversions to cease, + so that there will be no race condition in lock_release(). */ + while (UNIV_UNLIKELY(is_referenced())) + ut_delay(srv_spin_wait_delay); + } + else + ut_ad(read_only || !rsegs.m_redo.rseg); + + if (read_only || !rsegs.m_redo.rseg) + { + MONITOR_INC(MONITOR_TRX_RO_COMMIT); + } + else + { + trx_update_mod_tables_timestamp(this); + MONITOR_INC(MONITOR_TRX_RW_COMMIT); + is_recovered= false; + } + + release_locks(); + id= 0; + DEBUG_SYNC_C("after_trx_committed_in_memory"); + + while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables)) + { + UT_LIST_REMOVE(lock.evicted_tables, table); + dict_mem_table_free(table); + } + } + + ut_ad(!rsegs.m_redo.undo); + ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); + + if (trx_rseg_t *rseg= rsegs.m_redo.rseg) + { + mutex_enter(&rseg->mutex); + ut_ad(rseg->trx_ref_count > 0); + --rseg->trx_ref_count; + mutex_exit(&rseg->mutex); + } + + if (mtr) + { + if (trx_undo_t *&undo= rsegs.m_noredo.undo) + { + ut_ad(undo->rseg == rsegs.m_noredo.rseg); + trx_undo_commit_cleanup(undo); + undo= nullptr; + } + + /* NOTE that we could possibly make a group commit more efficient + here: call os_thread_yield here to allow also other trxs to come + to commit! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if the OS + does not crash. We may also flush the log files to disk, making + the transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group commit is that a group of transactions + gather behind a trx doing a physical disk write to log files, and + when that physical write has been completed, one of those + transactions does a write which commits the whole group. Note that + this group commit will only bring benefit if there are > 2 users + in the database. Then at least 2 users can gather behind one doing + the physical log write to disk. + + If we are calling trx_t::commit() under prepare_commit_mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the mutex. This is to make the group commit + algorithm to work. Otherwise, the prepare_commit mutex would + serialize all commits and prevent a group of transactions from + gathering. */ + + commit_lsn= mtr->commit_lsn(); + if (!commit_lsn) + /* Nothing to be done. */; + else if (flush_log_later) + /* Do nothing yet */ + must_flush_log_later= true; + else if (srv_flush_log_at_trx_commit) + trx_flush_log_if_needed(commit_lsn, this); + } + + ut_ad(!rsegs.m_noredo.undo); + + /* Free all savepoints, starting from the first. */ + trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints); + + trx_roll_savepoints_free(this, savep); + + if (fts_trx) + trx_finalize_for_fts(this, undo_no != 0); + +#ifdef WITH_WSREP + /* Serialization history has been written and the transaction is + committed in memory, which makes this commit ordered. Release commit + order critical section. */ + if (wsrep) + { + wsrep= false; + wsrep_commit_ordered(mysql_thd); + } + lock.was_chosen_as_wsrep_victim= false; +#endif /* WITH_WSREP */ + trx_mutex_enter(this); + dict_operation= TRX_DICT_OP_NONE; + + DBUG_LOG("trx", "Commit in memory: " << this); + state= TRX_STATE_NOT_STARTED; + + assert_freed(); + trx_init(this); + trx_mutex_exit(this); + + ut_a(error_state == DB_SUCCESS); + if (!srv_read_only_mode) + srv_wake_purge_thread_if_not_active(); +} + +/** Commit the transaction in a mini-transaction. +@param mtr mini-transaction (if there are any persistent modifications) */ +void trx_t::commit_low(mtr_t *mtr) +{ + ut_ad(!mtr || mtr->is_active()); + ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK); + ut_ad(!mtr == (aborted || !has_logged())); + ut_ad(!mtr || !aborted); + + /* undo_no is non-zero if we're doing the final commit. */ + if (fts_trx && undo_no) + { + ut_a(!is_autocommit_non_locking()); + /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of + dying. This is a possible scenario if there is a crash between + insert to DELETED table committing and transaction committing. The + fix would be able to return error from this function */ + if (dberr_t error= fts_commit(this)) + ut_a(error == DB_DUPLICATE_KEY); + } + +#ifndef DBUG_OFF + const bool debug_sync= mysql_thd && has_logged_persistent(); +#endif + + if (mtr) + { + trx_write_serialisation_history(this, mtr); + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this log + sequence number. The transaction becomes 'durable' when we write + the log to disk, but in the logical sense the commit in the + file-based data structures (undo logs etc.) happens here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come in + exactly the same order as commit lsn's, if the transactions have + different rollback segments. To get exactly the same order we + should hold the kernel mutex up to this point, adding to the + contention of the kernel mutex. However, if a transaction T2 is + able to see modifications made by a transaction T1, T2 will always + get a bigger transaction number and a bigger commit lsn than T1. */ + + mtr->commit(); + } +#ifndef DBUG_OFF + if (debug_sync) + DEBUG_SYNC_C("before_trx_state_committed_in_memory"); +#endif + + commit_in_memory(mtr); +} + + +void trx_t::commit() +{ + mtr_t *mtr= nullptr; + mtr_t local_mtr; + + if (has_logged()) + { + mtr= &local_mtr; + local_mtr.start(); + } + commit_low(mtr); +} + +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* We are reading trx->state without holding trx->mutex + here, because the commit or rollback should be invoked for a + running (or recovered prepared) transaction that is associated + with the current thread. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, true); + /* fall through */ + + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + /* If the trx is in a lock wait state, moves the waiting + query thread to the suspended state */ + + if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + ut_a(trx->lock.wait_thr != NULL); + trx->lock.wait_thr->state = QUE_THR_SUSPENDED; + trx->lock.wait_thr = NULL; + + trx->lock.que_state = TRX_QUE_RUNNING; + } + + ut_ad(trx->lock.n_active_thrs == 1); + return; + + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*********************************************************************//** +Creates a commit command node struct. +@return own: commit node struct */ +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap) /*!< in: mem heap where created */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node))); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr) /*!< in: query thread */ +{ + commit_node_t* node; + + node = static_cast<commit_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + trx_t* trx; + + node->state = COMMIT_NODE_WAIT; + + trx = thr_get_trx(thr); + + ut_a(trx->lock.wait_thr == NULL); + ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT); + + trx_commit_or_rollback_prepare(trx); + + trx->lock.que_state = TRX_QUE_COMMITTING; + trx->commit(); + ut_ad(trx->lock.wait_thr == NULL); + trx->lock.que_state = TRX_QUE_RUNNING; + + thr = NULL; + } else { + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); + + trx_start_low(trx, true); + /* fall through */ + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + trx->op_info = "committing"; + trx->commit(); + MONITOR_DEC(MONITOR_TRX_ACTIVE); + trx->op_info = ""; + return(DB_SUCCESS); + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + ut_error; + return(DB_CORRUPTION); +} + +/**********************************************************************//** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +void +trx_commit_complete_for_mysql( +/*==========================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + if (trx->id != 0 + || !trx->must_flush_log_later + || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) { + + return; + } + + trx_flush_log_if_needed(trx->commit_lsn, trx); + + trx->must_flush_log_later = false; +} + +/**********************************************************************//** +Marks the latest SQL statement ended. */ +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /*!< in: trx handle */ +{ + ut_a(trx); + + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + case TRX_STATE_NOT_STARTED: + trx->undo_no = 0; + /* fall through */ + case TRX_STATE_ACTIVE: + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + + if (trx->fts_trx != NULL) { + fts_savepoint_laststmt_refresh(trx); + } + + return; + } + + ut_error; +} + +/**********************************************************************//** +Prints info about a transaction. */ +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: lock_number_of_rows_locked(&trx->lock) */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size) + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ +{ + ibool newline; + + fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx)); + + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + fputs(", not started", f); + goto state_ok; + case TRX_STATE_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong) difftime(time(NULL), trx->start_time)); + goto state_ok; + case TRX_STATE_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + goto state_ok; + } + fprintf(f, ", state %lu", (ulong) trx->state); + ut_ad(0); +state_ok: + const char* op_info = trx->op_info; + + if (*op_info) { + putc(' ', f); + fputs(op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + newline = TRUE; + + /* trx->lock.que_state of an ACTIVE transaction may change + while we are not holding trx->mutex. We perform a dirty read + for performance reasons. */ + + switch (trx->lock.que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->lock.que_state); + } + + if (n_trx_locks > 0 || heap_size > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) n_trx_locks, + (ulong) heap_size, + (ulong) n_rec_locks); + } + + if (trx->undo_no != 0) { + newline = TRUE; + fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no); + } + + if (newline) { + putc('\n', f); + } + + if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) { + innobase_mysql_print_thd( + f, trx->mysql_thd, static_cast<uint>(max_query_len)); + } +} + +/**********************************************************************//** +Prints info about a transaction. +The caller must hold lock_sys.mutex. +When possible, use trx_print() instead. */ +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ut_ad(lock_mutex_own()); + + trx_print_low(f, trx, max_query_len, + lock_number_of_rows_locked(&trx->lock), + UT_LIST_GET_LEN(trx->lock.trx_locks), + mem_heap_get_size(trx->lock.lock_heap)); +} + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys.mutex. */ +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len) /*!< in: max query length to print, + or 0 to use the default max length */ +{ + ulint n_rec_locks; + ulint n_trx_locks; + ulint heap_size; + + lock_mutex_enter(); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); + heap_size = mem_heap_get_size(trx->lock.lock_heap); + lock_mutex_exit(); + + trx_print_low(f, trx, max_query_len, + n_rec_locks, n_trx_locks, heap_size); +} + +/*******************************************************************//** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. +@return TRUE if weight(a) >= weight(b) */ +bool +trx_weight_ge( +/*==========*/ + const trx_t* a, /*!< in: transaction to be compared */ + const trx_t* b) /*!< in: transaction to be compared */ +{ + ibool a_notrans_edit; + ibool b_notrans_edit; + + /* If mysql_thd is NULL for a transaction we assume that it has + not edited non-transactional tables. */ + + a_notrans_edit = a->mysql_thd != NULL + && thd_has_edited_nontrans_tables(a->mysql_thd); + + b_notrans_edit = b->mysql_thd != NULL + && thd_has_edited_nontrans_tables(b->mysql_thd); + + if (a_notrans_edit != b_notrans_edit) { + + return(a_notrans_edit); + } + + /* Either both had edited non-transactional tables or both had + not, we fall back to comparing the number of altered/locked + rows. */ + + return(TRX_WEIGHT(a) >= TRX_WEIGHT(b)); +} + +/** Prepare a transaction. +@return log sequence number that makes the XA PREPARE durable +@retval 0 if no changes needed to be made durable */ +static lsn_t trx_prepare_low(trx_t *trx) +{ + ut_ad(!trx->is_recovered); + + mtr_t mtr; + + if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) { + ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg); + + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + mutex_enter(&undo->rseg->mutex); + trx_undo_set_state_at_prepare(trx, undo, false, &mtr); + mutex_exit(&undo->rseg->mutex); + + mtr.commit(); + } + + trx_undo_t* undo = trx->rsegs.m_redo.undo; + + if (!undo) { + /* There were no changes to persistent tables. */ + return(0); + } + + trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; + ut_ad(undo->rseg == rseg); + + mtr.start(); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE to + TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the file-based + world, at the serialization point of lsn. */ + + mutex_enter(&rseg->mutex); + trx_undo_set_state_at_prepare(trx, undo, false, &mtr); + mutex_exit(&rseg->mutex); + + /* Make the XA PREPARE durable. */ + mtr.commit(); + ut_ad(mtr.commit_lsn() > 0); + return(mtr.commit_lsn()); +} + +/****************************************************************//** +Prepares a transaction. */ +static +void +trx_prepare( +/*========*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Only fresh user transactions can be prepared. + Recovered transactions cannot. */ + ut_a(!trx->is_recovered); + + lsn_t lsn = trx_prepare_low(trx); + + DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE();); + + ut_a(trx->state == TRX_STATE_ACTIVE); + trx_mutex_enter(trx); + trx->state = TRX_STATE_PREPARED; + trx_mutex_exit(trx); + + if (lsn) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + We must not be holding any mutexes or latches here. */ + + trx_flush_log_if_needed(lsn, trx); + } +} + +/** XA PREPARE a transaction. +@param[in,out] trx transaction to prepare */ +void trx_prepare_for_mysql(trx_t* trx) +{ + trx_start_if_not_started_xa(trx, false); + + trx->op_info = "preparing"; + + trx_prepare(trx); + + trx->op_info = ""; +} + + +struct trx_recover_for_mysql_callback_arg +{ + XID *xid_list; + uint len; + uint count; +}; + + +static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element, + trx_recover_for_mysql_callback_arg *arg) +{ + DBUG_ASSERT(arg->len > 0); + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + /* + The state of a read-write transaction can only change from ACTIVE to + PREPARED while we are holding the element->mutex. But since it is + executed at startup no state change should occur. + */ + if (trx_state_eq(trx, TRX_STATE_PREPARED)) + { + ut_ad(trx->is_recovered); + ut_ad(trx->id); + if (arg->count == 0) + ib::info() << "Starting recovery for XA transactions..."; + XID& xid= arg->xid_list[arg->count]; + if (arg->count++ < arg->len) + { + trx->state= TRX_STATE_PREPARED_RECOVERED; + ib::info() << "Transaction " << trx->id + << " in prepared state after recovery"; + ib::info() << "Transaction contains changes to " << trx->undo_no + << " rows"; + xid= *trx->xid; + } + } + } + mutex_exit(&element->mutex); + /* Do not terminate upon reaching arg->len; count all transactions */ + return false; +} + + +static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element, + void*) +{ + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) + trx->state= TRX_STATE_PREPARED; + } + mutex_exit(&element->mutex); + return false; +} + + +/** + Find prepared transaction objects for recovery. + + @param[out] xid_list prepared transactions + @param[in] len number of slots in xid_list + + @return number of prepared transactions stored in xid_list +*/ + +int trx_recover_for_mysql(XID *xid_list, uint len) +{ + trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 }; + + ut_ad(xid_list); + ut_ad(len); + + /* Fill xid_list with PREPARED transactions. */ + trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg); + if (arg.count) + { + ib::info() << arg.count + << " transactions in prepared state after recovery"; + /* After returning the full list, reset the state, because + init_server_components() wants to recover the collection of + transactions twice, by first calling tc_log->open() and then + ha_recover() directly. */ + if (arg.count <= len) + trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback); + } + return int(std::min(arg.count, len)); +} + + +struct trx_get_trx_by_xid_callback_arg +{ + const XID *xid; + trx_t *trx; +}; + + +static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element, + trx_get_trx_by_xid_callback_arg *arg) +{ + my_bool found= 0; + mutex_enter(&element->mutex); + if (trx_t *trx= element->trx) + { + trx_mutex_enter(trx); + if (trx->is_recovered && + (trx_state_eq(trx, TRX_STATE_PREPARED) || + trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) && + arg->xid->eq(reinterpret_cast<XID*>(trx->xid))) + { +#ifdef WITH_WSREP + /* The commit of a prepared recovered Galera + transaction needs a valid trx->xid for + invoking trx_sys_update_wsrep_checkpoint(). */ + if (!wsrep_is_wsrep_xid(trx->xid)) +#endif /* WITH_WSREP */ + /* Invalidate the XID, so that subsequent calls will not find it. */ + trx->xid->null(); + arg->trx= trx; + found= 1; + } + trx_mutex_exit(trx); + } + mutex_exit(&element->mutex); + return found; +} + +/** Look up an X/Open distributed transaction in XA PREPARE state. +@param[in] xid X/Open XA transaction identifier +@return transaction on match (the trx_t::xid will be invalidated); +note that the trx may have been committed before the caller acquires +trx_t::mutex +@retval NULL if no match */ +trx_t* trx_get_trx_by_xid(const XID* xid) +{ + trx_get_trx_by_xid_callback_arg arg= { xid, 0 }; + + if (xid) + trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg); + return arg.trx; +} + + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_xa_low( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + bool read_write) /*!< in: true if read write transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, read_write); + return; + + case TRX_STATE_ACTIVE: + if (trx->id == 0 && read_write) { + /* If the transaction is tagged as read-only then + it can only write to temp tables and for such + transactions we don't want to move them to the + trx_sys_t::rw_trx_hash. */ + if (!trx->read_only) { + trx_set_rw_mode(trx); + } + } + return; + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts the transaction if it is not yet started. */ +void +trx_start_if_not_started_low( +/*==========================*/ + trx_t* trx, /*!< in: transaction */ + bool read_write) /*!< in: true if read write transaction */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + trx_start_low(trx, read_write); + return; + + case TRX_STATE_ACTIVE: + if (read_write && trx->id == 0 && !trx->read_only) { + trx_set_rw_mode(trx); + } + return; + + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Starts a transaction for internal processing. */ +void +trx_start_internal_low( +/*===================*/ + trx_t* trx) /*!< in/out: transaction */ +{ + /* Ensure it is not flagged as an auto-commit-non-locking + transaction. */ + + trx->will_lock = true; + + trx->internal = true; + + trx_start_low(trx, true); +} + +/** Starts a read-only transaction for internal processing. +@param[in,out] trx transaction to be started */ +void +trx_start_internal_read_only_low( + trx_t* trx) +{ + /* Ensure it is not flagged as an auto-commit-non-locking + transaction. */ + + trx->will_lock = true; + + trx->internal = true; + + trx_start_low(trx, false); +} + +/*************************************************************//** +Starts the transaction for a DDL operation. */ +void +trx_start_for_ddl_low( +/*==================*/ + trx_t* trx, /*!< in/out: transaction */ + trx_dict_op_t op) /*!< in: dictionary operation type */ +{ + switch (trx->state) { + case TRX_STATE_NOT_STARTED: + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + + trx_set_dict_operation(trx, op); + trx->ddl= true; + trx_start_internal_low(trx); + return; + + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + break; + } + + ut_error; +} + +/*************************************************************//** +Set the transaction as a read-write transaction if it is not already +tagged as such. Read-only transactions that are writing to temporary +tables are assigned an ID and a rollback segment but are not added +to the trx read-write list because their updates should not be visible +to other transactions and therefore their changes can be ignored by +by MVCC. */ +void +trx_set_rw_mode( +/*============*/ + trx_t* trx) /*!< in/out: transaction that is RW */ +{ + ut_ad(trx->rsegs.m_redo.rseg == 0); + ut_ad(!trx->is_autocommit_non_locking()); + ut_ad(!trx->read_only); + ut_ad(trx->id == 0); + + if (high_level_read_only) { + return; + } + + trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); + ut_ad(trx->rsegs.m_redo.rseg != 0); + + trx_sys.register_rw(trx); + + /* So that we can see our own changes. */ + if (trx->read_view.is_open()) { + trx->read_view.set_creator_trx_id(trx->id); + } +} + +bool trx_t::has_stats_table_lock() const +{ + for (lock_list::const_iterator it= lock.table_locks.begin(), + end= lock.table_locks.end(); it != end; ++it) + { + const lock_t *lock= *it; + if (lock && lock->un_member.tab_lock.table->is_stats_table()) + return true; + } + + return false; +} |