diff options
Diffstat (limited to 'storage/innobase/row/row0uins.cc')
-rw-r--r-- | storage/innobase/row/row0uins.cc | 608 |
1 files changed, 608 insertions, 0 deletions
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc new file mode 100644 index 00000000..4cf4a873 --- /dev/null +++ b/storage/innobase/row/row0uins.cc @@ -0,0 +1,608 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0uins.cc +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "row0log.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "fil0fil.h" + +/************************************************************************* +IMPORTANT NOTE: Any operation that generates redo MUST check that there +is enough space in the redo log before for that operation. This is +done by calling log_free_check(). The reason for checking the +availability of the redo log space before the start of the operation is +that we MUST not hold any synchonization objects when performing the +check. +If you make a change in this module make sure that no codepath is +introduced where a call to log_free_check() is bypassed. */ + +/***************************************************************//** +Removes a clustered index record. The pcur in node was positioned on the +record, now it is detached. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_clust_rec( +/*==========================*/ + undo_node_t* node) /*!< in: undo node */ +{ + ibool success; + dberr_t err; + ulint n_tries = 0; + mtr_t mtr; + dict_index_t* index = node->pcur.btr_cur.index; + bool online; + + ut_ad(index->is_primary()); + ut_ad(node->trx->in_rollback); + + mtr.start(); + if (index->table->is_temporary()) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + mtr.set_log_mode(MTR_LOG_NO_REDO); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(index->table->id >= DICT_HDR_FIRST_ID); + online = false; + } else { + index->set_modified(mtr); + ut_ad(lock_table_has_locks(index->table)); + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + ut_ad(node->trx->dict_operation_lock_mode + != RW_X_LATCH); + ut_ad(node->table->id != DICT_INDEXES_ID); + ut_ad(node->table->id != DICT_COLUMNS_ID); + mtr_s_lock_index(index, &mtr); + } + } + + /* This is similar to row_undo_mod_clust(). The DDL thread may + already have copied this row from the log to the new table. + We must log the removal, so that the row will be correctly + purged. However, we can log the removal out of sync with the + B-tree modification. */ + + success = btr_pcur_restore_position( + online + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : (node->rec_type == TRX_UNDO_INSERT_METADATA) + ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &node->pcur, &mtr); + ut_a(success); + + rec_t* rec = btr_pcur_get_rec(&node->pcur); + + ut_ad(rec_get_trx_id(rec, index) == node->trx->id + || node->table->is_temporary()); + ut_ad(!rec_get_deleted_flag(rec, index->table->not_redundant()) + || rec_is_alter_metadata(rec, index->table->not_redundant())); + ut_ad(rec_is_metadata(rec, index->table->not_redundant()) + == (node->rec_type == TRX_UNDO_INSERT_METADATA)); + + if (online && dict_index_is_online_ddl(index)) { + mem_heap_t* heap = NULL; + const rec_offs* offsets = rec_get_offsets( + rec, index, NULL, index->n_core_fields, + ULINT_UNDEFINED, &heap); + row_log_table_delete(rec, index, offsets, NULL); + mem_heap_free(heap); + } else { + switch (node->table->id) { + case DICT_INDEXES_ID: + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode + == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + + dict_drop_index_tree(&node->pcur, node->trx, &mtr); + mtr.commit(); + + mtr.start(); + success = btr_pcur_restore_position( + BTR_MODIFY_LEAF, &node->pcur, &mtr); + ut_a(success); + break; + case DICT_COLUMNS_ID: + /* This is rolling back an INSERT into SYS_COLUMNS. + If it was part of an instant ALTER TABLE operation, we + must evict the table definition, so that it can be + reloaded after the dictionary operation has been + completed. At this point, any corresponding operation + to the metadata record will have been rolled back. */ + ut_ad(!online); + ut_ad(node->trx->dict_operation_lock_mode + == RW_X_LATCH); + ut_ad(node->rec_type == TRX_UNDO_INSERT_REC); + if (rec_get_n_fields_old(rec) + != DICT_NUM_FIELDS__SYS_COLUMNS) { + break; + } + ulint len; + const byte* data = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { + break; + } + node->trx->evict_table(mach_read_from_8(data)); + } + } + + if (btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr)) { + err = DB_SUCCESS; + goto func_exit; + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); +retry: + /* If did not succeed, try pessimistic descent to tree */ + mtr.start(); + if (index->table->is_temporary()) { + mtr.set_log_mode(MTR_LOG_NO_REDO); + } else { + index->set_modified(mtr); + } + + success = btr_pcur_restore_position( + BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, + &node->pcur, &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true, + &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + +func_exit: + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_INSERT_METADATA) { + /* When rolling back the very first instant ADD COLUMN + operation, reset the root page to the basic state. */ + btr_reset_instant(*index, true, &mtr); + } + + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + return(err); +} + +/***************************************************************//** +Removes a secondary index entry if found. +@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_low( +/*========================*/ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to remove */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_pcur_t pcur; + dberr_t err = DB_SUCCESS; + mtr_t mtr; + const bool modify_leaf = mode == BTR_MODIFY_LEAF; + + row_mtr_start(&mtr, index, !modify_leaf); + + if (modify_leaf) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock_index(index, &mtr); + } else { + ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); + mtr_sx_lock_index(index, &mtr); + } + + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } + + if (dict_index_is_spatial(index)) { + if (modify_leaf) { + mode |= BTR_RTREE_DELETE_MARK; + } + btr_pcur_get_btr_cur(&pcur)->thr = thr; + mode |= BTR_RTREE_UNDO_INS; + } + + switch (row_search_index_entry(index, entry, mode, &pcur, &mtr)) { + case ROW_BUFFERED: + case ROW_NOT_DELETED_REF: + /* These are invalid outcomes, because the mode passed + to row_search_index_entry() did not include any of the + flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ + ut_error; + case ROW_NOT_FOUND: + break; + case ROW_FOUND: + if (dict_index_is_spatial(index) + && rec_get_deleted_flag( + btr_pcur_get_rec(&pcur), + dict_table_is_comp(index->table))) { + ib::error() << "Record found in index " << index->name + << " is deleted marked on insert rollback."; + ut_ad(0); + } + + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (modify_leaf) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr) + ? DB_SUCCESS : DB_FAIL; + } else { + /* Passing rollback=false here, because we are + deleting a secondary index record: the distinction + only matters when deleting a record that contains + externally stored columns. */ + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, + false, &mtr); + } + } + + btr_pcur_close(&pcur); +func_exit_no_pcur: + mtr_commit(&mtr); + + return(err); +} + +/***************************************************************//** +Removes a secondary index entry from the index if found. Tries first +optimistic, then pessimistic descent down the tree. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec( +/*====================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + ulint n_tries = 0; + + /* Try first optimistic descent to the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry, thr); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ +retry: + err = row_undo_ins_remove_sec_low( + BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, + index, entry, thr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/** Parse an insert undo record. +@param[in,out] node row rollback state +@param[in] dict_locked whether the data dictionary cache is locked */ +static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked) +{ + dict_index_t* clust_index; + byte* ptr; + undo_no_t undo_no; + table_id_t table_id; + ulint dummy; + bool dummy_extern; + + ut_ad(node->state == UNDO_INSERT_PERSISTENT + || node->state == UNDO_INSERT_TEMPORARY); + ut_ad(node->trx->in_rollback); + ut_ad(trx_undo_roll_ptr_is_insert(node->roll_ptr)); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &node->rec_type, &dummy, + &dummy_extern, &undo_no, &table_id); + + node->update = NULL; + if (node->state == UNDO_INSERT_PERSISTENT) { + node->table = dict_table_open_on_id(table_id, dict_locked, + DICT_TABLE_OP_NORMAL); + } else if (!dict_locked) { + mutex_enter(&dict_sys.mutex); + node->table = dict_sys.get_temporary_table(table_id); + mutex_exit(&dict_sys.mutex); + } else { + node->table = dict_sys.get_temporary_table(table_id); + } + + if (!node->table) { + return false; + } + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + goto close_table; + case TRX_UNDO_INSERT_METADATA: + case TRX_UNDO_INSERT_REC: + break; + case TRX_UNDO_RENAME_TABLE: + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(dict_table_is_file_per_table(table) + == !is_system_tablespace(table->space_id)); + size_t len = mach_read_from_2(node->undo_rec) + + size_t(node->undo_rec - ptr) - 2; + ptr[len] = 0; + const char* name = reinterpret_cast<char*>(ptr); + if (strcmp(table->name.m_name, name)) { + dict_table_rename_in_cache(table, name, false, + table_id != 0); + } + goto close_table; + } + + if (UNIV_UNLIKELY(!node->table->is_accessible())) { +close_table: + /* Normally, tables should not disappear or become + unaccessible during ROLLBACK, because they should be + protected by InnoDB table locks. Corruption could be + a valid exception. + + FIXME: When running out of temporary tablespace, it + would probably be better to just drop all temporary + tables (and temporary undo log records) of the current + connection, instead of doing this rollback. */ + dict_table_close(node->table, dict_locked, FALSE); + node->table = NULL; + return false; + } else { + ut_ad(!node->table->skip_alter_undo); + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + if (node->rec_type == TRX_UNDO_INSERT_REC) { + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, + node->heap); + } else { + node->ref = &trx_undo_metadata; + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + return true; + } + + if (!row_undo_search_clust_to_pcur(node)) { + /* An error probably occurred during + an insert into the clustered index, + after we wrote the undo log record. */ + goto close_table; + } + if (node->table->n_v_cols) { + trx_undo_read_v_cols(node->table, ptr, + node->row, false); + } + + } else { + ib::warn() << "Table " << node->table->name + << " has no indexes," + " ignoring the table"; + goto close_table; + } + } + + return true; +} + +/***************************************************************//** +Removes secondary index records. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +row_undo_ins_remove_sec_rec( +/*========================*/ + undo_node_t* node, /*!< in/out: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err = DB_SUCCESS; + dict_index_t* index = node->index; + mem_heap_t* heap; + + heap = mem_heap_create(1024); + + while (index != NULL) { + dtuple_t* entry; + + if (index->type & DICT_FTS) { + dict_table_next_uncorrupted_index(index); + continue; + } + + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record, or a statement is being rolled + back because an error occurred while storing + off-page columns. + + Because secondary index entries are inserted + after the clustered index record, we may + assume that the secondary index record does + not exist. */ + } else { + err = row_undo_ins_remove_sec(index, entry, thr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + mem_heap_empty(heap); + dict_table_next_uncorrupted_index(index); + } + +func_exit: + node->index = index; + mem_heap_free(heap); + return(err); +} + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ +{ + dberr_t err; + bool dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; + + if (!row_undo_ins_parse_undo_rec(node, dict_locked)) { + return DB_SUCCESS; + } + + /* Iterate over all the indexes and undo the insert.*/ + + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + switch (node->rec_type) { + default: + ut_ad("wrong undo record type" == 0); + /* fall through */ + case TRX_UNDO_INSERT_REC: + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); + + dict_table_skip_corrupt_index(node->index); + + err = row_undo_ins_remove_sec_rec(node, thr); + + if (err != DB_SUCCESS) { + break; + } + + log_free_check(); + + if (node->table->id == DICT_INDEXES_ID) { + ut_ad(!node->table->is_temporary()); + if (!dict_locked) { + mutex_enter(&dict_sys.mutex); + } + err = row_undo_ins_remove_clust_rec(node); + if (!dict_locked) { + mutex_exit(&dict_sys.mutex); + } + } else { + err = row_undo_ins_remove_clust_rec(node); + } + + if (err == DB_SUCCESS && node->table->stat_initialized) { + /* Not protected by dict_sys.mutex for + performance reasons, we would rather get garbage + in stat_n_rows (which is just an estimate anyway) + than protecting the following code with a latch. */ + dict_table_n_rows_dec(node->table); + + /* Do not attempt to update statistics when + executing ROLLBACK in the InnoDB SQL + interpreter, because in that case we would + already be holding dict_sys.mutex, which + would be acquired when updating statistics. */ + if (!dict_locked) { + dict_stats_update_if_needed(node->table, + *node->trx); + } + } + break; + + case TRX_UNDO_INSERT_METADATA: + log_free_check(); + ut_ad(!node->table->is_temporary()); + err = row_undo_ins_remove_clust_rec(node); + } + + dict_table_close(node->table, dict_locked, FALSE); + + node->table = NULL; + + return(err); +} |