diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/dict | |
parent | Initial commit. (diff) | |
download | mariadb-upstream.tar.xz mariadb-upstream.zip |
Adding upstream version 1:10.11.6. upstream/1%10.11.6 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | storage/innobase/dict/dict0boot.cc | 440 | ||||
-rw-r--r-- | storage/innobase/dict/dict0crea.cc | 1906 | ||||
-rw-r--r-- | storage/innobase/dict/dict0defrag_bg.cc | 434 | ||||
-rw-r--r-- | storage/innobase/dict/dict0dict.cc | 4859 | ||||
-rw-r--r-- | storage/innobase/dict/dict0load.cc | 3213 | ||||
-rw-r--r-- | storage/innobase/dict/dict0mem.cc | 1379 | ||||
-rw-r--r-- | storage/innobase/dict/dict0stats.cc | 4724 | ||||
-rw-r--r-- | storage/innobase/dict/dict0stats_bg.cc | 424 | ||||
-rw-r--r-- | storage/innobase/dict/drop.cc | 297 |
9 files changed, 17676 insertions, 0 deletions
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc new file mode 100644 index 00000000..5516bce9 --- /dev/null +++ b/storage/innobase/dict/dict0boot.cc @@ -0,0 +1,440 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0boot.cc +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0boot.h" +#include "dict0crea.h" +#include "btr0btr.h" +#include "dict0load.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "os0file.h" + +/** The DICT_HDR page identifier */ +static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO}; + +/** @return the DICT_HDR block, x-latched */ +static buf_block_t *dict_hdr_get(mtr_t *mtr) +{ + /* We assume that the DICT_HDR page is always readable and available. 
*/ + return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr); +} + +/**********************************************************************//** +Returns a new table, index, or space id. */ +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + uint32_t* space_id) /*!< out: space id + (not assigned if NULL) */ +{ + ib_id_t id; + mtr_t mtr; + + mtr.start(); + buf_block_t* dict_hdr = dict_hdr_get(&mtr); + + if (table_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame, id); + *table_id = id; + } + + if (index_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame, id); + *index_id = id; + } + + if (space_id) { + *space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame); + if (fil_assign_new_space_id(space_id)) { + mtr.write<4>(*dict_hdr, + DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame, *space_id); + } + } + + mtr.commit(); +} + +/** Update dict_sys.row_id in the dictionary header file page. */ +void dict_hdr_flush_row_id(row_id_t id) +{ + mtr_t mtr; + mtr.start(); + buf_block_t* d= dict_hdr_get(&mtr); + byte *row_id= DICT_HDR + DICT_HDR_ROW_ID + d->page.frame; + if (mach_read_from_8(row_id) < id) + mtr.write<8>(*d, row_id, id); + mtr.commit(); +} + +/** Create the DICT_HDR page on database initialization. 
+@return error code */ +dberr_t dict_create() +{ + ulint root_page_no; + + dberr_t err; + mtr_t mtr; + mtr.start(); + compile_time_assert(DICT_HDR_SPACE == 0); + + /* Create the dictionary header file block in a new, allocated file + segment in the system tablespace */ + buf_block_t* d = fseg_create(fil_system.sys_space, + DICT_HDR + DICT_HDR_FSEG_HEADER, &mtr, + &err); + if (!d) { + goto func_exit; + } + ut_a(d->page.id() == hdr_page_id); + + /* Start counting row, table, index, and tree ids from + DICT_HDR_FIRST_ID */ + mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->page.frame, + DICT_HDR_FIRST_ID); + + ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + d->page.frame)); + + /* Obsolete, but we must initialize it anyway. */ + mtr.write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->page.frame, + DICT_HDR_FIRST_ID); + + /* Create the B-tree roots for the clustered indexes of the basic + system tables */ + + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_TABLES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_UNIQUE, + fil_system.sys_space, DICT_TABLE_IDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_COLUMNS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->page.frame, + root_page_no); + 
/*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_INDEXES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_FIELDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->page.frame, + root_page_no); +func_exit: + mtr.commit(); + return err ? err : dict_boot(); +} + +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. */ +dberr_t dict_boot() +{ + dict_table_t* table; + dict_index_t* index; + mem_heap_t* heap; + mtr_t mtr; + + static_assert(DICT_NUM_COLS__SYS_TABLES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLES == 10, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_COLUMNS == 7, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_COLUMNS == 9, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_INDEXES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_INDEXES == 10, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FIELDS == 3, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FIELDS == 5, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN == 6, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2, + "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6, "compatibility"); + + 
mtr.start(); + /* Create the hash tables etc. */ + dict_sys.create(); + + dberr_t err; + const buf_block_t *d = buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, + nullptr, BUF_GET, &mtr, &err); + if (!d) { + mtr.commit(); + return err; + } + + heap = mem_heap_create(450); + + dict_sys.lock(SRW_LOCK_CALL); + + const byte* dict_hdr = &d->page.frame[DICT_HDR]; + + /* Because we only write new row ids to disk-based data structure + (dictionary header) when it is divisible by + DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover + the latest value of the row id counter. Therefore we advance + the counter at the database startup to avoid overlapping values. + Note that when a user after database startup first time asks for + a new row id, then because the counter is now divisible by + ..._MARGIN, it will immediately be updated to the disk-based + header. */ + + dict_sys.recover_row_id(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID)); + if (uint32_t max_space_id + = mach_read_from_4(dict_hdr + DICT_HDR_MAX_SPACE_ID)) { + max_space_id--; + fil_assign_new_space_id(&max_space_id); + } + + /* Insert into the dictionary cache the descriptions of the basic + system tables */ + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_TABLES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_TABLES, 0, 0, 0); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, + MAX_FULL_NAME_LEN); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ + dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); + /* The low order bit of TYPE is always set to 1. If ROW_FORMAT + is not REDUNDANT or COMPACT, this field matches table->flags. */ + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); + /* MIX_LEN may contain additional table flags when + ROW_FORMAT!=REDUNDANT. 
*/ + dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + + table->id = DICT_TABLES_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_tables = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 1); + + dict_mem_index_add_field(index, "NAME", 0); + + index->id = DICT_TABLES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_TABLE_IDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); + ut_a(err == DB_SUCCESS); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_COLUMNS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_COLUMNS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); + + table->id = DICT_COLUMNS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_columns = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + 
+ dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_COLUMNS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_INDEXES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + /* SYS_INDEXES.SPACE is only read in dict_drop_index_tree() */ + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4); + + table->id = DICT_INDEXES_ID; + + dict_table_add_system_columns(table, heap); + /* The column SYS_INDEXES.MERGE_THRESHOLD was "instantly" + added in MySQL 5.7 and MariaDB 10.2.2. Assign it DEFAULT NULL. + Because of file format compatibility, we must treat SYS_INDEXES + as a special case, relaxing some debug assertions + for DICT_INDEXES_ID. 
*/ + dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD) + ->def_val.len = UNIV_SQL_NULL; + table->add_to_cache(); + dict_sys.sys_indexes = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_INDEXES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_FIELDS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_FIELDS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); + + table->id = DICT_FIELDS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_fields = table; + mem_heap_free(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "INDEX_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_FIELDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + mtr.commit(); + + err = ibuf_init_at_db_start(); + + if (err == DB_SUCCESS || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO) { + err = DB_SUCCESS; + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys.sys_tables); + 
dict_load_sys_table(dict_sys.sys_columns); + dict_load_sys_table(dict_sys.sys_indexes); + dict_load_sys_table(dict_sys.sys_fields); + dict_sys.unlock(); + dict_sys.load_sys_tables(); + } else { + dict_sys.unlock(); + } + + return err; +} diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc new file mode 100644 index 00000000..cce5f2f2 --- /dev/null +++ b/storage/innobase/dict/dict0crea.cc @@ -0,0 +1,1906 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0crea.cc +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0crea.h" +#include "btr0pcur.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif /* BTR_CUR_HASH_ADAPT */ +#include "page0page.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "lock0lock.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "pars0pars.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "ut0vec.h" +#include "fts0priv.h" +#include "srv0start.h" +#include "log.h" + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_TABLES system table. 
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_tables_tuple( +/*=========================*/ + const dict_table_t* table, /*!< in: table */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + ulint type; + + ut_ad(table); + ut_ad(!table->space || table->space->id == table->space_id); + ut_ad(heap); + ut_ad(table->n_cols >= DATA_N_SYS_COLS); + + entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_tables); + + /* 0: NAME -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__NAME); + + dfield_set_data(dfield, + table->name.m_name, strlen(table->name.m_name)); + + /* 1: DB_TRX_ID added later */ + /* 2: DB_ROLL_PTR added later */ + /* 3: ID -------------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 4: N_COLS ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__N_COLS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* If there is any virtual column, encode it in N_COLS */ + mach_write_to_4(ptr, dict_table_encode_n_col( + ulint(table->n_cols - DATA_N_SYS_COLS), + ulint(table->n_v_def)) + | (ulint(table->flags & DICT_TF_COMPACT) << 31)); + dfield_set_data(dfield, ptr, 4); + + /* 5: TYPE (table flags) -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* Validate the table flags and convert them to what is saved in + SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to + SYS_TABLES.TYPE as 1. 
*/ + type = dict_tf_to_sys_tables_type(table->flags); + mach_write_to_4(ptr, type); + + dfield_set_data(dfield, ptr, 4); + + /* 6: MIX_ID (obsolete) ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_ID); + + ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8)); + + dfield_set_data(dfield, ptr, 8); + + /* 7: MIX_LEN (additional flags) --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + /* Be sure all non-used bits are zero. */ + ut_a(!(table->flags2 & DICT_TF2_UNUSED_BIT_MASK)); + mach_write_to_4(ptr, table->flags2); + + dfield_set_data(dfield, ptr, 4); + + /* 8: CLUSTER_NAME ---------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__CLUSTER_ID); + dfield_set_null(dfield); /* not supported */ + + /* 9: SPACE ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, table->space_id); + + dfield_set_data(dfield, ptr, 4); + /*----------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_COLUMNS system table. 
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_columns_tuple( +/*==========================*/ + const dict_table_t* table, /*!< in: table */ + ulint i, /*!< in: column number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + const dict_col_t* column; + dfield_t* dfield; + byte* ptr; + const char* col_name; + ulint num_base = 0; + ulint v_col_no = ULINT_UNDEFINED; + + ut_ad(table); + ut_ad(heap); + + /* Any column beyond table->n_def would be virtual columns */ + if (i >= table->n_def) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + table, i - table->n_def); + column = &v_col->m_col; + num_base = v_col->num_base; + v_col_no = column->ind; + } else { + column = dict_table_get_nth_col(table, i); + ut_ad(!column->is_virtual()); + } + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_columns); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (v_col_no != ULINT_UNDEFINED) { + /* encode virtual column's position in MySQL table and InnoDB + table in "POS" */ + mach_write_to_4(ptr, dict_create_v_col_pos( + i - table->n_def, v_col_no)); + } else { + mach_write_to_4(ptr, i); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME); + + if (i >= table->n_def) { + col_name = dict_table_get_v_col_name(table, i - table->n_def); + } else { + col_name = dict_table_get_col_name(table, i); + 
} + + dfield_set_data(dfield, col_name, strlen(col_name)); + + /* 5: MTYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->mtype); + + dfield_set_data(dfield, ptr, 4); + + /* 6: PRTYPE -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->prtype); + + dfield_set_data(dfield, ptr, 4); + + /* 7: LEN ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->len); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PREC ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, num_base); + + dfield_set_data(dfield, ptr, 4); + /*---------------------------------*/ + + return(entry); +} + +/** Based on a table object, this function builds the entry to be inserted +in the SYS_VIRTUAL system table. Each row maps a virtual column to one of +its base columns. 
+@param[in] table table +@param[in] v_col_n virtual column number +@param[in] b_col_n base column sequence num +@param[in] heap memory heap +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_virtual_tuple( + const dict_table_t* table, + ulint v_col_n, + ulint b_col_n, + mem_heap_t* heap) +{ + dtuple_t* entry; + const dict_col_t* base_column; + dfield_t* dfield; + byte* ptr; + + ut_ad(table); + ut_ad(heap); + + ut_ad(v_col_n < table->n_v_def); + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, v_col_n); + base_column = v_col->base_col[b_col_n]; + + entry = dtuple_create(heap, DICT_NUM_COLS__SYS_VIRTUAL + + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_virtual); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + ulint v_col_no = dict_create_v_col_pos(v_col_n, v_col->m_col.ind); + mach_write_to_4(ptr, v_col_no); + + dfield_set_data(dfield, ptr, 4); + + /* 2: BASE_POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__BASE_POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, base_column->ind); + + dfield_set_data(dfield, ptr, 4); + + /* 3: DB_TRX_ID added later */ + /* 4: DB_ROLL_PTR added later */ + + /*---------------------------------*/ + return(entry); +} + +/***************************************************************//** +Builds a table definition to insert. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_table_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + tab_node_t* node) /*!< in: table create node */ +{ + ut_ad(dict_sys.locked()); + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id == UINT32_MAX); + dict_hdr_get_new_id(&table->id, nullptr, nullptr); + + /* Always set this bit for all new created tables */ + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + DICT_TF2_FLAG_UNSET(table, + DICT_TF2_FTS_AUX_HEX_NAME);); + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) { + /* This table will need a new tablespace. */ + + ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0 + || dict_table_has_atomic_blobs(table)); + /* Get a new tablespace ID */ + dict_hdr_get_new_id(NULL, NULL, &table->space_id); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_out_of_space_ids", + table->space_id = UINT32_MAX; + ); + + if (table->space_id == UINT32_MAX) { + return DB_ERROR; + } + } else { + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space = fil_system.sys_space; + table->space_id = TRX_SYS_SPACE; + } + + ins_node_set_new_row(node->tab_def, + dict_create_sys_tables_tuple(table, node->heap)); + return DB_SUCCESS; +} + +/** Builds a SYS_VIRTUAL row definition to insert. +@param[in] node table create node */ +static +void +dict_build_v_col_def_step( + tab_node_t* node) +{ + dtuple_t* row; + + row = dict_create_sys_virtual_tuple(node->table, node->col_no, + node->base_col_no, + node->heap); + ins_node_set_new_row(node->v_col_def, row); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_INDEXES system table. 
+@return the tuple which should be inserted */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dtuple_t* +dict_create_sys_indexes_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(dict_sys.locked()); + ut_ad(index); + ut_ad(index->table->space || !UT_LIST_GET_LEN(index->table->indexes) + || index->table->file_unreadable); + ut_ad(!index->table->space + || index->table->space->id == index->table->space_id); + ut_ad(heap); + + entry = dtuple_create( + heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_indexes); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: ID ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__NAME); + + if (!index->is_committed()) { + ulint len = strlen(index->name) + 1; + char* name = static_cast<char*>( + mem_heap_alloc(heap, len)); + *name = *TEMP_INDEX_PREFIX_STR; + memcpy(name + 1, index->name, len - 1); + dfield_set_data(dfield, name, len); + } else { + dfield_set_data(dfield, index->name, strlen(index->name)); + } + + /* 5: N_FIELDS ----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__N_FIELDS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->n_fields); + + 
dfield_set_data(dfield, ptr, 4); + + /* 6: TYPE --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->type); + + dfield_set_data(dfield, ptr, 4); + + /* 7: SPACE --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->table->space_id); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PAGE_NO --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__PAGE_NO); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, FIL_NULL); + + dfield_set_data(dfield, ptr, 4); + + /* 9: MERGE_THRESHOLD ----------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, DICT_INDEX_MERGE_THRESHOLD_DEFAULT); + + dfield_set_data(dfield, ptr, 4); + + /*--------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_FIELDS system table. 
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_fields_tuple( +/*=========================*/ + const dict_index_t* index, /*!< in: index */ + ulint fld_no, /*!< in: field number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dict_field_t* field; + dfield_t* dfield; + byte* ptr; + bool wide_pos = false; + + ut_ad(index); + ut_ad(heap); + + for (unsigned j = 0; j < index->n_fields; j++) { + const dict_field_t* f = dict_index_get_nth_field(index, j); + if (f->prefix_len || f->descending) { + wide_pos = true; + break; + } + } + + field = dict_index_get_nth_field(index, fld_no); + + entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_fields); + + /* 0: INDEX_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/ + + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (wide_pos) { + /* If there are column prefixes or columns with + descending order in the index, then we write the + field number to the 16 most significant bits, + the DESC flag to bit 15, and the prefix length + in the 15 least significant bits. */ + mach_write_to_4(ptr, (fld_no << 16) + | (!!field->descending) << 15 + | field->prefix_len); + } else { + /* Else we store the number of the field to the 2 LOW bytes. + This is to keep the storage format compatible with + InnoDB versions < 4.0.14. 
*/ + + mach_write_to_4(ptr, fld_no); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: COL_NAME -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME); + + dfield_set_data(dfield, field->name, strlen(field->name)); + /*---------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. +@return the tuple for search */ +static +dtuple_t* +dict_create_search_tuple( +/*=====================*/ + const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES + table */ + mem_heap_t* heap) /*!< in: memory heap from which the memory for + the built tuple is allocated */ +{ + dtuple_t* search_tuple; + const dfield_t* field1; + dfield_t* field2; + + ut_ad(tuple && heap); + + search_tuple = dtuple_create(heap, 2); + + field1 = dtuple_get_nth_field(tuple, 0); + field2 = dtuple_get_nth_field(search_tuple, 0); + + dfield_copy(field2, field1); + + field1 = dtuple_get_nth_field(tuple, 1); + field2 = dtuple_get_nth_field(search_tuple, 1); + + dfield_copy(field2, field1); + + ut_ad(dtuple_validate(search_tuple)); + + return(search_tuple); +} + +/***************************************************************//** +Builds an index definition row to insert. 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_index_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + ind_node_t* node) /*!< in: index create node */ +{ + dict_table_t* table; + dict_index_t* index; + dtuple_t* row; + trx_t* trx; + + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + index = node->index; + + table = dict_table_open_on_name( + node->table_name, true, DICT_ERR_IGNORE_TABLESPACE); + + if (!table) { + return DB_TABLE_NOT_FOUND; + } + + index->table = table; + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + node->page_no = FIL_NULL; + row = dict_create_sys_indexes_tuple(index, node->heap); + node->ind_row = row; + + ins_node_set_new_row(node->ind_def, row); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + table->release(); + + return(DB_SUCCESS); +} + +/***************************************************************//** +Builds an index definition without updating SYSTEM TABLES. +@return DB_SUCCESS or error code */ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + ut_ad(dict_sys.locked()); + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; +} + +/***************************************************************//** +Builds a field definition row to insert. 
*/ +static +void +dict_build_field_def_step( +/*======================*/ + ind_node_t* node) /*!< in: index create node */ +{ + dict_index_t* index; + dtuple_t* row; + + index = node->index; + + row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); + + ins_node_set_new_row(node->field_def, row); +} + +/***************************************************************//** +Creates an index tree for the index. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_index_tree_step( +/*========================*/ + ind_node_t* node) /*!< in: index create node */ +{ + mtr_t mtr; + btr_pcur_t pcur; + dict_index_t* index; + dtuple_t* search_tuple; + + ut_ad(dict_sys.locked()); + + index = node->index; + + if (index->type == DICT_FTS) { + /* FTS index does not need an index tree */ + return(DB_SUCCESS); + } + + /* Run a mini-transaction in which the index tree is allocated for + the index and its root address is written to the index entry in + sys_indexes */ + + mtr.start(); + + search_tuple = dict_create_search_tuple(node->ind_row, node->heap); + node->page_no = FIL_NULL; + pcur.btr_cur.page_cur.index = + UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + dberr_t err = btr_pcur_open(search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (err != DB_SUCCESS) { +func_exit: + mtr.commit(); + return err; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (UNIV_UNLIKELY(btr_pcur_is_after_last_on_page(&pcur))) { +corrupted: + err = DB_CORRUPTION; + goto func_exit; + } + + ulint len; + byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__ID, + &len); + if (UNIV_UNLIKELY(len != 8 || mach_read_from_8(data) != index->id)) { + goto corrupted; + } + + data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto corrupted; + } + + if (index->is_readable()) { + index->set_modified(mtr); + + 
node->page_no = btr_create( + index->type, index->table->space, + index->id, index, &mtr, &err); + + DBUG_EXECUTE_IF("ib_import_create_index_failure_1", + node->page_no = FIL_NULL; + err = DB_OUT_OF_FILE_SPACE; ); + } + + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data, + node->page_no); + goto func_exit; +} + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return error code */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx) /*!< in: InnoDB transaction handle */ +{ + mtr_t mtr; + + ut_ad(dict_sys.locked()); + ut_ad(!(index->type & DICT_FTS)); + + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + /* Currently this function is being used by temp-tables only. + Import/Discard of temp-table is blocked and so this assert. */ + ut_ad(index->is_readable()); + ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED)); + + dberr_t err; + index->page = btr_create(index->type, index->table->space, + index->id, index, &mtr, &err); + mtr_commit(&mtr); + + index->trx_id = trx->id; + + return err; +} + +/** Drop the index tree associated with a row in SYS_INDEXES table. 
+@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction +@return tablespace ID to drop (if this is the clustered index) +@retval 0 if no tablespace is to be dropped */ +uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr) +{ + rec_t *rec= btr_pcur_get_rec(pcur); + + ut_ad(!trx || dict_sys.locked()); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + btr_pcur_store_position(pcur, mtr); + + static_assert(DICT_FLD__SYS_INDEXES__TABLE_ID == 0, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__ID == 1, "compatibility"); + + ulint len= rec_get_n_fields_old(rec); + if (len < DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD || + len > DICT_NUM_FIELDS__SYS_INDEXES) + { +rec_corrupted: + sql_print_error("InnoDB: Corrupted SYS_INDEXES record"); + return 0; + } + + if (rec_get_1byte_offs_flag(rec)) + { + if (rec_1_get_field_end_info(rec, 0) != 8 || + rec_1_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + } + else if (rec_2_get_field_end_info(rec, 0) != 8 || + rec_2_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + + const byte *p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t type= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t root_page_no= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + if (len != 4) + goto rec_corrupted; + + const uint32_t space_id= mach_read_from_4(p); + ut_ad(root_page_no == FIL_NULL || space_id <= SRV_SPACE_ID_UPPER_BOUND); + + if (space_id && (type & DICT_CLUSTERED)) + return space_id; + + if (root_page_no == FIL_NULL) + /* The tree has already been freed */; + else if (fil_space_t*s= fil_space_t::get(space_id)) + { + /* Ensure that the tablespace file exists + in order to avoid a crash in buf_page_get_gen(). 
*/ + if (root_page_no < s->get_size()) + { + static_assert(FIL_NULL == 0xffffffff, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__PAGE_NO == + DICT_FLD__SYS_INDEXES__SPACE + 1, "compatibility"); + mtr->memset(btr_pcur_get_block(pcur), page_offset(p + 4), 4, 0xff); + btr_free_if_exists(s, root_page_no, mach_read_from_8(rec + 8), mtr); + } + s->release(); + } + + return 0; +} + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap) /*!< in: heap where created */ +{ + tab_node_t* node; + + node = static_cast<tab_node_t*>( + mem_heap_alloc(heap, sizeof(tab_node_t))); + + node->common.type = QUE_NODE_CREATE_TABLE; + + node->table = table; + + node->state = TABLE_BUILD_TABLE_DEF; + node->heap = mem_heap_create(256); + + node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables, + heap); + node->tab_def->common.parent = node; + + node->col_def = ins_node_create(INS_DIRECT, dict_sys.sys_columns, + heap); + node->col_def->common.parent = node; + + node->v_col_def = ins_node_create(INS_DIRECT, dict_sys.sys_virtual, + heap); + node->v_col_def->common.parent = node; + + return(node); +} + +/** Creates an index create graph. 
+@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v) +{ + ind_node_t* node; + + node = static_cast<ind_node_t*>( + mem_heap_alloc(heap, sizeof(ind_node_t))); + + node->common.type = QUE_NODE_CREATE_INDEX; + + node->index = index; + + node->table_name = table; + + node->key_id = key_id; + node->mode = mode; + node->add_v = add_v; + + node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; + node->heap = mem_heap_create(256); + + node->ind_def = ins_node_create(INS_DIRECT, + dict_sys.sys_indexes, heap); + node->ind_def->common.parent = node; + + node->field_def = ins_node_create(INS_DIRECT, + dict_sys.sys_fields, heap); + node->field_def->common.parent = node; + + return(node); +} + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. 
+@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + tab_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<tab_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = TABLE_BUILD_TABLE_DEF; + } + + if (node->state == TABLE_BUILD_TABLE_DEF) { + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = dict_build_table_def_step(thr, node); + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = TABLE_BUILD_COL_DEF; + node->col_no = 0; + + thr->run_node = node->tab_def; + + return(thr); + } + + if (node->state == TABLE_BUILD_COL_DEF) { + if (node->col_no + DATA_N_SYS_COLS + < (static_cast<ulint>(node->table->n_def) + + static_cast<ulint>(node->table->n_v_def))) { + + ulint i = node->col_no++; + if (i + DATA_N_SYS_COLS >= node->table->n_def) { + i += DATA_N_SYS_COLS; + } + + ins_node_set_new_row( + node->col_def, + dict_create_sys_columns_tuple(node->table, i, + node->heap)); + + thr->run_node = node->col_def; + + return(thr); + } else { + /* Move on to SYS_VIRTUAL table */ + node->col_no = 0; + node->base_col_no = 0; + node->state = TABLE_BUILD_V_COL_DEF; + } + } + + if (node->state == TABLE_BUILD_V_COL_DEF) { + + if (node->col_no < static_cast<ulint>(node->table->n_v_def)) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + + /* If no base column */ + while (v_col->num_base == 0) { + node->col_no++; + if (node->col_no == static_cast<ulint>( + (node->table)->n_v_def)) { + node->state = TABLE_ADD_TO_CACHE; + break; + } + + v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + node->base_col_no = 0; + } + + if (node->state != TABLE_ADD_TO_CACHE) { + ut_ad(node->col_no == v_col->v_pos); + 
dict_build_v_col_def_step(node); + + if (node->base_col_no + < unsigned{v_col->num_base} - 1) { + /* move on to next base column */ + node->base_col_no++; + } else { + /* move on to next virtual column */ + node->col_no++; + node->base_col_no = 0; + } + + thr->run_node = node->v_col_def; + + return(thr); + } + } else { + node->state = TABLE_ADD_TO_CACHE; + } + } + + if (node->state == TABLE_ADD_TO_CACHE) { + node->table->can_be_evicted = !node->table->fts; + node->table->add_to_cache(); + + err = DB_SUCCESS; + } + +function_exit: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +static dberr_t dict_create_index_space(const ind_node_t &node) +{ + dict_table_t *table= node.index->table; + if (table->space || (table->flags2 & DICT_TF2_DISCARDED)) + return DB_SUCCESS; + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_TMP_SPACE_ID); + /* Determine the tablespace flags. */ + const bool has_data_dir= DICT_TF_HAS_DATA_DIR(table->flags); + ut_ad(!has_data_dir || table->data_dir_path); + char* filepath= fil_make_filepath(has_data_dir + ? table->data_dir_path : nullptr, + table->name, IBD, has_data_dir); + if (!filepath) + return DB_OUT_OF_MEMORY; + + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of + the table we create here. */ + dberr_t err; + table->space= fil_ibd_create(table->space_id, table->name, filepath, + dict_tf_to_fsp_flags(table->flags), + FIL_IBD_FILE_INITIAL_SIZE, + node.mode, node.key_id, &err); + ut_ad((err != DB_SUCCESS) == !table->space); + ut_free(filepath); + + return err; +} + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. 
+@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ind_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<ind_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = INDEX_BUILD_INDEX_DEF; + } + + if (node->state == INDEX_BUILD_INDEX_DEF) { + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + err = dict_build_index_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_BUILD_FIELD_DEF; + node->field_no = 0; + + thr->run_node = node->ind_def; + + return(thr); + } + + if (node->state == INDEX_BUILD_FIELD_DEF) { + err = dict_create_index_space(*node); + if (err != DB_SUCCESS) { + dict_mem_index_free(node->index); + node->index = nullptr; + goto function_exit; + } + + if (node->field_no < (node->index)->n_fields) { + + dict_build_field_def_step(node); + + node->field_no++; + + thr->run_node = node->field_def; + + return(thr); + } else { + node->state = INDEX_ADD_TO_CACHE; + } + } + + if (node->state == INDEX_ADD_TO_CACHE) { + err = dict_index_add_to_cache(node->index, FIL_NULL, + node->add_v); + + ut_ad(!node->index == (err != DB_SUCCESS)); + + if (!node->index) { + goto function_exit; + } + + ut_ad(!node->index->is_instant()); + ut_ad(node->index->n_core_null_bytes + == ((dict_index_is_clust(node->index) + && node->index->table->supports_instant()) + ? 
dict_index_t::NO_CORE_NULL_BYTES + : UT_BITS_IN_BYTES( + unsigned(node->index->n_nullable)))); + node->index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(node->index->n_nullable))); + node->state = INDEX_CREATE_INDEX_TREE; + } + + if (node->state == INDEX_CREATE_INDEX_TREE) { + + err = dict_create_index_tree_step(node); + + DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail", + err = DB_OUT_OF_MEMORY;); + + if (err != DB_SUCCESS) { + dict_table_t* table = node->index->table; + /* If this is a FTS index, we will need to remove + it from fts->cache->indexes list as well */ + if (!(node->index->type & DICT_FTS)) { + } else if (auto fts = table->fts) { + fts_index_cache_t* index_cache; + + mysql_mutex_lock(&fts->cache->init_lock); + + index_cache = (fts_index_cache_t*) + fts_find_index_cache( + fts->cache, + node->index); + + if (index_cache->words) { + rbt_free(index_cache->words); + index_cache->words = 0; + } + + ib_vector_remove( + fts->cache->indexes, + *reinterpret_cast<void**>(index_cache)); + + mysql_mutex_unlock(&fts->cache->init_lock); + } + +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!node->index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + dict_index_remove_from_cache(table, node->index); + node->index = NULL; + + goto function_exit; + } + + node->index->page = node->page_no; + /* These should have been set in + dict_build_index_def_step() and + dict_index_add_to_cache(). 
*/ + ut_ad(node->index->trx_id == trx->id); + ut_ad(node->index->table->def_trx_id == trx->id); + } + +function_exit: + trx->error_state = err; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return nullptr; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +bool dict_sys_t::load_sys_tables() +{ + ut_ad(!srv_any_background_activity()); + bool mismatch= false; + lock(SRW_LOCK_CALL); + if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign->indexes) == 3 && + sys_foreign->n_cols == DICT_NUM_COLS__SYS_FOREIGN + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign); + else + { + sys_foreign= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN"); + } + if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1 && + sys_foreign_cols->n_cols == + DICT_NUM_COLS__SYS_FOREIGN_COLS + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign_cols); + else + { + sys_foreign_cols= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN_COLS"); + } + if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_virtual->indexes) == 1 && + sys_virtual->n_cols == DICT_NUM_COLS__SYS_VIRTUAL + DATA_N_SYS_COLS) + prevent_eviction(sys_virtual); + else + { + sys_virtual= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_VIRTUAL"); + } + unlock(); + return mismatch; +} + +dberr_t dict_sys_t::create_or_check_sys_tables() +{ + if (sys_tables_exist()) + return DB_SUCCESS; + + if (srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) + return DB_READ_ONLY; + + if (load_sys_tables()) + { + sql_print_information("InnoDB: Set innodb_read_only=1 " + "or innodb_force_recovery=3 to start up"); + return DB_CORRUPTION; + } + + if (sys_tables_exist()) + return DB_SUCCESS; 
+ + trx_t *trx= trx_create(); + trx_start_for_ddl(trx); + + { + /* Do not bother with transactional memory; this is only + executed at startup, with no conflicts present. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx->mutex_lock(); + lock_table_create(dict_sys.sys_tables, LOCK_X, trx); + lock_table_create(dict_sys.sys_columns, LOCK_X, trx); + lock_table_create(dict_sys.sys_indexes, LOCK_X, trx); + lock_table_create(dict_sys.sys_fields, LOCK_X, trx); + trx->mutex_unlock(); + } + + row_mysql_lock_data_dictionary(trx); + + /* NOTE: when designing InnoDB's foreign key support in 2001, Heikki Tuuri + made a mistake and defined table names and the foreign key id to be of type + CHAR (internally, really VARCHAR). The type should have been VARBINARY. */ + + /* System tables are always created inside the system tablespace. */ + const auto srv_file_per_table_backup= srv_file_per_table; + srv_file_per_table= 0; + dberr_t error; + span<const char> tablename; + + if (!sys_foreign) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +err_exit: + sql_print_error("InnoDB: Creation of %.*s failed: %s", + int(tablename.size()), tablename.data(), + ut_strerr(error)); + trx->rollback(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + return error; + } + } + if (!sys_foreign_cols) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN_COLS() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON 
SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto err_exit; + } + } + if (!sys_virtual) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_VIRTUAL() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_VIRTUAL(TABLE_ID BIGINT,POS INT,BASE_POS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX BASE_IDX" + " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto err_exit; + } + } + + trx->commit(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + + lock(SRW_LOCK_CALL); + if (sys_foreign); + else if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN]))) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +load_fail: + unlock(); + sql_print_error("InnoDB: Failed to CREATE TABLE %.*s", + int(tablename.size()), tablename.data()); + return DB_TABLE_NOT_FOUND; + } + else + prevent_eviction(sys_foreign); + + if (sys_foreign_cols); + else if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS]))) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto load_fail; + } + else + prevent_eviction(sys_foreign_cols); + + if (sys_virtual); + else if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL]))) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto load_fail; + } + else + prevent_eviction(sys_virtual); + + unlock(); + return DB_SUCCESS; +} + +/****************************************************************//** +Evaluate the given foreign key SQL statement. 
+@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_foreign_eval_sql( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* sql, /*!< in: SQL string to evaluate */ + const char* name, /*!< in: table name (for diagnostics) */ + const char* id, /*!< in: foreign key id */ + trx_t* trx) /*!< in/out: transaction */ +{ + FILE* ef = dict_foreign_err_file; + + dberr_t error = que_eval_sql(info, sql, trx); + + switch (error) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in foreign key constraint creation for table ", + ef); + ut_print_name(ef, trx, name); + fputs(".\nA foreign key constraint of name ", ef); + ut_print_name(ef, trx, id); + fputs("\nalready exists." + " (Note that internally InnoDB adds 'databasename'\n" + "in front of the user-defined constraint name.)\n" + "Note that InnoDB's FOREIGN KEY system tables store\n" + "constraint names as case-insensitive, with the\n" + "MariaDB standard latin1_swedish_ci collation. If you\n" + "create tables or databases whose names differ only in\n" + "the character case, then collisions in constraint\n" + "names can occur. 
Workaround: name your constraints\n" + "explicitly with unique names.\n", + ef); + goto release; + default: + sql_print_error("InnoDB: " + "Foreign key constraint creation failed: %s", + ut_strerr(error)); + + mysql_mutex_lock(&dict_foreign_err_mutex); + ut_print_timestamp(ef); + fputs(" Internal error in foreign key constraint creation" + " for table ", ef); + ut_print_name(ef, trx, name); + fputs(".\n" + "See the MariaDB .err log in the datadir" + " for more information.\n", ef); +release: + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + return error; +} + +/********************************************************************//** +Add a single foreign key field definition to the data dictionary tables in +the database. +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_add_foreign_field_to_dictionary( +/*========================================*/ + ulint field_nr, /*!< in: field number */ + const char* table_name, /*!< in: table name */ + const dict_foreign_t* foreign, /*!< in: foreign */ + trx_t* trx) /*!< in/out: transaction */ +{ + DBUG_ENTER("dict_create_add_foreign_field_to_dictionary"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_int4_literal(info, "pos", field_nr); + + pars_info_add_str_literal(info, "for_col_name", + foreign->foreign_col_names[field_nr]); + + pars_info_add_str_literal(info, "ref_col_name", + foreign->referenced_col_names[field_nr]); + + DBUG_RETURN(dict_foreign_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN_COLS VALUES" + "(:id, :pos, :for_col_name, :ref_col_name);\n" + "END;\n", + table_name, foreign->id, trx)); +} + +/********************************************************************//** +Construct foreign key constraint defintion from data dictionary information. 
+*/ +static +char* +dict_foreign_def_get( +/*=================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx) /*!< in: trx */ +{ + char* fk_def = (char *)mem_heap_alloc(foreign->heap, 4*1024); + const char* tbname; + char tablebuf[MAX_TABLE_NAME_LEN + 1] = ""; + unsigned i; + char* bufend; + + tbname = dict_remove_db_name(foreign->id); + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + tbname, strlen(tbname), trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + sprintf(fk_def, + (char *)"CONSTRAINT %s FOREIGN KEY (", (char *)tablebuf); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[i], + strlen(foreign->foreign_col_names[i]), + trx->mysql_thd); + strcat(fk_def, buf); + if (i < static_cast<unsigned>(foreign->n_fields-1)) { + strcat(fk_def, (char *)","); + } + } + + strcat(fk_def,(char *)") REFERENCES "); + + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + foreign->referenced_table_name, + strlen(foreign->referenced_table_name), + trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + strcat(fk_def, tablebuf); + strcat(fk_def, " ("); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + bufend = innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[i], + strlen(foreign->referenced_col_names[i]), + trx->mysql_thd); + buf[bufend - buf] = '\0'; + strcat(fk_def, buf); + if (i < (uint)foreign->n_fields-1) { + strcat(fk_def, (char *)","); + } + } + strcat(fk_def, (char *)")"); + + return fk_def; +} + +/********************************************************************//** +Convert foreign key column names from data dictionary to SQL-layer. 
+*/ +static +void +dict_foreign_def_get_fields( +/*========================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx, /*!< in: trx */ + char** field, /*!< out: foreign column */ + char** field2, /*!< out: referenced column */ + ulint col_no) /*!< in: column number */ +{ + char* bufend; + char* fieldbuf = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + char* fieldbuf2 = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + + bufend = innobase_convert_name(fieldbuf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[col_no], + strlen(foreign->foreign_col_names[col_no]), + trx->mysql_thd); + + fieldbuf[bufend - fieldbuf] = '\0'; + + bufend = innobase_convert_name(fieldbuf2, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[col_no], + strlen(foreign->referenced_col_names[col_no]), + trx->mysql_thd); + + fieldbuf2[bufend - fieldbuf2] = '\0'; + *field = fieldbuf; + *field2 = fieldbuf2; +} + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables. 
+@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ +{ + dberr_t error; + + DBUG_ENTER("dict_create_add_foreign_to_dictionary"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_str_literal(info, "for_name", name); + + pars_info_add_str_literal(info, "ref_name", + foreign->referenced_table_name); + + pars_info_add_int4_literal(info, "n_cols", + ulint(foreign->n_fields) + | (ulint(foreign->type) << 24)); + + DBUG_PRINT("dict_create_add_foreign_to_dictionary", + ("'%s', '%s', '%s', %d", foreign->id, name, + foreign->referenced_table_name, + foreign->n_fields + (foreign->type << 24))); + + error = dict_foreign_eval_sql(info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN VALUES" + "(:id, :for_name, :ref_name, :n_cols);\n" + "END;\n" + , name, foreign->id, trx); + + if (error != DB_SUCCESS) { + + if (error == DB_DUPLICATE_KEY) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + char tablename[MAX_TABLE_NAME_LEN + 1] = ""; + char* fk_def; + + innobase_convert_name(tablename, MAX_TABLE_NAME_LEN, + name, strlen(name), trx->mysql_thd); + + innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->id, strlen(foreign->id), trx->mysql_thd); + + fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx); + + ib_push_warning(trx, error, + "Create or Alter table %s with foreign key constraint" + " failed. Foreign key constraint %s" + " already exists on data dictionary." + " Foreign key constraint names need to be unique in database." 
+ " Error in foreign key definition: %s.", + tablename, buf, fk_def); + } + + DBUG_RETURN(error); + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + error = dict_create_add_foreign_field_to_dictionary( + i, name, foreign, trx); + + if (error != DB_SUCCESS) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + char tablename[MAX_TABLE_NAME_LEN + 1] = ""; + char* field=NULL; + char* field2=NULL; + char* fk_def; + + innobase_convert_name(tablename, MAX_TABLE_NAME_LEN, + name, strlen(name), trx->mysql_thd); + innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->id, strlen(foreign->id), trx->mysql_thd); + fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx); + dict_foreign_def_get_fields((dict_foreign_t*)foreign, trx, &field, &field2, i); + + ib_push_warning(trx, error, + "Create or Alter table %s with foreign key constraint" + " failed. Error adding foreign key constraint name %s" + " fields %s or %s to the dictionary." + " Error in foreign key definition: %s.", + tablename, buf, i+1, fk_def); + + DBUG_RETURN(error); + } + } + + DBUG_RETURN(error); +} + +/** Check if a foreign constraint is on the given column name. +@param[in] col_name column name to be searched for fk constraint +@param[in] table table to which foreign key constraint belongs +@return true if fk constraint is present on the table, false otherwise. */ +static +bool +dict_foreign_base_for_stored( + const char* col_name, + const dict_table_t* table) +{ + /* Loop through each stored column and check if its base column has + the same name as the column name being checked */ + dict_s_col_list::const_iterator it; + for (it = table->s_cols->begin(); + it != table->s_cols->end(); ++it) { + dict_s_col_t s_col = *it; + + for (ulint j = 0; j < s_col.num_base; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + s_col.base_col[j]->ind)) == 0) { + return(true); + } + } + } + + return(false); +} + +/** Check if a foreign constraint is on columns served as base columns +of any stored column. 
This is to prevent creating SET NULL or CASCADE
constraint on such columns
@param[in]	local_fk_set	set of foreign key objects, to be added to
the dictionary tables
@param[in]	table		table to which the foreign key objects in
local_fk_set belong to
@return true if yes, otherwise, false */
bool
dict_foreigns_has_s_base_col(
	const dict_foreign_set&	local_fk_set,
	const dict_table_t*	table)
{
	/* Without stored columns there is nothing to conflict with. */
	if (table->s_cols == NULL) {
		return(false);
	}

	for (dict_foreign_t* fk : local_fk_set) {
		/* Mask out the NO ACTION bits; whatever remains is an
		action that would modify the child row. */
		ulint	actions = fk->type;

		actions &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION
			     | DICT_FOREIGN_ON_UPDATE_NO_ACTION);

		if (actions != 0) {
			/* Report the first constrained column that is a
			base column of some stored column. */
			for (ulint col = 0; col < fk->n_fields; col++) {
				if (dict_foreign_base_for_stored(
					    fk->foreign_col_names[col],
					    table)) {
					return(true);
				}
			}
		}
	}

	return(false);
}

/** Adds the given set of foreign key objects to the dictionary tables
in the database. This function does not modify the dictionary cache. The
caller must ensure that all foreign key objects contain a valid constraint
name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) +{ + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign) + { + sql_print_error("InnoDB: Table SYS_FOREIGN not found" + " in internal data dictionary"); + return DB_ERROR; + } + + for (auto fk : local_fk_set) + if (dberr_t error= + dict_create_add_foreign_to_dictionary(table->name.m_name, fk, trx)) + return error; + + return DB_SUCCESS; +} diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc new file mode 100644 index 00000000..bec6da8e --- /dev/null +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -0,0 +1,434 @@ +/***************************************************************************** + +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0defrag_bg.cc +Defragmentation routines. 
+ +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "dict0defrag_bg.h" +#include "btr0btr.h" +#include "srv0start.h" +#include "trx0trx.h" +#include "lock0lock.h" +#include "row0mysql.h" + +static mysql_mutex_t defrag_pool_mutex; + +/** Iterator type for iterating over the elements of objects of type +defrag_pool_t. */ +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +defrag_pool_t defrag_pool; + + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void) +/*=======================*/ +{ + ut_ad(!srv_read_only_mode); + mysql_mutex_init(0, &defrag_pool_mutex, nullptr); +} + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void) +/*=========================*/ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_destroy(&defrag_pool_mutex); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. 
+@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mysql_mutex_unlock(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mysql_mutex_unlock(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index) /*!< in: table to add */ +{ + defrag_pool_item_t item; + + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&defrag_pool_mutex); + + /* quit if already in the list */ + for (defrag_pool_iterator_t iter = defrag_pool.begin(); + iter != defrag_pool.end(); + ++iter) { + if ((*iter).table_id == index->table->id + && (*iter).index_id == index->id) { + mysql_mutex_unlock(&defrag_pool_mutex); + return; + } + } + + item.table_id = index->table->id; + item.index_id = index->id; + defrag_pool.push_back(item); + if (defrag_pool.size() == 1) { + /* Kick off dict stats optimizer work */ + dict_stats_schedule_now(); + } + mysql_mutex_unlock(&defrag_pool_mutex); +} + +/*****************************************************************//** +Delete a given index from the auto defrag pool. 
*/ +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index) /*!< in: if given, remove this index */ +{ + ut_a((table && !index) || (!table && index)); + ut_ad(!srv_read_only_mode); + ut_ad(dict_sys.frozen()); + + mysql_mutex_lock(&defrag_pool_mutex); + + defrag_pool_iterator_t iter = defrag_pool.begin(); + while (iter != defrag_pool.end()) { + if ((table && (*iter).table_id == table->id) + || (index + && (*iter).table_id == index->table->id + && (*iter).index_id == index->id)) { + /* erase() invalidates the iterator */ + iter = defrag_pool.erase(iter); + if (index) + break; + } else { + iter++; + } + } + + mysql_mutex_unlock(&defrag_pool_mutex); +} + +/*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +static void dict_stats_process_entry_from_defrag_pool(THD *thd) +{ + table_id_t table_id; + index_id_t index_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first index from the auto defrag pool */ + if (!dict_stats_defrag_pool_get(&table_id, &index_id)) + /* no index in defrag pool */ + return; + + /* If the table is no longer cached, we've already lost the in + memory stats so there's nothing really to write to disk. */ + MDL_ticket *mdl= nullptr; + if (dict_table_t *table= + dict_table_open_on_id(table_id, false, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED, + thd, &mdl)) + { + if (dict_index_t *index= !table->corrupted + ? dict_table_find_index_on_id(table, index_id) : nullptr) + if (index->is_btree()) + dict_stats_save_defrag_stats(index); + dict_table_close(table, false, thd, mdl); + } +} + +/** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. 
*/ +void dict_defrag_process_entries_from_defrag_pool(THD *thd) +{ + while (!defrag_pool.empty()) + dict_stats_process_entry_from_defrag_pool(thd); +} + +/*********************************************************************//** +Save defragmentation result. +@return DB_SUCCESS or error code */ +dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd) +{ + if (index->is_ibuf()) + return DB_SUCCESS; + + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? 
DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + row_mysql_lock_data_dictionary(trx); + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed", + index->stat_defrag_n_pages_freed, + nullptr, + "Number of pages freed during" + " last defragmentation run.", + trx); + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +static +ulint +btr_get_size_and_reserved( + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint dummy; + + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + *used = 0; + if (!root) { + return ULINT_UNDEFINED; + } + + mtr->x_lock_space(index->table->space); + + ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, used, mtr); + if (flag == BTR_TOTAL_SIZE) { + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, mtr); + *used += dummy; + } + + return(n); +} + 
+/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_defrag_stats( +/*============================*/ + dict_index_t* index) /*!< in: index */ +{ + if (index->is_ibuf()) + return DB_SUCCESS; + if (!index->is_readable()) + return dict_stats_report_error(index->table, true); + + const time_t now= time(nullptr); + mtr_t mtr; + ulint n_leaf_pages; + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr.commit(); + + if (n_leaf_reserved == ULINT_UNDEFINED) + return DB_SUCCESS; + + THD *thd= current_thd; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? 
DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + + row_mysql_lock_data_dictionary(trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, nullptr, + "Number of new page splits on leaves" + " since last defragmentation.", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_defrag", + n_leaf_pages, nullptr, + "Number of leaf pages when" + " this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_reserved", + n_leaf_reserved, nullptr, + "Number of pages reserved for" + " this index leaves" + " when this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc new file mode 100644 index 00000000..5bc7ab6e --- /dev/null +++ b/storage/innobase/dict/dict0dict.cc @@ -0,0 +1,4859 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file dict/dict0dict.cc +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include <my_config.h> +#include <string> + +#include "ha_prototypes.h" +#include <mysqld.h> +#include <strfunc.h> + +#include "dict0dict.h" +#include "fts0fts.h" +#include "fil0fil.h" +#include <algorithm> +#include "sql_class.h" +#include "sql_table.h" +#include <mysql/service_thd_mdl.h> + +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "buf0buf.h" +#include "data0type.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "dict0crea.h" +#include "dict0mem.h" +#include "dict0stats.h" +#include "fts0fts.h" +#include "fts0types.h" +#include "lock0lock.h" +#include "mach0data.h" +#include "mem0mem.h" +#include "page0page.h" +#include "page0zip.h" +#include "pars0pars.h" +#include "pars0sym.h" +#include "que0que.h" +#include "rem0cmp.h" +#include "row0log.h" +#include "row0merge.h" +#include "row0mysql.h" +#include "row0upd.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "trx0undo.h" +#include "trx0purge.h" + +#include <vector> +#include <algorithm> + +/** the dictionary system */ +dict_sys_t dict_sys; + +/** System table names; @see dict_system_id_t */ +const span<const char> dict_sys_t::SYS_TABLE[]= +{ + {C_STRING_WITH_LEN("SYS_TABLES")},{C_STRING_WITH_LEN("SYS_INDEXES")}, + {C_STRING_WITH_LEN("SYS_COLUMNS")},{C_STRING_WITH_LEN("SYS_FIELDS")}, + {C_STRING_WITH_LEN("SYS_FOREIGN")},{C_STRING_WITH_LEN("SYS_FOREIGN_COLS")}, + {C_STRING_WITH_LEN("SYS_VIRTUAL")} +}; + +/** Diagnostic message for exceeding the 
mutex_lock_wait() timeout */ +const char dict_sys_t::fatal_msg[]= + "innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch. " + "Please refer to " + "https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/"; + +/** Percentage of compression failures that are allowed in a single +round */ +ulong zip_failure_threshold_pct = 5; + +/** Maximum percentage of a page that can be allowed as a pad to avoid +compression failures */ +ulong zip_pad_max = 50; + +#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when + creating a table or index object */ +#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table + hash table fixed size in bytes */ +#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data + dictionary varying size in bytes */ + +/** Identifies generated InnoDB foreign key names */ +static char dict_ibfk[] = "_ibfk_"; + +/*******************************************************************//** +Tries to find column names for the index and sets the col field of the +index. +@param[in] index index +@param[in] add_v new virtual columns added along with an add index call +@return whether the column names were found */ +static +bool +dict_index_find_cols( + dict_index_t* index, + const dict_add_v_col_t* add_v); +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index); /*!< in: user representation of + a clustered index */ +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. 
+@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index); /*!< in: user representation of + a non-clustered index */ +/**********************************************************************//** +Builds the internal dictionary cache representation for an FTS index. +@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index); /*!< in: user representation of an FTS index */ + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict); /*!< in: TRUE if page being evicted + to make room in the table LRU list */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if validate OK */ +static +ibool +dict_lru_validate(void); +/*===================*/ +#endif /* UNIV_DEBUG */ + +/* Stream for storing detailed information about the latest foreign key +and unique key errors. Only created if !srv_read_only_mode */ +FILE* dict_foreign_err_file = NULL; +/* mutex protecting the foreign and unique error buffers */ +mysql_mutex_t dict_foreign_err_mutex; + +/********************************************************************//** +Checks if the database name in two table names is the same. 
+@return TRUE if same db name */ +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ +{ + for (; *name1 == *name2; name1++, name2++) { + if (*name1 == '/') { + return(TRUE); + } + ut_a(*name1); /* the names must contain '/' */ + } + return(FALSE); +} + +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. +@return table name */ +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ +{ + const char* s = strchr(name, '/'); + ut_a(s); + + return(s + 1); +} + +/** Decrement the count of open handles */ +void dict_table_close(dict_table_t *table) +{ + if (table->get_ref_count() == 1 && + dict_stats_is_persistent_enabled(table) && + strchr(table->name.m_name, '/')) + { + /* It looks like we are closing the last handle. The user could + have executed FLUSH TABLES in order to have the statistics reloaded + from the InnoDB persistent statistics tables. We must acquire + exclusive dict_sys.latch to prevent a race condition with another + thread concurrently acquiring a handle on the table. */ + dict_sys.lock(SRW_LOCK_CALL); + if (table->release()) + { + table->stats_mutex_lock(); + if (table->get_ref_count() == 0) + dict_stats_deinit(table); + table->stats_mutex_unlock(); + } + dict_sys.unlock(); + } + else + table->release(); +} + +/** Decrements the count of open handles of a table. +@param[in,out] table table +@param[in] dict_locked whether dict_sys.latch is being held +@param[in] thd thread to release MDL +@param[in] mdl metadata lock or NULL if the thread + is a foreground one. 
*/ +void +dict_table_close( + dict_table_t* table, + bool dict_locked, + THD* thd, + MDL_ticket* mdl) +{ + if (!dict_locked) + dict_table_close(table); + else + { + if (table->release() && dict_stats_is_persistent_enabled(table) && + strchr(table->name.m_name, '/')) + { + /* Force persistent stats re-read upon next open of the table so + that FLUSH TABLE can be used to forcibly fetch stats from disk if + they have been manually modified. */ + table->stats_mutex_lock(); + if (table->get_ref_count() == 0) + dict_stats_deinit(table); + table->stats_mutex_unlock(); + } + + ut_ad(dict_lru_validate()); + ut_ad(dict_sys.find(table)); + } + + if (!thd || !mdl); + else if (MDL_context *mdl_context= static_cast<MDL_context*> + (thd_mdl_context(thd))) + mdl_context->release_lock(mdl); +} + +/** Check if the table has a given (non_virtual) column. +@param[in] table table object +@param[in] col_name column name +@param[in] col_nr column number guessed, 0 as default +@return column number if the table has the specified column, +otherwise table->n_def */ +ulint +dict_table_has_column( + const dict_table_t* table, + const char* col_name, + ulint col_nr) +{ + ulint col_max = table->n_def; + + ut_ad(table); + ut_ad(col_name); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + if (col_nr < col_max + && innobase_strcasecmp( + col_name, dict_table_get_col_name(table, col_nr)) == 0) { + return(col_nr); + } + + /** The order of column may changed, check it with other columns */ + for (ulint i = 0; i < col_max; i++) { + if (i != col_nr + && innobase_strcasecmp( + col_name, dict_table_get_col_name(table, i)) == 0) { + + return(i); + } + } + + return(col_max); +} + +/** Retrieve the column name. 
+@param[in] table the table of this column */ +const char* dict_col_t::name(const dict_table_t& table) const +{ + ut_ad(table.magic_n == DICT_TABLE_MAGIC_N); + + size_t col_nr; + const char *s; + + if (is_virtual()) { + col_nr = size_t(reinterpret_cast<const dict_v_col_t*>(this) + - table.v_cols); + ut_ad(col_nr < table.n_v_def); + s = table.v_col_names; + } else { + col_nr = size_t(this - table.cols); + ut_ad(col_nr < table.n_def); + s = table.col_names; + } + + if (s) { + for (size_t i = 0; i < col_nr; i++) { + s += strlen(s) + 1; + } + } + + return(s); +} + +/** Returns a virtual column's name. +@param[in] table target table +@param[in] col_nr virtual column number (nth virtual column) +@return column name or NULL if column number out of range. */ +const char* +dict_table_get_v_col_name( + const dict_table_t* table, + ulint col_nr) +{ + const char* s; + + ut_ad(table); + ut_ad(col_nr < table->n_v_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + if (col_nr >= table->n_v_def) { + return(NULL); + } + + s = table->v_col_names; + + if (s != NULL) { + for (ulint i = 0; i < col_nr; i++) { + s += strlen(s) + 1; + } + } + + return(s); +} + +/** Search virtual column's position in InnoDB according to its position +in original table's position +@param[in] table target table +@param[in] col_nr column number (nth column in the MySQL table) +@return virtual column's position in InnoDB, ULINT_UNDEFINED if not find */ +static +ulint +dict_table_get_v_col_pos_for_mysql( + const dict_table_t* table, + ulint col_nr) +{ + ulint i; + + ut_ad(table); + ut_ad(col_nr < static_cast<ulint>(table->n_t_def)); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + for (i = 0; i < table->n_v_def; i++) { + if (col_nr == dict_get_v_col_mysql_pos( + table->v_cols[i].m_col.ind)) { + break; + } + } + + if (i == table->n_v_def) { + return(ULINT_UNDEFINED); + } + + return(i); +} + +/** Returns a virtual column's name according to its original +MySQL table position. 
+@param[in] table target table +@param[in] col_nr column number (nth column in the table) +@return column name. */ +static +const char* +dict_table_get_v_col_name_mysql( + const dict_table_t* table, + ulint col_nr) +{ + ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr); + + if (i == ULINT_UNDEFINED) { + return(NULL); + } + + return(dict_table_get_v_col_name(table, i)); +} + +/** Get nth virtual column according to its original MySQL table position +@param[in] table target table +@param[in] col_nr column number in MySQL Table definition +@return dict_v_col_t ptr */ +dict_v_col_t* +dict_table_get_nth_v_col_mysql( + const dict_table_t* table, + ulint col_nr) +{ + ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr); + + if (i == ULINT_UNDEFINED) { + return(NULL); + } + + return(dict_table_get_nth_v_col(table, i)); +} + + +/** Get all the FTS indexes on a table. +@param[in] table table +@param[out] indexes all FTS indexes on this table +@return number of FTS indexes */ +ulint +dict_table_get_all_fts_indexes( + const dict_table_t* table, + ib_vector_t* indexes) +{ + dict_index_t* index; + + ut_a(ib_vector_size(indexes) == 0); + + for (index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + + if (index->type == DICT_FTS) { + ib_vector_push(indexes, &index); + } + } + + return(ib_vector_size(indexes)); +} + +/** Looks for column n in an index. 
+@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@param[out] prefix_col_pos col num if prefix +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, + ulint n, + bool inc_prefix, + bool is_virtual, + ulint* prefix_col_pos) +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (prefix_col_pos) { + *prefix_col_pos = ULINT_UNDEFINED; + } + + if (is_virtual) { + col = &(dict_table_get_nth_v_col(index->table, n)->m_col); + } else { + col = dict_table_get_nth_col(index->table, n); + } + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos(col, index)); + } + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + if (prefix_col_pos) { + *prefix_col_pos = pos; + } + if (inc_prefix || field->prefix_len == 0) { + return(pos); + } + } + } + + return(ULINT_UNDEFINED); +} + +/** Check if the index contains a column or a prefix of that column. +@param[in] n column number +@param[in] is_virtual whether it is a virtual col +@return whether the index contains the column or its prefix */ +bool dict_index_t::contains_col_or_prefix(ulint n, bool is_virtual) const +{ + ut_ad(magic_n == DICT_INDEX_MAGIC_N); + + if (is_primary()) { + return(!is_virtual); + } + + const dict_col_t* col = is_virtual + ? &dict_table_get_nth_v_col(table, n)->m_col + : dict_table_get_nth_col(table, n); + + for (ulint pos = 0; pos < n_fields; pos++) { + if (col == fields[pos].col) { + return true; + } + } + + return false; +} + +/********************************************************************//** +Looks for a matching field in an index. 
The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ +{ + const dict_field_t* field; + const dict_field_t* field2; + ulint n_fields; + ulint pos; + + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + /* Are we looking for a MBR (Minimum Bound Box) field of + a spatial index */ + bool is_mbr_fld = (n == 0 && dict_index_is_spatial(index2)); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + /* The first field of a spatial index is a transformed + MBR (Minimum Bound Box) field made out of original column, + so its field->col still points to original cluster index + col, but the actual content is different. So we cannot + consider them equal if neither of them is MBR field */ + if (pos == 0 && dict_index_is_spatial(index) && !is_mbr_fld) { + continue; + } + + if (field->col == field2->col + && (field->prefix_len == 0 + || (field->prefix_len >= field2->prefix_len + && field2->prefix_len != 0))) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + +/** Parse the table file name into table name and database name. 
+@tparam dict_frozen whether the caller holds dict_sys.latch
+@param[in,out] db_name database name buffer
+@param[in,out] tbl_name table name buffer
+@param[out] db_name_len database name length
+@param[out] tbl_name_len table name length
+@return whether the table name is visible to SQL */
+/* Implementation notes:
+- The name is read from mdl_name (not name) while dict_sys.latch is held
+  in shared mode; both halves are copied into local buffers so that the
+  filename_to_tablename() conversions can run after the latch is released.
+- For a temporary name the function returns false and *tbl_name_len is
+  left unassigned; only *db_name_len is written in that case.
+- For a non-temporary name, everything from the first '#' onwards in the
+  table part is cut off before conversion. */
+template<bool dict_frozen>
+bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
+                              char (&tbl_name)[NAME_LEN + 1],
+                              size_t *db_name_len, size_t *tbl_name_len) const
+{
+  char db_buf[MAX_DATABASE_NAME_LEN + 1];
+  char tbl_buf[MAX_TABLE_NAME_LEN + 1];
+
+  if (!dict_frozen)
+    dict_sys.freeze(SRW_LOCK_CALL); /* protect against renaming */
+  ut_ad(dict_sys.frozen());
+  const size_t db_len= name.dblen();
+  ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
+
+  memcpy(db_buf, mdl_name.m_name, db_len);
+  db_buf[db_len]= 0;
+
+  /* The table part starts one byte after the database part
+  (skipping the separator). */
+  size_t tbl_len= strlen(mdl_name.m_name + db_len + 1);
+  const bool is_temp= mdl_name.is_temporary();
+
+  /* Deliberately empty branch: a temporary name is never truncated. */
+  if (is_temp);
+  else if (const char *is_part= static_cast<const char*>
+           (memchr(mdl_name.m_name + db_len + 1, '#', tbl_len)))
+    tbl_len= static_cast<size_t>(is_part - &mdl_name.m_name[db_len + 1]);
+
+  /* NOTE(review): tbl_len is not checked against MAX_TABLE_NAME_LEN here
+  (only db_len has a debug assertion); presumably guaranteed by the
+  callers that set mdl_name - confirm. */
+  memcpy(tbl_buf, mdl_name.m_name + db_len + 1, tbl_len);
+  tbl_buf[tbl_len]= 0;
+
+  if (!dict_frozen)
+    dict_sys.unfreeze();
+
+  *db_name_len= filename_to_tablename(db_buf, db_name,
+                                      MAX_DATABASE_NAME_LEN + 1, true);
+
+  if (is_temp)
+    return false;
+
+  *tbl_name_len= filename_to_tablename(tbl_buf, tbl_name,
+                                       MAX_TABLE_NAME_LEN + 1, true);
+  return true;
+}
+
+template bool
+dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
+                           size_t*, size_t*) const;
+
+/** Acquire MDL shared for the table name. 
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+/* Implementation notes (derived from the code below):
+- A null table or a null mdl pointer returns the table unchanged.
+- A name with an empty database part, or a name for which parse_name()
+  returns false, returns the table without any MDL being taken.
+- With trylock=false the function asserts on entry that dict_sys.latch is
+  held in shared mode, releases it around the MDL request, and holds it
+  again in shared mode on every return after that point.
+- With trylock=true the function acquires and releases the shared latch
+  itself.
+- After the MDL request the table is looked up again by its id; if its
+  name changed meanwhile, the MDL on the old name is released and the
+  whole sequence is retried with the new name. */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+                        THD *thd,
+                        MDL_ticket **mdl,
+                        dict_table_op_t table_op)
+{
+  if (!table || !mdl)
+    return table;
+
+  MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+  size_t db_len;
+  /* Never reassigned below: the return_without_mdl path always yields
+  nullptr. */
+  dict_table_t *not_found= nullptr;
+
+  if (trylock)
+  {
+    dict_sys.freeze(SRW_LOCK_CALL);
+    db_len= dict_get_db_name_len(table->name.m_name);
+    dict_sys.unfreeze();
+  }
+  else
+  {
+    ut_ad(dict_sys.frozen_not_locked());
+    db_len= dict_get_db_name_len(table->name.m_name);
+  }
+
+  if (db_len == 0)
+    return table; /* InnoDB system tables are not covered by MDL */
+
+  if (!mdl_context)
+    return nullptr;
+
+  /* Remember the id: the table is looked up again by id after the
+  MDL request. */
+  table_id_t table_id= table->id;
+  /* db_buf/tbl_buf hold the name the MDL is requested for;
+  db_buf1/tbl_buf1 receive the name found after the re-lookup. */
+  char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
+  char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
+  size_t tbl_len;
+  bool unaccessible= false;
+
+  /* parse_name<!trylock>: when trylock is false the latch is already held
+  by the caller, so parse_name() must not acquire it again. */
+  if (!table->parse_name<!trylock>(db_buf, tbl_buf, &db_len, &tbl_len))
+    /* The name of an intermediate table starts with #sql */
+    return table;
+
+retry:
+  if (!unaccessible && (!table->is_readable() || table->corrupted))
+  {
+    if (*mdl)
+    {
+      mdl_context->release_lock(*mdl);
+      *mdl= nullptr;
+    }
+    unaccessible= true;
+  }
+
+  /* In blocking mode the reference held on the table is dropped before
+  the latch is released; the table is re-acquired by id further below. */
+  if (!trylock)
+    table->release();
+
+  if (unaccessible)
+    return nullptr;
+
+  if (!trylock)
+    dict_sys.unfreeze();
+
+  {
+    MDL_request request;
+    MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED,
+                     MDL_EXPLICIT);
+    /* Both calls are treated as failed when they return true. */
+    if (trylock
+        ? mdl_context->try_acquire_lock(&request)
+        : mdl_context->acquire_lock(&request,
+                                    /* FIXME: use compatible type, and maybe
+                                    remove this parameter altogether! */
+                                    static_cast<double>(global_system_variables
+                                                        .lock_wait_timeout)))
+    {
+      /* In blocking mode a failed request is not fatal here: the code
+      continues to the re-lookup with *mdl == nullptr. */
+      *mdl= nullptr;
+      if (trylock)
+        return nullptr;
+    }
+    else
+    {
+      *mdl= request.ticket;
+      /* A successful try-lock call with a null ticket is handled as
+      "could not lock". */
+      if (trylock && !*mdl)
+        return nullptr;
+    }
+  }
+
+  dict_sys.freeze(SRW_LOCK_CALL);
+  table= dict_sys.find_table(table_id);
+  if (table)
+    table->acquire();
+  if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+  {
+    /* Not cached: loading needs the exclusive latch, so swap shared for
+    exclusive and back again. */
+    dict_sys.unfreeze();
+    dict_sys.lock(SRW_LOCK_CALL);
+    table= dict_load_table_on_id(table_id,
+                                 table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+                                 ? DICT_ERR_IGNORE_RECOVER_LOCK
+                                 : DICT_ERR_IGNORE_FK_NOKEY);
+    if (table)
+      table->acquire();
+    dict_sys.unlock();
+    dict_sys.freeze(SRW_LOCK_CALL);
+  }
+
+  if (!table || !table->is_accessible())
+  {
+    /* NOTE(review): when table is non-null here, the reference taken by
+    acquire() above is not released on this path - confirm the intended
+    ownership. */
+    table= nullptr;
+return_without_mdl:
+    /* Also entered by goto from below, skipping the assignment above. */
+    if (trylock)
+      dict_sys.unfreeze();
+    if (*mdl)
+    {
+      mdl_context->release_lock(*mdl);
+      *mdl= nullptr;
+    }
+    return not_found;
+  }
+
+  size_t db1_len, tbl1_len;
+
+  if (!table->parse_name<true>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+  {
+    /* The table was renamed to #sql prefix.
+    Release MDL (if any) for the old name and return. */
+    goto return_without_mdl;
+  }
+
+  if (*mdl)
+  {
+    /* The MDL is valid only if the table still carries the name that the
+    lock was requested for. */
+    if (db_len == db1_len && tbl_len == tbl1_len &&
+        !memcmp(db_buf, db_buf1, db_len) &&
+        !memcmp(tbl_buf, tbl_buf1, tbl_len))
+    {
+      if (trylock)
+        dict_sys.unfreeze();
+      return table;
+    }
+
+    /* The table was renamed. Release MDL for the old name and
+    try to acquire MDL for the new name. */
+    mdl_context->release_lock(*mdl);
+    *mdl= nullptr;
+  }
+
+  /* Adopt the current name (including the terminating NUL bytes) and
+  start over. */
+  db_len= db1_len;
+  tbl_len= tbl1_len;
+
+  memcpy(tbl_buf, tbl_buf1, tbl_len + 1);
+  memcpy(db_buf, db_buf1, db_len + 1);
+  goto retry;
+}
+
+template dict_table_t* dict_acquire_mdl_shared<false>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+template dict_table_t* dict_acquire_mdl_shared<true>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+
+/** Look up a table by numeric identifier. 
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if does not exist */
+/* Implementation notes:
+- When dict_locked is false, this function takes dict_sys.latch itself
+  (shared for the cache lookup, exclusive for loading) and releases it
+  before returning.
+- When dict_locked is true, no latch operation is performed here and no
+  MDL is acquired, regardless of thd.
+- A returned table has had acquire() called on it. */
+dict_table_t *dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+                                    dict_table_op_t table_op, THD *thd,
+                                    MDL_ticket **mdl)
+{
+  if (!dict_locked)
+    dict_sys.freeze(SRW_LOCK_CALL);
+
+  dict_table_t *table= dict_sys.find_table(table_id);
+
+  if (table)
+  {
+    /* Cache hit. */
+    table->acquire();
+    if (thd && !dict_locked)
+      table= dict_acquire_mdl_shared<false>(table, thd, mdl, table_op);
+  }
+  else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+  {
+    /* Cache miss: load the definition under the exclusive latch. */
+    if (!dict_locked)
+    {
+      dict_sys.unfreeze();
+      dict_sys.lock(SRW_LOCK_CALL);
+    }
+    table= dict_load_table_on_id(table_id,
+                                 table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+                                 ? DICT_ERR_IGNORE_RECOVER_LOCK
+                                 : DICT_ERR_IGNORE_FK_NOKEY);
+    if (table)
+      table->acquire();
+    if (!dict_locked)
+    {
+      dict_sys.unlock();
+      if (table && thd)
+      {
+        /* The blocking variant is called with the shared latch held. */
+        dict_sys.freeze(SRW_LOCK_CALL);
+        table= dict_acquire_mdl_shared<false>(table, thd, mdl, table_op);
+        dict_sys.unfreeze();
+      }
+      /* The latch has already been released on this path, so return
+      here instead of falling through to the unfreeze() below. */
+      return table;
+    }
+  }
+
+  if (!dict_locked)
+    dict_sys.unfreeze();
+
+  return table;
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index. 
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n,	/*!< in: column number */
+	ulint*			prefix_col_pos)
+{
+  /* The first index of the table is the one that is searched. */
+  const dict_index_t *first_index= dict_table_get_first_index(table);
+  const ulint found= dict_index_get_nth_col_pos(first_index, n,
+                                                prefix_col_pos);
+  DBUG_ASSERT(found <= dict_index_t::MAX_N_FIELDS);
+  return static_cast<unsigned>(found);
+}
+
+/********************************************************************//**
+Tells whether a table column is one of the first
+dict_index_get_n_unique() fields of the table's first index, i.e. part of
+the clustered key. Only the column identity is compared, so a field defined
+on a prefix of the column counts as a match.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n)	/*!< in: column number */
+{
+	const dict_col_t* const		wanted
+		= dict_table_get_nth_col(table, n);
+	const dict_index_t* const	first_index
+		= dict_table_get_first_index(table);
+	const ulint			n_key_fields
+		= dict_index_get_n_unique(first_index);
+
+	for (ulint i = 0; i < n_key_fields; i++) {
+		if (dict_index_get_nth_field(first_index, i)->col == wanted) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/** Initialise the data dictionary cache. 
*/
+void dict_sys_t::create()
+{
+  ut_ad(this == &dict_sys);
+  ut_ad(!is_initialised());
+  m_initialised= true;
+  /* Both lists are linked through the same dict_table_t::table_LRU
+  member. */
+  UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU);
+  UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU);
+
+  /* All three hash tables are sized from the current buffer pool size. */
+  const ulint hash_size = buf_pool_get_curr_size()
+    / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
+
+  table_hash.create(hash_size);
+  table_id_hash.create(hash_size);
+  temp_id_hash.create(hash_size);
+
+  latch.SRW_LOCK_INIT(dict_operation_lock_key);
+
+  /* The temporary file is only created when the server is not in
+  read-only mode; failure to create it is fatal (ut_a). */
+  if (!srv_read_only_mode)
+  {
+    dict_foreign_err_file= os_file_create_tmpfile();
+    ut_a(dict_foreign_err_file);
+  }
+
+  mysql_mutex_init(dict_foreign_err_mutex_key, &dict_foreign_err_mutex,
+                   nullptr);
+}
+
+
+/** Acquire dict_sys.latch in exclusive mode, keeping track of how long
+exclusive waiters have been waiting.
+The first waiter records its start time in latch_ex_wait_start and clears
+it once the latch is granted. A thread that finds a start time already
+recorded computes how long that earlier waiter has been waiting, aborts the
+server if srv_fatal_semaphore_wait_threshold is reached, warns if more than
+a quarter of the threshold has elapsed, and then waits for the latch
+itself. */
+void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
+{
+  ulonglong now= my_hrtime_coarse().val, old= 0;
+  /* Succeeds only if no start time is recorded (the value is 0). */
+  if (latch_ex_wait_start.compare_exchange_strong
+      (old, now, std::memory_order_relaxed, std::memory_order_relaxed))
+  {
+    latch.wr_lock(SRW_LOCK_ARGS(file, line));
+    latch_ex_wait_start.store(0, std::memory_order_relaxed);
+    ut_ad(!latch_readers);
+    ut_ad(!latch_ex);
+    ut_d(latch_ex= pthread_self());
+    return;
+  }
+
+  /* The failed compare-exchange stored the recorded start time in old. */
+  ut_ad(old);
+  /* We could have old > now due to our use of my_hrtime_coarse(). */
+  /* The difference is divided by 1000000 and reported as seconds in the
+  warning below. */
+  ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
+  const ulong threshold= srv_fatal_semaphore_wait_threshold;
+
+  if (waited >= threshold)
+    ib::fatal() << fatal_msg;
+
+  if (waited > threshold / 4)
+    ib::warn() << "A long wait (" << waited
+               << " seconds) was observed for dict_sys.latch";
+  latch.wr_lock(SRW_LOCK_ARGS(file, line));
+  ut_ad(!latch_readers);
+  ut_ad(!latch_ex);
+  ut_d(latch_ex= pthread_self());
+}
+
+#ifdef UNIV_PFS_RWLOCK
+/** Release the exclusive dict_sys.latch; the debug bookkeeping checks that
+the calling thread is the recorded exclusive holder. */
+ATTRIBUTE_NOINLINE void dict_sys_t::unlock()
+{
+  ut_ad(latch_ex == pthread_self());
+  ut_ad(!latch_readers);
+  ut_d(latch_ex= 0);
+  latch.wr_unlock();
+}
+
+/** Acquire dict_sys.latch in shared mode and count the reader in debug
+builds. */
+ATTRIBUTE_NOINLINE void dict_sys_t::freeze(const char *file, unsigned line)
+{
+  latch.rd_lock(file, line);
+  ut_ad(!latch_ex);
+  ut_d(latch_readers++);
+}
+
+/** Release a shared dict_sys.latch. */
+ATTRIBUTE_NOINLINE void dict_sys_t::unfreeze()
+{
+  ut_ad(!latch_ex);
+  /* The decrement is intentionally inside the debug-only assertion: it
+  checks that the counter was nonzero and undoes the ut_d() increment
+  made in freeze(). */
+  ut_ad(latch_readers--);
+  latch.rd_unlock();
+}
+#endif /* UNIV_PFS_RWLOCK */
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function. 
+@param[in] table_name Table name +@param[in] dict_locked whether dict_sys.latch is being held exclusively +@param[in] ignore_err error to be ignored when loading the table +@return table +@retval nullptr if does not exist */ +dict_table_t* +dict_table_open_on_name( + const char* table_name, + bool dict_locked, + dict_err_ignore_t ignore_err) +{ + dict_table_t *table; + DBUG_ENTER("dict_table_open_on_name"); + DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name)); + + const span<const char> name{table_name, strlen(table_name)}; + + if (!dict_locked) + { + dict_sys.freeze(SRW_LOCK_CALL); + table= dict_sys.find_table(name); + if (table) + { + ut_ad(table->cached); + if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) && + !table->is_readable() && table->corrupted) + { + ulint algo = table->space->get_compression_algo(); + if (algo <= PAGE_ALGORITHM_LAST && !fil_comp_algo_loaded(algo)) { + my_printf_error(ER_PROVIDER_NOT_LOADED, + "Table %s is compressed with %s, which is not currently loaded. " + "Please load the %s provider plugin to open the table", + MYF(ME_ERROR_LOG), table->name, + page_compression_algorithms[algo], page_compression_algorithms[algo]); + } else { + my_printf_error(ER_TABLE_CORRUPT, + "Table %s is corrupted. Please drop the table and recreate.", + MYF(ME_ERROR_LOG), table->name); + } + dict_sys.unfreeze(); + DBUG_RETURN(nullptr); + } + table->acquire(); + dict_sys.unfreeze(); + DBUG_RETURN(table); + } + dict_sys.unfreeze(); + dict_sys.lock(SRW_LOCK_CALL); + } + + table= dict_sys.load_table(name, ignore_err); + + if (table) + { + ut_ad(table->cached); + if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) && + !table->is_readable() && table->corrupted) + { + ib::error() << "Table " << table->name + << " is corrupted. 
Please drop the table and recreate."; + if (!dict_locked) + dict_sys.unlock(); + DBUG_RETURN(nullptr); + } + + table->acquire(); + } + + ut_ad(dict_lru_validate()); + if (!dict_locked) + dict_sys.unlock(); + + DBUG_RETURN(table); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ +{ + ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->cached); + + /* NOTE: the system columns MUST be added in the following order + (so that they can be indexed by the numerical value of DATA_ROW_ID, + etc.) and as the last columns of the table memory object. + The clustered index will not always physically contain all system + columns. */ + + dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS, + DATA_ROW_ID | DATA_NOT_NULL, + DATA_ROW_ID_LEN); + + compile_time_assert(DATA_ROW_ID == 0); + dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, + DATA_TRX_ID | DATA_NOT_NULL, + DATA_TRX_ID_LEN); + compile_time_assert(DATA_TRX_ID == 1); + dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR | DATA_NOT_NULL, + DATA_ROLL_PTR_LEN); + compile_time_assert(DATA_ROLL_PTR == 2); + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ + compile_time_assert(DATA_N_SYS_COLS == 3); +} + +/** Add the table definition to the data dictionary cache */ +void dict_table_t::add_to_cache() +{ + cached = TRUE; + + dict_sys.add(this); +} + +/** Add a table definition to the data dictionary cache */ +inline void dict_sys_t::add(dict_table_t* table) +{ + ut_ad(!find(table)); + + ulint fold = my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)); + + table->autoinc_mutex.init(); + table->lock_mutex_init(); + + /* 
Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, &table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + !strcmp(table2->name.m_name, table->name.m_name)); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different name */ + HASH_SEARCH_ALL(name_hash, &table_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + /* Look for a table with the same id: error if such exists */ + hash_table_t* id_hash = table->is_temporary() + ? &temp_id_hash : &table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); + { + dict_table_t* table2; + HASH_SEARCH(id_hash, id_hash, id_fold, + dict_table_t*, table2, ut_ad(table2->cached), + table2->id == table->id); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different id */ + HASH_SEARCH_ALL(id_hash, id_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); + } + + UT_LIST_ADD_FIRST(table->can_be_evicted ? table_LRU : table_non_LRU, + table); + ut_ad(dict_lru_validate()); +} + +/** Test whether a table can be evicted from dict_sys.table_LRU. +@param table table to be considered for eviction +@return whether the table can be evicted */ +TRANSACTIONAL_TARGET +static bool dict_table_can_be_evicted(dict_table_t *table) +{ + ut_ad(dict_sys.locked()); + ut_a(table->can_be_evicted); + ut_a(table->foreign_set.empty()); + ut_a(table->referenced_set.empty()); + + if (table->get_ref_count() == 0) { + /* The transaction commit and rollback are called from + outside the handler interface. This means that there is + a window where the table->n_ref_count can be zero but + the table instance is in "use". 
*/ + + if (lock_table_has_locks(table)) { + return false; + } + +#ifdef BTR_CUR_HASH_ADAPT + /* We cannot really evict the table if adaptive hash + index entries are pointing to any of its indexes. */ + for (const dict_index_t* index + = dict_table_get_first_index(table); + index; index = dict_table_get_next_index(index)) { + if (index->n_ahi_pages()) { + return false; + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + ut_ad(!table->fts); + return true; + } + + return false; +} + +#ifdef BTR_CUR_HASH_ADAPT +/** @return a clone of this */ +dict_index_t *dict_index_t::clone() const +{ + ut_ad(n_fields); + ut_ad(is_btree()); + ut_ad(online_status == ONLINE_INDEX_COMPLETE); + ut_ad(is_committed()); + ut_ad(!is_dummy); + ut_ad(!parser); + ut_ad(!online_log); + ut_ad(!rtr_track); + + const size_t size= sizeof *this + n_fields * sizeof(*fields) + +#ifdef BTR_CUR_ADAPT + sizeof *search_info + +#endif + 1 + strlen(name) + + n_uniq * (sizeof *stat_n_diff_key_vals + + sizeof *stat_n_sample_sizes + + sizeof *stat_n_non_null_key_vals); + + mem_heap_t* heap= mem_heap_create(size); + dict_index_t *index= static_cast<dict_index_t*> + (mem_heap_alloc(heap, sizeof *this)); + *index= *this; + index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); + index->heap= heap; + index->name= mem_heap_strdup(heap, name); + index->fields= static_cast<dict_field_t*> + (mem_heap_dup(heap, fields, n_fields * sizeof *fields)); +#ifdef BTR_CUR_ADAPT + index->search_info= btr_search_info_create(index->heap); +#endif /* BTR_CUR_ADAPT */ + index->stat_n_diff_key_vals= static_cast<ib_uint64_t*> + (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_diff_key_vals)); + index->stat_n_sample_sizes= static_cast<ib_uint64_t*> + (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_sample_sizes)); + index->stat_n_non_null_key_vals= static_cast<ib_uint64_t*> + (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_non_null_key_vals)); + new (&index->zip_pad.mutex) std::mutex(); + return index; +} + +/** Clone this index for lazy dropping of 
the adaptive hash. +@return this or a clone */ +dict_index_t *dict_index_t::clone_if_needed() +{ + if (!search_info->ref_count) + return this; + dict_index_t *prev= UT_LIST_GET_PREV(indexes, this); + + table->autoinc_mutex.wr_lock(); + UT_LIST_REMOVE(table->indexes, this); + UT_LIST_ADD_LAST(table->freed_indexes, this); + dict_index_t *index= clone(); + set_freed(); + if (prev) + UT_LIST_INSERT_AFTER(table->indexes, prev, index); + else + UT_LIST_ADD_FIRST(table->indexes, index); + table->autoinc_mutex.wr_unlock(); + return index; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/** Evict unused, unlocked tables from table_LRU. +@param half whether to consider half the tables only (instead of all) +@return number of tables evicted */ +ulint dict_sys_t::evict_table_LRU(bool half) +{ +#ifdef MYSQL_DYNAMIC_PLUGIN + constexpr ulint max_tables = 400; +#else + extern ulong tdc_size; + const ulint max_tables = tdc_size; +#endif + ulint n_evicted = 0; + + lock(SRW_LOCK_CALL); + ut_ad(dict_lru_validate()); + + const ulint len = UT_LIST_GET_LEN(table_LRU); + + if (len < max_tables) { +func_exit: + unlock(); + return(n_evicted); + } + + const ulint check_up_to = half ? len / 2 : 0; + ulint i = len; + + /* Find a suitable candidate to evict from the cache. Don't scan the + entire LRU list. Only scan pct_check list entries. */ + + for (dict_table_t *table = UT_LIST_GET_LAST(table_LRU); + table && i > check_up_to && (len - n_evicted) > max_tables; --i) { + dict_table_t* prev_table = UT_LIST_GET_PREV(table_LRU, table); + + if (dict_table_can_be_evicted(table)) { + remove(table, true); + ++n_evicted; + } + + table = prev_table; + } + + goto func_exit; +} + +/** Looks for an index with the given id given a table instance. 
+@param[in] table table instance +@param[in] id index id +@return index or NULL */ +dict_index_t* +dict_table_find_index_on_id( + const dict_table_t* table, + index_id_t id) +{ + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (id == index->id) { + /* Found */ + + return(index); + } + } + + return(NULL); +} + +/** Function object to remove a foreign key constraint from the +referenced_set of the referenced table. The foreign key object is +also removed from the dictionary cache. The foreign key constraint +is not removed from the foreign_set of the table containing the +constraint. */ +struct dict_foreign_remove_partial +{ + void operator()(dict_foreign_t* foreign) { + dict_table_t* table = foreign->referenced_table; + if (table != NULL) { + table->referenced_set.erase(foreign); + } + dict_foreign_free(foreign); + } +}; + +/** This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@param[in] old_path Pathname +@param[in] tablename Contains new base name +@return own: new full pathname */ +static char *dir_pathname(const char *old_path, span<const char> tablename) +{ + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + const char *base_name= tablename.data(); + for (const char *last= tablename.end(); last > tablename.data(); last--) + { + if (last[-1] == '/') + { + base_name= last; + break; + } + } + const size_t base_name_len= tablename.end() - base_name; + + /* Find the offset of the last slash. 
We will strip off the + old basename.ibd which starts after that slash. */ + const char *last_slash= strrchr(old_path, '/'); +#ifdef _WIN32 + if (const char *last= strrchr(old_path, '\\')) + if (last > last_slash) + last_slash= last; +#endif + + size_t dir_len= last_slash + ? size_t(last_slash - old_path) + : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + size_t new_path_len= dir_len + base_name_len + sizeof "/.ibd"; + char *new_path= static_cast<char*>(ut_malloc_nokey(new_path_len)); + memcpy(new_path, old_path, dir_len); + snprintf(new_path + dir_len, new_path_len - dir_len, "/%.*s.ibd", + int(base_name_len), base_name); + return new_path; +} + +/** Rename the data file. +@param new_name name of the table +@param replace whether to replace the file with the new name + (as part of rolling back TRUNCATE) */ +dberr_t +dict_table_t::rename_tablespace(span<const char> new_name, bool replace) const +{ + ut_ad(dict_table_is_file_per_table(this)); + ut_ad(!is_temporary()); + + if (!space) + return DB_SUCCESS; + + const char *old_path= UT_LIST_GET_FIRST(space->chain)->name; + const bool data_dir= DICT_TF_HAS_DATA_DIR(flags); + char *path= data_dir + ? dir_pathname(old_path, new_name) + : fil_make_filepath(nullptr, new_name, IBD, false); + dberr_t err; + if (!path) + err= DB_OUT_OF_MEMORY; + else if (!strcmp(path, old_path)) + err= DB_SUCCESS; + else if (data_dir && + DB_SUCCESS != RemoteDatafile::create_link_file(new_name, path)) + err= DB_TABLESPACE_EXISTS; + else + { + space->x_lock(); + err= space->rename(path, true, replace); + if (data_dir) + { + if (err == DB_SUCCESS) + new_name= {name.m_name, strlen(name.m_name)}; + RemoteDatafile::delete_link_file(new_name); + } + space->x_unlock(); + } + + ut_free(path); + return err; +} + +/**********************************************************************//** +Renames a table object. 
+@return TRUE if success */
+/* NOTE: the "@return TRUE" wording above is outdated. The function
+returns a dberr_t: the error reported by
+dict_table_t::rename_tablespace() if renaming the data file fails, and
+DB_SUCCESS otherwise.
+
+Outline:
+1. rename the data file (file-per-table tablespaces only);
+2. move the table from its old to its new position in the name hash and
+   overwrite table->name, optionally keeping the old name in mdl_name;
+3. if the new name is a temporary name, drop all foreign key information
+   from the cache and return;
+4. otherwise rewrite the table name and the constraint ids stored in the
+   foreign key objects of the table, and the referenced table name in
+   the constraints that refer to it.
+The caller must hold dict_sys.latch exclusively (asserted below). */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	span<const char> new_name,	/*!< in: new name */
+	bool		replace_new_file)
+					/*!< in: whether to replace the
+					file with the new name
+					(as part of rolling back TRUNCATE) */
+{
+	dict_foreign_t*	foreign;
+	char		old_name[MAX_FULL_NAME_LEN + 1];
+
+	ut_ad(dict_sys.locked());
+
+	/* store the old/current name to an automatic variable */
+	const size_t old_name_len = strlen(table->name.m_name);
+	ut_a(old_name_len < sizeof old_name);
+	strcpy(old_name, table->name.m_name);
+
+	/* Hash value of the new name, used for the re-insertion below.
+	A table with the new name must not exist in the cache. */
+	const uint32_t fold= my_crc32c(0, new_name.data(), new_name.size());
+	ut_a(!dict_sys.find_table(new_name));
+
+	/* The first branch is intentionally empty: only file-per-table
+	tablespaces have a data file to rename. A nonzero dberr_t aborts
+	the rename before the cache is modified. */
+	if (!dict_table_is_file_per_table(table)) {
+	} else if (dberr_t err = table->rename_tablespace(new_name,
+							  replace_new_file)) {
+		return err;
+	}
+
+	/* Remove table from the hash tables of tables */
+	HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
+		    my_crc32c(0, table->name.m_name, old_name_len), table);
+
+	/* keep_mdl_name ends up true when the old name is not temporary
+	and either the new name contains no '/' or the text starting at its
+	first '/' begins with "/#sql". */
+	bool keep_mdl_name = !table->name.is_temporary();
+
+	if (!keep_mdl_name) {
+	} else if (const char* s = static_cast<const char*>
+		   (memchr(new_name.data(), '/', new_name.size()))) {
+		keep_mdl_name = new_name.end() - s >= 5
+			&& !memcmp(s, "/#sql", 5);
+	}
+
+	if (keep_mdl_name) {
+		/* Preserve the original table name for
+		dict_table_t::parse_name() and dict_acquire_mdl_shared(). */
+		table->mdl_name.m_name = mem_heap_strdup(table->heap,
+							 table->name.m_name);
+	}
+
+	if (new_name.size() > strlen(table->name.m_name)) {
+		/* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
+		memory fragmentation, we assume a repeated calls of
+		ut_realloc() with the same size do not cause fragmentation */
+		ut_a(new_name.size() <= MAX_FULL_NAME_LEN);
+
+		table->name.m_name = static_cast<char*>(
+			ut_realloc(table->name.m_name, MAX_FULL_NAME_LEN + 1));
+	}
+	/* new_name is a span and not necessarily NUL-terminated, hence
+	the explicit terminator. */
+	memcpy(table->name.m_name, new_name.data(), new_name.size());
+	table->name.m_name[new_name.size()] = '\0';
+
+	/* Unless the old name was preserved above, mdl_name aliases the
+	(possibly reallocated) name buffer. */
+	if (!keep_mdl_name) {
+		table->mdl_name.m_name = table->name.m_name;
+	}
+
+	/* Add table to hash table of tables */
+	HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
+		    table);
+
+	if (table->name.is_temporary()) {
+		/* In ALTER TABLE we think of the rename table operation
+		in the direction table -> temporary table (#sql...)
+		as dropping the table with the old name and creating
+		a new with the new name. Thus we kind of drop the
+		constraints from the dictionary cache here. The foreign key
+		constraints will be inherited to the new table from the
+		system tables through a call of dict_load_foreigns. */
+
+		/* Remove the foreign constraints from the cache */
+		std::for_each(table->foreign_set.begin(),
+			      table->foreign_set.end(),
+			      dict_foreign_remove_partial());
+		table->foreign_set.clear();
+
+		/* Reset table field in referencing constraints */
+		for (dict_foreign_set::iterator it
+			= table->referenced_set.begin();
+		     it != table->referenced_set.end();
+		     ++it) {
+
+			foreign = *it;
+			foreign->referenced_table = NULL;
+			foreign->referenced_index = NULL;
+
+		}
+
+		/* Make the set of referencing constraints empty */
+		table->referenced_set.clear();
+
+		return(DB_SUCCESS);
+	}
+
+	/* Update the table name fields in foreign constraints, and update also
+	the constraint id of new format >= 4.0.18 constraints. Note that at
+	this point we have already changed table->name to the new name. */
+
+	/* Each constraint is taken out of table->foreign_set (and out of
+	the referenced_set of the referenced table) before its id is
+	changed, collected in fk_set, and re-inserted afterwards. */
+	dict_foreign_set	fk_set;
+
+	for (;;) {
+
+		dict_foreign_set::iterator	it
+			= table->foreign_set.begin();
+
+		if (it == table->foreign_set.end()) {
+			break;
+		}
+
+		foreign = *it;
+
+		if (foreign->referenced_table) {
+			foreign->referenced_table->referenced_set.erase(foreign);
+		}
+
+		if (strlen(foreign->foreign_table_name)
+		    < strlen(table->name.m_name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->foreign_table_name = mem_heap_strdup(
+				foreign->heap, table->name.m_name);
+			dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+		} else {
+			strcpy(foreign->foreign_table_name,
+			       table->name.m_name);
+			dict_mem_foreign_table_name_lookup_set(foreign, FALSE);
+		}
+		if (strchr(foreign->id, '/')) {
+			/* This is a >= 4.0.18 format id */
+
+			ulint	db_len;
+			char*	old_id;
+			char    old_name_cs_filename[MAX_FULL_NAME_LEN+1];
+			uint    errors = 0;
+
+			/* All table names are internally stored in charset
+			my_charset_filename (except the temp tables and the
+			partition identifier suffix in partition tables). The
+			foreign key constraint names are internally stored
+			in UTF-8 charset.  The variable fkid here is used
+			to store foreign key constraint name in charset
+			my_charset_filename for comparison further below. */
+			char    fkid[MAX_TABLE_NAME_LEN * 2 + 20];
+
+			/* The old table name in my_charset_filename is stored
+			in old_name_cs_filename */
+
+			strcpy(old_name_cs_filename, old_name);
+			old_name_cs_filename[MAX_FULL_NAME_LEN] = '\0';
+			if (!dict_table_t::is_temporary_name(old_name)) {
+				innobase_convert_to_system_charset(
+					strchr(old_name_cs_filename, '/') + 1,
+					strchr(old_name, '/') + 1,
+					MAX_TABLE_NAME_LEN, &errors);
+
+				if (errors) {
+					/* There has been an error to convert
+					old table into UTF-8.  This probably
+					means that the old table name is
+					actually in UTF-8. */
+					innobase_convert_to_filename_charset(
+						strchr(old_name_cs_filename,
+						       '/') + 1,
+						strchr(old_name, '/') + 1,
+						MAX_TABLE_NAME_LEN);
+				} else {
+					/* Old name already in
+					my_charset_filename */
+					strcpy(old_name_cs_filename, old_name);
+					old_name_cs_filename[MAX_FULL_NAME_LEN]
+						= '\0';
+				}
+			}
+
+			strncpy(fkid, foreign->id, (sizeof fkid) - 1);
+			fkid[(sizeof fkid) - 1] = '\0';
+
+			const bool on_tmp = dict_table_t::is_temporary_name(
+				fkid);
+
+			if (!on_tmp) {
+				innobase_convert_to_filename_charset(
+					strchr(fkid, '/') + 1,
+					strchr(foreign->id, '/') + 1,
+					MAX_TABLE_NAME_LEN+20);
+			}
+
+			/* Private copy of the old id, freed with ut_free()
+			at the end of this branch. */
+			old_id = mem_strdup(foreign->id);
+
+			/* A generated id is the old table name followed by
+			the dict_ibfk marker and further characters. */
+			if (strlen(fkid) > strlen(old_name_cs_filename)
+			    + ((sizeof dict_ibfk) - 1)
+			    && !memcmp(fkid, old_name_cs_filename,
+				       strlen(old_name_cs_filename))
+			    && !memcmp(fkid + strlen(old_name_cs_filename),
+				       dict_ibfk, (sizeof dict_ibfk) - 1)) {
+
+				/* This is a generated >= 4.0.18 format id */
+
+				char	table_name[MAX_TABLE_NAME_LEN + 1];
+				/* NOTE: this declaration shadows the
+				'errors' variable of the enclosing block. */
+				uint	errors = 0;
+
+				if (strlen(table->name.m_name)
+				    > strlen(old_name)) {
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
+						foreign->heap,
+						strlen(table->name.m_name)
+						+ strlen(old_id) + 1));
+				}
+
+				/* Convert the table name to UTF-8 */
+				strncpy(table_name, table->name.m_name,
+					MAX_TABLE_NAME_LEN);
+				table_name[MAX_TABLE_NAME_LEN] = '\0';
+				innobase_convert_to_system_charset(
+					strchr(table_name, '/') + 1,
+					strchr(table->name.m_name, '/') + 1,
+					MAX_TABLE_NAME_LEN, &errors);
+
+				if (errors) {
+					/* Table name could not be converted
+					from charset my_charset_filename to
+					UTF-8. This means that the table name
+					is already in UTF-8 (#mysql50#). */
+					strncpy(table_name, table->name.m_name,
+						MAX_TABLE_NAME_LEN);
+					table_name[MAX_TABLE_NAME_LEN] = '\0';
+				}
+
+				/* Replace the prefix 'databasename/tablename'
+				with the new names */
+				strcpy(foreign->id, table_name);
+				if (on_tmp) {
+					strcat(foreign->id,
+					       old_id + strlen(old_name));
+				} else {
+					sprintf(strchr(foreign->id, '/') + 1,
+						"%s%s",
+						strchr(table_name, '/') +1,
+						strstr(old_id, "_ibfk_") );
+				}
+
+			} else {
+				/* This is a >= 4.0.18 format id where the user
+				gave the id name */
+				/* db_len counts the database name plus one
+				more byte (the separator following it). */
+				db_len = dict_get_db_name_len(
+					table->name.m_name) + 1;
+
+				if (db_len - 1
+				    > dict_get_db_name_len(foreign->id)) {
+
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
+						foreign->heap,
+						db_len + strlen(old_id) + 1));
+				}
+
+				/* Replace the database prefix in id with the
+				one from table->name */
+
+				memcpy(foreign->id,
+				       table->name.m_name, db_len);
+
+				strcpy(foreign->id + db_len,
+				       dict_remove_db_name(old_id));
+			}
+
+			ut_free(old_id);
+		}
+
+		table->foreign_set.erase(it);
+		fk_set.insert(foreign);
+
+		if (foreign->referenced_table) {
+			foreign->referenced_table->referenced_set.insert(foreign);
+		}
+	}
+
+	ut_a(table->foreign_set.empty());
+	table->foreign_set.swap(fk_set);
+
+	/* Finally store the new table name in every constraint that
+	refers to this table. */
+	for (dict_foreign_set::iterator it = table->referenced_set.begin();
+	     it != table->referenced_set.end();
+	     ++it) {
+
+		foreign = *it;
+
+		if (strlen(foreign->referenced_table_name)
+		    < strlen(table->name.m_name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->referenced_table_name = mem_heap_strdup(
+				foreign->heap, table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, TRUE);
+		} else {
+			/* Use the same buffer */
+			strcpy(foreign->referenced_table_name,
+			       table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, FALSE);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. 
This is used in +DISCARD TABLESPACE. */ +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ +{ + ut_ad(dict_sys.locked()); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->is_temporary()); + + /* Remove the table from the hash table of id's */ + + HASH_DELETE(dict_table_t, id_hash, &dict_sys.table_id_hash, + ut_fold_ull(table->id), table); + table->id = new_id; + + /* Add the table back to the hash table */ + HASH_INSERT(dict_table_t, id_hash, &dict_sys.table_id_hash, + ut_fold_ull(table->id), table); +} + +/** Evict a table definition from the InnoDB data dictionary cache. +@param[in,out] table cached table definition to be evicted +@param[in] lru whether this is part of least-recently-used eviction +@param[in] keep whether to keep (not free) the object */ +void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep) +{ + dict_foreign_t* foreign; + dict_index_t* index; + + ut_ad(dict_lru_validate()); + ut_a(table->get_ref_count() == 0); + ut_a(table->n_rec_locks == 0); + ut_ad(find(table)); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the foreign constraints from the cache */ + std::for_each(table->foreign_set.begin(), table->foreign_set.end(), + dict_foreign_remove_partial()); + table->foreign_set.clear(); + + /* Reset table field in referencing constraints */ + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + } + + /* Remove the indexes from the cache */ + + for (index = UT_LIST_GET_LAST(table->indexes); + index != NULL; + index = UT_LIST_GET_LAST(table->indexes)) { + + dict_index_remove_from_cache_low(table, index, lru); + } + + /* Remove table from the hash tables of tables */ + + HASH_DELETE(dict_table_t, name_hash, &table_hash, + 
my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)), + table); + + hash_table_t* id_hash = table->is_temporary() + ? &temp_id_hash : &table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); + HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table); + + /* Remove table from LRU or non-LRU list. */ + if (table->can_be_evicted) { + UT_LIST_REMOVE(table_LRU, table); + } else { + UT_LIST_REMOVE(table_non_LRU, table); + } + + /* Free virtual column template if any */ + if (table->vc_templ != NULL) { + dict_free_vc_templ(table->vc_templ); + UT_DELETE(table->vc_templ); + } + + table->lock_mutex_destroy(); + + if (keep) { + table->autoinc_mutex.destroy(); + return; + } + +#ifdef BTR_CUR_HASH_ADAPT + if (table->fts) { + fts_optimize_remove_table(table); + table->fts->~fts_t(); + table->fts = nullptr; + } + + table->autoinc_mutex.wr_lock(); + + ulint freed = UT_LIST_GET_LEN(table->freed_indexes); + + table->vc_templ = NULL; + table->id = 0; + table->autoinc_mutex.wr_unlock(); + + if (UNIV_UNLIKELY(freed != 0)) { + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + table->autoinc_mutex.destroy(); + dict_mem_table_free(table); +} + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ +{ + static const char* reserved_names[] = { + "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" + }; + + compile_time_assert(UT_ARR_SIZE(reserved_names) == DATA_N_SYS_COLS); + + for (ulint i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + if (innobase_strcasecmp(name, reserved_names[i]) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/** Adds an index to the dictionary cache, with possible indexing newly +added column. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! 
+@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v) +{ + dict_index_t* new_index; + ulint n_ord; + ulint i; + + ut_ad(dict_sys.locked()); + ut_ad(index->n_def == index->n_fields); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_ibuf(index)); + + ut_d(mem_heap_validate(index->heap)); + ut_a(!dict_index_is_clust(index) + || UT_LIST_GET_LEN(index->table->indexes) == 0); + ut_ad(dict_index_is_clust(index) || !index->table->no_rollback()); + + if (!dict_index_find_cols(index, add_v)) { + + dict_mem_index_free(index); + index = NULL; + return DB_CORRUPTION; + } + + /* Build the cache internal representation of the index, + containing also the added system fields */ + + if (dict_index_is_clust(index)) { + new_index = dict_index_build_internal_clust(index); + } else { + new_index = (index->type & DICT_FTS) + ? dict_index_build_internal_fts(index) + : dict_index_build_internal_non_clust(index); + new_index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + } + + /* Set the n_fields value in new_index to the actual defined + number of fields in the cache internal representation */ + + new_index->n_fields = new_index->n_def; + new_index->trx_id = index->trx_id; + new_index->set_committed(index->is_committed()); + new_index->nulls_equal = index->nulls_equal; + + n_ord = new_index->n_uniq; + /* Flag the ordering columns and also set column max_prefix */ + + for (i = 0; i < n_ord; i++) { + const dict_field_t* field + = dict_index_get_nth_field(new_index, i); + + /* Check the column being added in the index for + the first time and flag the ordering column. 
*/ + if (field->col->ord_part == 0 ) { + field->col->max_prefix = field->prefix_len; + field->col->ord_part = 1; + } else if (field->prefix_len == 0) { + /* Set the max_prefix for a column to 0 if + its prefix length is 0 (for this index) + even if it was a part of any other index + with some prefix length. */ + field->col->max_prefix = 0; + } else if (field->col->max_prefix != 0 + && field->prefix_len + > field->col->max_prefix) { + /* Set the max_prefix value based on the + prefix_len. */ + ut_ad(field->col->is_binary() + || field->prefix_len % field->col->mbmaxlen == 0 + || field->prefix_len % 4 == 0); + field->col->max_prefix = field->prefix_len; + } + ut_ad(field->col->ord_part == 1); + } + + new_index->stat_n_diff_key_vals = + static_cast<ib_uint64_t*>(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_diff_key_vals))); + + new_index->stat_n_sample_sizes = + static_cast<ib_uint64_t*>(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_sample_sizes))); + + new_index->stat_n_non_null_key_vals = + static_cast<ib_uint64_t*>(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_non_null_key_vals))); + + new_index->stat_index_size = 1; + new_index->stat_n_leaf_pages = 1; + + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + + /* Add the new index as the last index for the table */ + + UT_LIST_ADD_LAST(new_index->table->indexes, new_index); +#ifdef BTR_CUR_ADAPT + new_index->search_info = btr_search_info_create(new_index->heap); +#endif /* BTR_CUR_ADAPT */ + + new_index->page = unsigned(page_no); + new_index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); + + new_index->n_core_fields = new_index->n_fields; + + 
dict_mem_index_free(index); + index = new_index; + return DB_SUCCESS; +} + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +TRANSACTIONAL_TARGET +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict) /*!< in: TRUE if index being evicted + to make room in the table LRU list */ +{ + ut_ad(table && index); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(dict_sys.locked()); + ut_ad(table->id); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->freed()); +#endif /* BTR_CUR_HASH_ADAPT */ + + /* No need to acquire the dict_index_t::lock here because + there can't be any active operations on this index (or table). */ + + if (index->online_log) { + row_log_free(index->online_log); + index->online_log = NULL; + } + + /* Remove the index from the list of indexes of the table */ + UT_LIST_REMOVE(table->indexes, index); + + /* The index is being dropped, remove any compression stats for it. */ + if (!lru_evict && DICT_TF_GET_ZIP_SSIZE(index->table->flags)) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index.erase(index->id); + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + /* Remove the index from affected virtual column index list */ + index->detach_columns(); + +#ifdef BTR_CUR_HASH_ADAPT + /* We always create search info whether or not adaptive + hash index is enabled or not. */ + /* We are not allowed to free the in-memory index struct + dict_index_t until all entries in the adaptive hash index + that point to any of the page belonging to his b-tree index + are dropped. This is so because dropping of these entries + require access to dict_index_t struct. 
To avoid such scenario + We keep a count of number of such pages in the search_info and + only free the dict_index_t struct when this count drops to + zero. See also: dict_table_can_be_evicted() */ + + if (index->n_ahi_pages()) { + table->autoinc_mutex.wr_lock(); + index->set_freed(); + UT_LIST_ADD_LAST(table->freed_indexes, index); + table->autoinc_mutex.wr_unlock(); + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + index->lock.free(); + + dict_mem_index_free(index); +} + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index) /*!< in, own: index */ +{ + dict_index_remove_from_cache_low(table, index, FALSE); +} + +/** Tries to find column names for the index and sets the col field of the +index. +@param[in] table table +@param[in,out] index index +@param[in] add_v new virtual columns added along with an add index call +@return whether the column names were found */ +static +bool +dict_index_find_cols( + dict_index_t* index, + const dict_add_v_col_t* add_v) +{ + std::vector<ulint, ut_allocator<ulint> > col_added; + std::vector<ulint, ut_allocator<ulint> > v_col_added; + + const dict_table_t* table = index->table; + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(dict_sys.locked()); + + for (ulint i = 0; i < index->n_fields; i++) { + ulint j; + dict_field_t* field = dict_index_get_nth_field(index, i); + + for (j = 0; j < table->n_cols; j++) { + if (!innobase_strcasecmp(dict_table_get_col_name(table, j), + field->name)) { + + /* Check if same column is being assigned again + which suggest that column has duplicate name. */ + bool exists = + std::find(col_added.begin(), + col_added.end(), j) + != col_added.end(); + + if (exists) { + /* Duplicate column found. 
*/ + goto dup_err; + } + + field->col = dict_table_get_nth_col(table, j); + + col_added.push_back(j); + + goto found; + } + } + + /* Let's check if it is a virtual column */ + for (j = 0; j < table->n_v_cols; j++) { + if (!strcmp(dict_table_get_v_col_name(table, j), + field->name)) { + + /* Check if same column is being assigned again + which suggest that column has duplicate name. */ + bool exists = + std::find(v_col_added.begin(), + v_col_added.end(), j) + != v_col_added.end(); + + if (exists) { + /* Duplicate column found. */ + break; + } + + field->col = reinterpret_cast<dict_col_t*>( + dict_table_get_nth_v_col(table, j)); + + v_col_added.push_back(j); + + goto found; + } + } + + if (add_v) { + for (j = 0; j < add_v->n_v_col; j++) { + if (!strcmp(add_v->v_col_name[j], + field->name)) { + field->col = const_cast<dict_col_t*>( + &add_v->v_col[j].m_col); + goto found; + } + } + } + +dup_err: +#ifdef UNIV_DEBUG + /* It is an error not to find a matching column. */ + ib::error() << "No matching column for " << field->name + << " in index " << index->name + << " of table " << table->name; +#endif /* UNIV_DEBUG */ + return(FALSE); + +found: + ; + } + + return(TRUE); +} + +/** Add a column to an index. 
+@param index index +@param table table +@param col column +@param prefix_len column prefix length +@param descending whether to use descending order */ +void dict_index_add_col(dict_index_t *index, const dict_table_t *table, + dict_col_t *col, ulint prefix_len, bool descending) +{ + dict_field_t* field; + const char* col_name; + + if (col->is_virtual()) { + dict_v_col_t* v_col = reinterpret_cast<dict_v_col_t*>(col); + /* Register the index with the virtual column index list */ + v_col->v_indexes.push_front(dict_v_idx_t(index, index->n_def)); + col_name = dict_table_get_v_col_name_mysql( + table, dict_col_get_no(col)); + } else { + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + } + + dict_mem_index_add_field(index, col_name, prefix_len); + + field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1); + + field->col = col; + field->fixed_len = static_cast<uint16_t>( + dict_col_get_fixed_size( + col, dict_table_is_comp(table))) + & ((1U << 10) - 1); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = static_cast<uint16_t>(prefix_len) + & ((1U << 10) - 1); + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. */ + + if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) { + field->fixed_len = 0; + } + + field->descending = descending; + + /* The comparison limit above must be constant. If it were + changed, the disk format of some fixed-length columns would + change, which would be a disaster. */ + compile_time_assert(DICT_MAX_FIXED_COL_LEN == 768); + + if (!(col->prtype & DATA_NOT_NULL)) { + index->n_nullable++; + } +} + +/*******************************************************************//** +Copies fields contained in index2 to index1. 
*/ +static +void +dict_index_copy( +/*============*/ + dict_index_t* index1, /*!< in: index to copy to */ + const dict_index_t* index2, /*!< in: index to copy from */ + ulint start, /*!< in: first position to copy */ + ulint end) /*!< in: last position to copy */ +{ + dict_field_t* field; + ulint i; + + /* Copy fields contained in index2 */ + + for (i = start; i < end; i++) { + + field = dict_index_get_nth_field(index2, i); + + dict_index_add_col(index1, index2->table, field->col, + field->prefix_len, field->descending); + } +} + +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ +{ + ulint i; + + if (dict_index_is_ibuf(index)) { + dtuple_set_types_binary(tuple, n_fields); + + return; + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* ifield; + dtype_t* dfield_type; + + ifield = dict_index_get_nth_field(index, i); + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dict_col_copy_type(dict_field_get_col(ifield), dfield_type); + if (dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield_type->mtype)) { + dfield_type->prtype |= DATA_GIS_MBR; + } + } +} + +/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). 
+@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table) +{ + /* tuple could have more virtual columns than existing table, + if we are calling this for creating index along with adding + virtual columns */ + ulint n_fields = ut_min(dtuple_get_n_v_fields(tuple), + static_cast<ulint>(table->n_v_def)); + + for (ulint i = 0; i < n_fields; i++) { + + dfield_t* dfield = dtuple_get_nth_v_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type( + &(dict_table_get_nth_v_col(table, i)->m_col), + dtype); + } +} +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type(dict_table_get_nth_col(table, i), dtype); + } + + dict_table_copy_v_types(tuple, table); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. 
+@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index) /*!< in: user representation of + a clustered index */ +{ + dict_table_t* table = index->table; + dict_index_t* new_index; + dict_field_t* field; + ulint trx_id_pos; + ulint i; + ibool* indexed; + + ut_ad(index->is_primary()); + ut_ad(!index->has_virtual()); + + ut_ad(dict_sys.locked()); + + /* Create a new index object with certainly enough fields */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, + unsigned(index->n_fields + + table->n_cols)); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy the fields of index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + if (dict_index_is_unique(index)) { + /* Only the fields defined so far are needed to identify + the index entry uniquely */ + + new_index->n_uniq = new_index->n_def; + } else { + /* Also the row id is needed to identify the entry */ + new_index->n_uniq = unsigned(new_index->n_def + 1) + & dict_index_t::MAX_N_FIELDS; + } + + new_index->trx_id_offset = 0; + + /* Add system columns, trx id first */ + + trx_id_pos = new_index->n_def; + + compile_time_assert(DATA_ROW_ID == 0); + compile_time_assert(DATA_TRX_ID == 1); + compile_time_assert(DATA_ROLL_PTR == 2); + + if (!dict_index_is_unique(index)) { + dict_index_add_col(new_index, table, + dict_table_get_sys_col( + table, DATA_ROW_ID), + 0); + trx_id_pos++; + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_TRX_ID), 0); + + for (i = 0; i < trx_id_pos; i++) { + + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(new_index, i), + dict_table_is_comp(table)); + + if (fixed_size == 0) { + new_index->trx_id_offset = 0; + + break; + } + + 
dict_field_t* field = dict_index_get_nth_field( + new_index, i); + if (field->prefix_len > 0) { + new_index->trx_id_offset = 0; + + break; + } + + /* Add fixed_size to new_index->trx_id_offset. + Because the latter is a bit-field, an overflow + can theoretically occur. Check for it. */ + fixed_size += new_index->trx_id_offset; + + new_index->trx_id_offset = static_cast<unsigned>(fixed_size) + & ((1U << 12) - 1); + + if (new_index->trx_id_offset != fixed_size) { + /* Overflow. Pretend that this is a + variable-length PRIMARY KEY. */ + ut_ad(0); + new_index->trx_id_offset = 0; + break; + } + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_ROLL_PTR), 0); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast<ibool*>( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index non-system columns of table not yet included + there */ + for (i = 0; i + DATA_N_SYS_COLS < ulint(table->n_cols); i++) { + dict_col_t* col = dict_table_get_nth_col(table, i); + ut_ad(col->mtype != DATA_SYS); + + if (!indexed[col->ind]) { + dict_index_add_col(new_index, table, col, 0); + } + } + + ut_free(indexed); + + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); + + new_index->n_core_null_bytes = table->supports_instant() + ? 
dict_index_t::NO_CORE_NULL_BYTES + : static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + new_index->cached = TRUE; + + return(new_index); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index) /*!< in: user representation of + a non-clustered index */ +{ + dict_field_t* field; + dict_index_t* new_index; + dict_index_t* clust_index; + dict_table_t* table = index->table; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_sys.locked()); + + /* The clustered index should be the first in the list of indexes */ + clust_index = UT_LIST_GET_FIRST(table->indexes); + + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + ut_ad(!dict_index_is_ibuf(clust_index)); + + /* Create a new index */ + new_index = dict_mem_index_create( + index->table, index->name, index->type, + ulint(index->n_fields + 1 + clust_index->n_uniq)); + + /* Copy other relevant data from the old index + struct to the new struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast<ibool*>( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + if (field->col->is_virtual()) { + continue; + } + + /* If there is only a prefix of 
the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index the columns necessary to determine the clustered + index entry uniquely */ + + for (i = 0; i < clust_index->n_uniq; i++) { + field = dict_index_get_nth_field(clust_index, i); + + if (!indexed[field->col->ind] || index->is_spatial()) { + dict_index_add_col(new_index, table, field->col, + field->prefix_len, + field->descending); + } + } + + ut_free(indexed); + + if (dict_index_is_unique(index)) { + new_index->n_uniq = index->n_fields; + } else { + new_index->n_uniq = new_index->n_def; + } + + /* Set the n_fields value in new_index to the actual defined + number of fields */ + + new_index->n_fields = new_index->n_def; + + new_index->cached = TRUE; + + return(new_index); +} + +/*********************************************************************** +Builds the internal dictionary cache representation for an FTS index. 
+@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index) /*!< in: user representation of an FTS index */ +{ + dict_index_t* new_index; + + ut_ad(index->type & DICT_FTS); + ut_ad(dict_sys.locked()); + + /* Create a new index */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, index->n_fields); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + new_index->n_uniq = 0; + new_index->cached = TRUE; + + dict_table_t* table = index->table; + + if (table->fts->cache == NULL) { + table->fts->cache = fts_cache_create(table); + } + + mysql_mutex_lock(&table->fts->cache->init_lock); + /* Notify the FTS cache about this index. */ + fts_cache_index_cache_create(table, new_index); + mysql_mutex_unlock(&table->fts->cache->init_lock); + + return(new_index); +} +/*====================== FOREIGN KEY PROCESSING ========================*/ + +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ +{ + ut_ad(dict_sys.locked()); + ut_a(foreign); + + if (foreign->referenced_table != NULL) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (foreign->foreign_table != NULL) { + foreign->foreign_table->foreign_set.erase(foreign); + } + + dict_foreign_free(foreign); +} + +/**********************************************************************//** +Looks for the foreign constraint from the foreign and referenced lists +of a table. 
+@return foreign constraint */ +static +dict_foreign_t* +dict_foreign_find( +/*==============*/ + dict_table_t* table, /*!< in: table object */ + dict_foreign_t* foreign) /*!< in: foreign constraint */ +{ + ut_ad(dict_sys.frozen()); + + ut_ad(dict_foreign_set_validate(table->foreign_set)); + ut_ad(dict_foreign_set_validate(table->referenced_set)); + + dict_foreign_set::iterator it = table->foreign_set.find(foreign); + + if (it != table->foreign_set.end()) { + return(*it); + } + + it = table->referenced_set.find(foreign); + + if (it != table->referenced_set.end()) { + return(*it); + } + + return(NULL); +} + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + ut_ad(dict_sys.frozen()); + + if (error) { + *error = FK_INDEX_NOT_FOUND; + } + + for (dict_index_t* index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + if (types_idx != index + && !index->to_be_dropped + && !dict_index_is_online_ddl(index) + && dict_foreign_qualify_index( + table, col_names, columns, n_cols, + index, types_idx, + check_charsets, check_null, + error, err_col_no, err_index)) { + if (error) { + *error = FK_SUCCESS; + } + + return(index); + } + } + + return(NULL); +} +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report_low( +/*==========================*/ + FILE* file, /*!< in: output stream */ + const char* name) /*!< in: table name */ +{ + rewind(file); + ut_print_timestamp(file); + fprintf(file, " Error in foreign key constraint of table %s:\n", + name); +} + +/**********************************************************************//** +Report an error in a foreign key definition. 
*/
static
void
dict_foreign_error_report(
/*======================*/
	FILE*		file,	/*!< in: output stream */
	dict_foreign_t*	fk,	/*!< in: foreign key constraint */
	const char*	msg)	/*!< in: the error message */
{
	/* The whole report is written while holding
	dict_foreign_err_mutex. */
	mysql_mutex_lock(&dict_foreign_err_mutex);

	/* Header, the message itself, then the constraint printed in
	CREATE TABLE format. */
	dict_foreign_error_report_low(file, fk->foreign_table_name);
	fputs(msg, file);
	fputs(" Constraint:\n", file);

	const std::string	definition
		= dict_print_info_on_foreign_key_in_create_format(
			NULL, fk, TRUE);

	fputs(definition.c_str(), file);
	putc('\n', file);

	/* Name the index of the foreign key, when one is known. */
	if (fk->foreign_index != NULL) {
		fprintf(file, "The index in the foreign key in table is"
			" %s\n%s\n", fk->foreign_index->name(),
			FOREIGN_KEY_CONSTRAINTS_MSG);
	}

	mysql_mutex_unlock(&dict_foreign_err_mutex);
}

/**********************************************************************//**
Adds a foreign key constraint object to the dictionary cache.  May free
the object if there already is an object with the same identifier in.
At least one of the foreign table and the referenced table must already
be in the dictionary cache!
@return DB_SUCCESS or error code */
dberr_t
dict_foreign_add_to_cache(
/*======================*/
	dict_foreign_t*		foreign,
				/*!< in, own: foreign key constraint */
	const char**		col_names,
				/*!< in: column names, or NULL to use
				foreign->foreign_table->col_names */
	bool			check_charsets,
				/*!< in: whether to check charset
				compatibility */
	dict_err_ignore_t	ignore_err)
				/*!< in: error to be ignored */
{
	dict_table_t*	for_table;
	dict_table_t*	ref_table;
	dict_foreign_t*	for_in_cache		= NULL;
	dict_index_t*	index;
	ibool		added_to_referenced_list= FALSE;
	FILE*		ef			= dict_foreign_err_file;

	DBUG_ENTER("dict_foreign_add_to_cache");
	DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id));

	ut_ad(dict_sys.locked());

	/* Look up both endpoints of the constraint in the dictionary
	cache by their lookup names. */
	for_table = dict_sys.find_table(
		{foreign->foreign_table_name_lookup,
		 strlen(foreign->foreign_table_name_lookup)});

	ref_table = dict_sys.find_table(
		{foreign->referenced_table_name_lookup,
		 strlen(foreign->referenced_table_name_lookup)});
	/* At least one of the two tables must be cached. */
	ut_a(for_table || ref_table);

	/* Check whether a matching constraint object is already attached
	to either table; the foreign table is consulted first. */
	if (for_table) {
		for_in_cache = dict_foreign_find(for_table, foreign);
	}

	if (!for_in_cache && ref_table) {
		for_in_cache = dict_foreign_find(ref_table, foreign);
	}

	if (for_in_cache) {
		/* A cached object exists: the caller's copy is released
		and the cached one is used from here on.
		NOTE(review): the later "for_in_cache == foreign" tests
		compare against this released pointer value (without
		dereferencing it); they are presumably always false on
		this path - confirm. */
		dict_foreign_free(foreign);
	} else {
		for_in_cache = foreign;

	}

	/* Attach the referenced (parent) side if the table is cached and
	the constraint is not linked to it yet. */
	if (ref_table && !for_in_cache->referenced_table) {
		index = dict_foreign_find_index(
			ref_table, NULL,
			for_in_cache->referenced_col_names,
			for_in_cache->n_fields, for_in_cache->foreign_index,
			check_charsets, false);

		/* A missing index is an error unless the caller asked to
		ignore it; in that case a NULL index is stored below. */
		if (index == NULL
		    && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
			dict_foreign_error_report(
				ef, for_in_cache,
				"there is no index in referenced table"
				" which would contain\n"
				"the columns as the first columns,"
				" or the data types in the\n"
				"referenced table do not match"
				" the ones in table.");

			/* Only free the object if it is still the
			caller's own one. */
			if (for_in_cache == foreign) {
				dict_foreign_free(foreign);
			}

			DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT);
		}

		for_in_cache->referenced_table = ref_table;
		for_in_cache->referenced_index = index;

		std::pair<dict_foreign_set::iterator, bool>	ret
			= ref_table->referenced_set.insert(for_in_cache);

		ut_a(ret.second);	/* second is true if the insertion
					took place */
		/* Remembered so that the insertion can be undone if the
		foreign side fails below. */
		added_to_referenced_list = TRUE;
	}

	/* Attach the foreign (child) side in the same manner. The last
	argument passes the ON ... SET NULL bits of the constraint type. */
	if (for_table && !for_in_cache->foreign_table) {
		index = dict_foreign_find_index(
			for_table, col_names,
			for_in_cache->foreign_col_names,
			for_in_cache->n_fields,
			for_in_cache->referenced_index, check_charsets,
			for_in_cache->type
			& (DICT_FOREIGN_ON_DELETE_SET_NULL
			   | DICT_FOREIGN_ON_UPDATE_SET_NULL));

		if (index == NULL
		    && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
			dict_foreign_error_report(
				ef, for_in_cache,
				"there is no index in the table"
				" which would contain\n"
				"the columns as the first columns,"
				" or the data types in the\n"
				"table do not match"
				" the ones in the referenced table\n"
				"or one of the ON ... SET NULL columns"
				" is declared NOT NULL.");

			if (for_in_cache == foreign) {
				/* Undo the referenced_set insertion made
				earlier in this call before freeing. */
				if (added_to_referenced_list) {
					const dict_foreign_set::size_type
						n = ref_table->referenced_set
						  .erase(for_in_cache);

					ut_a(n == 1);	/* the number of
							elements removed must
							be one */
				}

				dict_foreign_free(foreign);
			}

			DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT);
		}

		for_in_cache->foreign_table = for_table;
		for_in_cache->foreign_index = index;

		std::pair<dict_foreign_set::iterator, bool>	ret
			= for_table->foreign_set.insert(for_in_cache);

		ut_a(ret.second);	/* second is true if the insertion
					took place */
	}

	/* We need to move the table to the non-LRU end of the table LRU
	list. Otherwise it will be evicted from the cache. */

	if (ref_table != NULL) {
		dict_sys.prevent_eviction(ref_table);
	}

	if (for_table != NULL) {
		dict_sys.prevent_eviction(for_table);
	}

	ut_ad(dict_lru_validate());
	DBUG_RETURN(DB_SUCCESS);
}

/*********************************************************************//**
Scans from pointer onwards. Stops if is at the start of a copy of
'string' where characters are compared without case sensitivity, and
only outside `` or "" quotes. Stops also at NUL.
@return scanned up to this */
static
const char*
dict_scan_to(
/*=========*/
	const char*	ptr,	/*!< in: scan from */
	const char*	string)	/*!< in: look for this */
{
	/* Currently open quote character, or NUL when outside quotes. */
	char		quote	= '\0';
	/* Set when the previous character inside quotes was an
	unescaped backslash. */
	bool		escape	= false;

	for (; *ptr; ptr++) {
		if (*ptr == quote) {
			/* Closing quote character: do not look for
			starting quote or the keyword. */

			/* If the quote character is escaped by a
			backslash, ignore it. */
			if (escape) {
				escape = false;
			} else {
				quote = '\0';
			}
		} else if (quote) {
			/* Within quotes: do nothing. */
			if (escape) {
				escape = false;
			} else if (*ptr == '\\') {
				escape = true;
			}
		} else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') {
			/* Starting quote: remember the quote character. */
			quote = *ptr;
		} else {
			/* Outside quotes: look for the keyword. */
			ulint	i;
			/* A NUL in ptr[] can never equal a character of
			string[], so the comparison stops at the end of
			the input. */
			for (i = 0; string[i]; i++) {
				if (toupper((int)(unsigned char)(ptr[i]))
				    != toupper((int)(unsigned char)
					       (string[i]))) {
					goto nomatch;
				}
			}
			/* Full match: leave ptr at its first character. */
			break;
nomatch:
			;
		}
	}

	return(ptr);
}

/*********************************************************************//**
Accepts a specified string. Comparisons are case-insensitive.
@return if string was accepted, the pointer is moved after that, else
ptr is returned */
static
const char*
dict_accept(
/*========*/
	CHARSET_INFO*	cs,	/*!< in: the character set of ptr */
	const char*	ptr,	/*!< in: scan from this */
	const char*	string,	/*!< in: accept only this string as the next
				non-whitespace string */
	ibool*		success)/*!< out: TRUE if accepted */
{
	const char*	old_ptr = ptr;
	const char*	old_ptr2;

	*success = FALSE;

	/* Skip leading whitespace. */
	while (my_isspace(cs, *ptr)) {
		ptr++;
	}

	old_ptr2 = ptr;

	ptr = dict_scan_to(ptr, string);

	/* The string is accepted only if the scan did not move, that is,
	the match starts exactly at the first non-whitespace character. */
	if (*ptr == '\0' || old_ptr2 != ptr) {
		return(old_ptr);
	}

	*success = TRUE;

	return ptr + strlen(string);
}

/*********************************************************************//**
Scans an id. For the lexical definition of an 'id', see the code below.
Strips backquotes or double quotes from around the id.
@return scanned to */
static
const char*
dict_scan_id(
/*=========*/
	CHARSET_INFO*	cs,	/*!< in: the character set of ptr */
	const char*	ptr,	/*!< in: scanned to */
	mem_heap_t*	heap,	/*!< in: heap where to allocate the id
				(NULL=id will not be allocated, but it
				will point to string near ptr) */
	const char**	id,	/*!< out,own: the id; NULL if no id was
				scannable */
	ibool		table_id,/*!< in: TRUE=convert the allocated id
				as a table name; FALSE=convert to UTF-8 */
	ibool		accept_also_dot)
				/*!< in: TRUE if also a dot can appear in a
				non-quoted id; in a quoted id it can appear
				always */
{
	char		quote	= '\0';
	ulint		len	= 0;
	const char*	s;
	char*		str;
	char*		dst;

	*id = NULL;

	while (my_isspace(cs, *ptr)) {
		ptr++;
	}

	if (*ptr == '\0') {

		return(ptr);
	}

	if (*ptr == '`' || *ptr == '"') {
		quote = *ptr++;
	}

	/* s marks the first character of the id (after any quote). */
	s = ptr;

	if (quote) {
		/* Quoted id: a doubled quote character stands for one
		literal quote and counts as a single character in len. */
		for (;;) {
			if (!*ptr) {
				/* Syntax error */
				return(ptr);
			}
			if (*ptr == quote) {
				ptr++;
				if (*ptr != quote) {
					break;
				}
			}
			ptr++;
			len++;
		}
	} else {
		/* Unquoted id: ends at whitespace, parenthesis, comma,
		NUL, or at a dot unless accept_also_dot is set. */
		while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')'
		       && (accept_also_dot || *ptr != '.')
		       && *ptr != ',' && *ptr != '\0') {

			ptr++;
		}

		len = ulint(ptr - s);
	}

	if (heap == NULL) {
		/* no heap given: id will point to source string */
		*id = s;
		return(ptr);
	}

	if (quote) {
		char*	d;

		str = d = static_cast<char*>(
			mem_heap_alloc(heap, len + 1));

		/* Copy while collapsing doubled quote characters. */
		while (len--) {
			if ((*d++ = *s++) == quote) {
				s++;
			}
		}
		*d++ = 0;
		/* d was advanced past the terminator, so len now also
		counts the trailing NUL. */
		len = ulint(d - str);
		ut_ad(*s == quote);
		ut_ad(s + 1 == ptr);
	} else {
		str = mem_heap_strdupl(heap, s, len);
	}

	if (!table_id) {
		/* This label is also the jump target of the prefixed
		table-name branch below. */
convert_id:
		/* Convert the identifier from connection character set
		to UTF-8. */
		len = 3 * len + 1;
		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));

		innobase_convert_from_id(cs, dst, str, len);
	} else if (!strncmp(str, srv_mysql50_table_name_prefix,
			    sizeof(srv_mysql50_table_name_prefix) - 1)) {
		/* This is a pre-5.1 table name
		containing chars other than [A-Za-z0-9].
		Discard the prefix and use raw UTF-8 encoding. */
		str += sizeof(srv_mysql50_table_name_prefix) - 1;
		len -= sizeof(srv_mysql50_table_name_prefix) - 1;
		goto convert_id;
	} else {
		/* Encode using filename-safe characters. */
		len = 5 * len + 1;
		*id = dst = static_cast<char*>(mem_heap_alloc(heap, len));

		innobase_convert_from_table_id(cs, dst, str, len);
	}

	return(ptr);
}

/*********************************************************************//**
Open a table from its database and table name, this is currently used by
foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len, /*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in/out: heap memory */ + CHARSET_INFO* from_cs) /*!< in: table name charset */ +{ + char* ref; + char db_name[MAX_DATABASE_NAME_LEN]; + char tbl_name[MAX_TABLE_NAME_LEN]; + CHARSET_INFO* to_cs = &my_charset_filename; + uint errors; + ut_ad(database_name || name); + ut_ad(table_name); + + if (!strncmp(table_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. 
*/ + table_name += sizeof(srv_mysql50_table_name_prefix) - 1; + table_name_len -= sizeof(srv_mysql50_table_name_prefix) - 1; + + to_cs = system_charset_info; + } + + table_name_len = strconvert(from_cs, table_name, table_name_len, to_cs, + tbl_name, MAX_TABLE_NAME_LEN, &errors); + table_name = tbl_name; + + if (database_name) { + to_cs = &my_charset_filename; + if (!strncmp(database_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + database_name + += sizeof(srv_mysql50_table_name_prefix) - 1; + database_name_len + -= sizeof(srv_mysql50_table_name_prefix) - 1; + to_cs = system_charset_info; + } + + database_name_len = strconvert( + from_cs, database_name, database_name_len, to_cs, + db_name, MAX_DATABASE_NAME_LEN, &errors); + database_name = db_name; + } else { + /* Use the database name of the foreign key table */ + + database_name = name; + database_name_len = dict_get_db_name_len(name); + } + + /* Copy database_name, '/', table_name, '\0' */ + const size_t len = database_name_len + table_name_len + 1; + ref = static_cast<char*>(mem_heap_alloc(heap, len + 1)); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + /* Values; 0 = Store and compare as given; case sensitive + 1 = Store and compare in lower; case insensitive + 2 = Store as given, compare in lower; case semi-sensitive */ + if (lower_case_table_names == 2) { + innobase_casedn_str(ref); + *table = dict_sys.load_table({ref, len}); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + } else { +#ifndef _WIN32 + if (lower_case_table_names == 1) { + innobase_casedn_str(ref); + } +#else + innobase_casedn_str(ref); +#endif /* !_WIN32 */ + *table = dict_sys.load_table({ref, len}); + } + + return(ref); +} + 
/*********************************************************************//**
Removes MySQL comments from an SQL string. A comment is either
(a) '#' to the end of the line,
(b) '--[space]' to the end of the line, or
(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar
C comment syntax).
@return own: SQL string stripped from comments; the caller must free
this with ut_free()! */
static
char*
dict_strip_comments(
/*================*/
	const char*	sql_string,	/*!< in: SQL string */
	size_t		sql_length)	/*!< in: length of sql_string */
{
	char*		str;
	const char*	sptr;
	const char*	eptr	= sql_string + sql_length;
	char*		ptr;
	/* unclosed quote character (0 if none) */
	char		quote	= 0;
	bool		escape = false;

	DBUG_ENTER("dict_strip_comments");

	DBUG_PRINT("dict_strip_comments", ("%s", sql_string));

	/* The output is never longer than the input; one extra byte is
	reserved for the terminating NUL. */
	str = static_cast<char*>(ut_malloc_nokey(sql_length + 1));

	sptr = sql_string;
	ptr = str;

	for (;;) {
scan_more:
		if (sptr >= eptr || *sptr == '\0') {
end_of_string:
			*ptr = '\0';

			ut_a(ptr <= str + sql_length);

			DBUG_PRINT("dict_strip_comments", ("%s", str));
			DBUG_RETURN(str);
		}

		if (*sptr == quote) {
			/* Closing quote character: do not look for
			starting quote or comments. */

			/* If the quote character is escaped by a
			backslash, ignore it. */
			if (escape) {
				escape = false;
			} else {
				quote = 0;
			}
		} else if (quote) {
			/* Within quotes: do not look for
			starting quotes or comments. */
			if (escape) {
				escape = false;
			} else if (*sptr == '\\') {
				escape = true;
			}
		} else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
			/* Starting quote: remember the quote character. */
			quote = *sptr;
		/* NOTE(review): sptr[1] and sptr[2] are read without being
		compared against eptr; presumably this relies on the
		statement text being NUL-terminated - confirm. */
		} else if (*sptr == '#'
			   || (sptr[0] == '-' && sptr[1] == '-'
			       && sptr[2] == ' ')) {
			/* Line comment: skip up to, but not including, the
			line terminator; the terminator itself is then
			processed by the main loop. */
			for (;;) {
				if (++sptr >= eptr) {
					goto end_of_string;
				}

				/* In Unix a newline is 0x0A while in Windows
				it is 0x0D followed by 0x0A */

				switch (*sptr) {
				case (char) 0X0A:
				case (char) 0x0D:
				case '\0':
					goto scan_more;
				}
			}
		} else if (!quote && *sptr == '/' && *(sptr + 1) == '*') {
			/* Block comment: skip through the closing
			asterisk-slash pair, or to the end of input if it
			is never closed. */
			sptr += 2;
			for (;;) {
				if (sptr >= eptr) {
					goto end_of_string;
				}

				switch (*sptr) {
				case '\0':
					goto scan_more;
				case '*':
					if (sptr[1] == '/') {
						sptr += 2;
						goto scan_more;
					}
				}

				sptr++;
			}
		}

		/* Not inside a comment: copy the character through. */
		*ptr = *sptr;

		ptr++;
		sptr++;
	}
}

/*********************************************************************//**
Finds the highest [number] for foreign key constraints of the table. Looks
only at the >= 4.0.18-format id's, which are of the form
databasename/tablename_ibfk_[number].
@return highest number, 0 if table has no new format foreign key constraints */
ulint
dict_table_get_highest_foreign_id(
/*==============================*/
	dict_table_t*	table)		/*!< in: table in the dictionary memory cache */
{
	dict_foreign_t*	foreign;
	char*		endp;
	ulint		biggest_id	= 0;
	ulint		id;
	ulint		len;

	DBUG_ENTER("dict_table_get_highest_foreign_id");

	ut_a(table);

	len = strlen(table->name.m_name);

	for (dict_foreign_set::iterator it = table->foreign_set.begin();
	     it != table->foreign_set.end();
	     ++it) {
		char	fkid[MAX_TABLE_NAME_LEN * 2 + 20];
		foreign = *it;

		/* Bounded copy of the id; terminated explicitly because
		strncpy() does not terminate on truncation. */
		strncpy(fkid, foreign->id, (sizeof fkid) - 1);
		fkid[(sizeof fkid) - 1] = '\0';
		/* Convert foreign key identifier on dictionary memory
		cache to filename charset. */
		/* NOTE(review): strchr(...) + 1 assumes the id contains
		a '/' - confirm that ids without a database prefix cannot
		reach this point. */
		innobase_convert_to_filename_charset(
				strchr(fkid, '/') + 1,
				strchr(foreign->id, '/') + 1,
				MAX_TABLE_NAME_LEN);

		/* The id must be the table name, followed by dict_ibfk,
		followed by a number that does not start with '0'. */
		if (strlen(fkid) > ((sizeof dict_ibfk) - 1) + len
		    && 0 == memcmp(fkid, table->name.m_name, len)
		    && 0 == memcmp(fkid + len,
				   dict_ibfk, (sizeof dict_ibfk) - 1)
		    && fkid[len + ((sizeof dict_ibfk) - 1)] != '0') {
			/* It is of the >= 4.0.18 format */

			id = strtoul(fkid + len
				     + ((sizeof dict_ibfk) - 1),
				     &endp, 10);
			/* Only ids whose whole suffix parsed as a number
			are considered. */
			if (*endp == '\0') {
				ut_a(id != biggest_id);

				if (id > biggest_id) {
					biggest_id = id;
				}
			}
		}
	}

	DBUG_PRINT("dict_table_get_highest_foreign_id",
		   ("id: " ULINTPF, biggest_id));

	DBUG_RETURN(biggest_id);
}

/**********************************************************************//**
Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
constraint id does not match */
dberr_t
dict_foreign_parse_drop_constraints(
/*================================*/
	mem_heap_t*	heap,			/*!< in: heap from which we can
						allocate memory */
	trx_t*		trx,			/*!< in: transaction */
	dict_table_t*	table,			/*!< in: table */
	ulint*		n,			/*!< out: number of constraints
						to drop */
	const char***	constraints_to_drop)	/*!< out: id's of the
						constraints to drop */
{
	ibool			success;
	char*			str;
	size_t			len;
	const char*		ptr;
	const char*		ptr1;
	const char*		id;
	CHARSET_INFO*		cs;

	ut_a(trx->mysql_thd);

	cs = thd_charset(trx->mysql_thd);

	*n = 0;

	/* Fixed capacity of 1000 ids; enforced by the ut_a() before
	each store below. */
	*constraints_to_drop = static_cast<const char**>(
		mem_heap_alloc(heap, 1000 * sizeof(char*)));

	ptr = innobase_get_stmt_unsafe(trx->mysql_thd, &len);

	/* Work on a comment-free copy, which must be released with
	ut_free() on every return path. */
	str = dict_strip_comments(ptr, len);

	ptr = str;

	ut_ad(dict_sys.locked());
loop:
	ptr = dict_scan_to(ptr, "DROP");

	if (*ptr == '\0') {
		/* No further DROP keyword: done. */
		ut_free(str);

		return(DB_SUCCESS);
	}

	ptr = dict_accept(cs, ptr, "DROP", &success);

	if (!my_isspace(cs, *ptr)) {

		goto loop;
	}

	ptr = dict_accept(cs, ptr, "FOREIGN", &success);

	if (!success || !my_isspace(cs, *ptr)) {

		goto loop;
	}

	ptr = dict_accept(cs, ptr, "KEY", &success);

	if (!success) {

		goto syntax_error;
	}

	/* Optional IF EXISTS: consumed only when both words are
	present; otherwise ptr is left where it was. */
	ptr1 = dict_accept(cs, ptr, "IF", &success);

	if (success && my_isspace(cs, *ptr1)) {
		ptr1 = dict_accept(cs, ptr1, "EXISTS", &success);
		if (success) {
			ptr = ptr1;
		}
	}

	ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE);

	if (id == NULL) {

		goto syntax_error;
	}

	ut_a(*n < 1000);
	(*constraints_to_drop)[*n] = id;
	(*n)++;

	/* The id must name a constraint in the foreign_set of the
	table. */
	if (std::find_if(table->foreign_set.begin(),
			 table->foreign_set.end(),
			 dict_foreign_matches_id(id))
	    == table->foreign_set.end()) {

		if (!srv_read_only_mode) {
			FILE*	ef = dict_foreign_err_file;

			mysql_mutex_lock(&dict_foreign_err_mutex);
			rewind(ef);
			ut_print_timestamp(ef);
			fputs(" Error in dropping of a foreign key"
			      " constraint of table ", ef);
			ut_print_name(ef, NULL, table->name.m_name);
			fprintf(ef, ",\nin SQL command\n%s"
				"\nCannot find a constraint with the"
				" given id %s.\n", str, id);
			mysql_mutex_unlock(&dict_foreign_err_mutex);
		}

		ut_free(str);

		return(DB_CANNOT_DROP_CONSTRAINT);
	}

	goto loop;

syntax_error:
	if (!srv_read_only_mode) {
		FILE*	ef = dict_foreign_err_file;

		mysql_mutex_lock(&dict_foreign_err_mutex);
		rewind(ef);
		ut_print_timestamp(ef);
		fputs(" Syntax error in dropping of a"
		      " foreign key constraint of table ", ef);
		ut_print_name(ef, NULL, table->name.m_name);
		fprintf(ef, ",\n"
			"close to:\n%s\n in SQL command\n%s\n", ptr, str);
		mysql_mutex_unlock(&dict_foreign_err_mutex);
	}

	ut_free(str);

	return(DB_CANNOT_DROP_CONSTRAINT);
}

/*==================== END OF FOREIGN KEY PROCESSING ====================*/

/**********************************************************************//**
Returns an index object if it is found in the dictionary cache.
Assumes that dict_sys.latch is already being held.
+@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ +{ + ut_ad(dict_sys.frozen()); + + for (dict_table_t *table= UT_LIST_GET_FIRST(dict_sys.table_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + for (dict_table_t *table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + return nullptr; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ +{ + dict_index_t* index; + + if (!dict_sys.is_initialised()) { + return(NULL); + } + + dict_sys.freeze(SRW_LOCK_CALL); + + index = dict_index_get_if_in_cache_low(index_id); + + dict_sys.unfreeze(); + + return(index); +} + +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ +{ + ut_ad(dtuple_get_n_fields_cmp(tuple) + <= dict_index_get_n_unique_in_tree(index)); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. 
+@return own: node pointer */ +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + ulint n_unique; + + if (dict_index_is_ibuf(index)) { + /* In a universal index tree, we take the whole record as + the node pointer if the record is on the leaf level, + on non-leaf levels we remove the last field, which + contains the page number of the child page */ + + ut_a(!dict_table_is_comp(index->table)); + n_unique = rec_get_n_fields_old(rec); + + if (level > 0) { + ut_a(n_unique > 1); + n_unique--; + } + } else { + n_unique = dict_index_get_n_unique_in_tree_nonleaf(index); + } + + tuple = dtuple_create(heap, n_unique + 1); + + /* When searching in the tree for the node pointer, we must not do + comparison on the last field, the page number field, as on upper + levels in the tree there may be identical node pointers with a + different page number; therefore, we set the n_fields_cmp to one + less: */ + + dtuple_set_n_fields_cmp(tuple, n_unique); + + dict_index_copy_types(tuple, index, n_unique); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + level ? 0 : index->n_core_fields, + n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_STATUS_NODE_PTR); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/** Convert a physical record into a search tuple. 
+@param[in] rec index record (not necessarily in an index page) +@param[in] index index +@param[in] leaf whether rec is in a leaf page +@param[in] n_fields number of data fields +@param[in,out] heap memory heap for allocation +@return own: data tuple */ +dtuple_t* +dict_index_build_data_tuple( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + ulint n_fields, + mem_heap_t* heap) +{ + ut_ad(!index->is_clust()); + + dtuple_t* tuple = dtuple_create(heap, n_fields); + + dict_index_copy_types(tuple, index, n_fields); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + leaf ? n_fields : 0, n_fields, heap); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/*********************************************************************//** +Calculates the minimum record length in an index. */ +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint sum = 0; + ulint i; + ulint comp = dict_table_is_comp(index->table); + + if (comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + ulint size = dict_col_get_fixed_size(col, comp); + sum += size; + if (!size) { + size = col->len; + sum += size < 128 ? 1 : 2; + } + if (!(col->prtype & DATA_NOT_NULL)) { + nullable++; + } + } + + /* round the NULL flags up to full bytes */ + sum += UT_BITS_IN_BYTES(nullable); + + return(sum); + } + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + sum += dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), comp); + } + + if (sum > 127) { + sum += 2 * dict_index_get_n_fields(index); + } else { + sum += dict_index_get_n_fields(index); + } + + sum += REC_N_OLD_EXTRA_BYTES; + + return(sum); +} + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. 
*/ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline) /*!< in: whether to add a newline */ +{ + const char* stripped_id; + ulint i; + std::string str; + + if (strchr(foreign->id, '/')) { + /* Strip the preceding database name from the constraint id */ + stripped_id = foreign->id + 1 + + dict_get_db_name_len(foreign->id); + } else { + stripped_id = foreign->id; + } + + str.append(","); + + if (add_newline) { + /* SHOW CREATE TABLE wants constraints each printed nicely + on its own line, while error messages want no newlines + inserted. */ + str.append("\n "); + } + + str.append(" CONSTRAINT "); + + str.append(innobase_quote_identifier(trx, stripped_id)); + str.append(" FOREIGN KEY ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, foreign->foreign_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(") REFERENCES "); + + if (dict_tables_have_same_db(foreign->foreign_table_name_lookup, + foreign->referenced_table_name_lookup)) { + /* Do not print the database name of the referenced table */ + str.append(ut_get_name(trx, + dict_remove_db_name( + foreign->referenced_table_name))); + } else { + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + } + + str.append(" ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, + foreign->referenced_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(")"); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & 
DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + + return str; +} + +/**********************************************************************//** +Outputs info on foreign keys of a table. */ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table */ +{ + dict_foreign_t* foreign; + std::string str; + + dict_sys.freeze(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (create_table_format) { + str.append( + dict_print_info_on_foreign_key_in_create_format( + trx, foreign, TRUE)); + } else { + ulint i; + str.append("; ("); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + + str.append(innobase_quote_identifier(trx, + foreign->foreign_col_names[i])); + } + + str.append(") REFER "); + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + str.append(")"); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + str.append(innobase_quote_identifier( + trx, + foreign->referenced_col_names[i])); + } + + str.append(")"); + + if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + 
if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + } + } + + dict_sys.unfreeze(); + return str; +} + +/**********************************************************************//** +Flags an index corrupted both in the data dictionary cache +and in the SYS_INDEXES */ +void dict_set_corrupted(dict_index_t *index, const char *ctx) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + const char* status; + btr_cur_t cursor; + + dict_sys.lock(SRW_LOCK_CALL); + + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + /* Mark the table as corrupted only if the clustered index + is corrupted */ + if (dict_index_is_clust(index)) { + index->table->corrupted = TRUE; + goto func_exit; + } + + if (index->type & DICT_CORRUPT) { + /* The index was already flagged corrupted. 
*/ + ut_ad(!dict_index_is_clust(index) || index->table->corrupted); + goto func_exit; + } + + /* If this is read only mode, do not update SYS_INDEXES, just + mark it as corrupted in memory */ + if (high_level_read_only) { + index->type |= DICT_CORRUPT; + goto func_exit; + } + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + mtr_start(&mtr); + index->type |= DICT_CORRUPT; + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_LE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto fail; + } + + if (cursor.low_match == dtuple_get_n_fields(tuple)) { + /* UPDATE SYS_INDEXES SET TYPE=index->type + WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */ + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto fail; + } + mtr.write<4>(*btr_cur_get_block(&cursor), field, index->type); + status = "Flagged"; + } else { +fail: + status = "Unable to flag"; + } + + mtr_commit(&mtr); + mem_heap_free(heap); + ib::error() << status << " corruption of " << index->name + << " in table " << index->table->name << " in " << ctx; + +func_exit: + dict_sys.unlock(); +} + +/** Sets merge_threshold in the SYS_INDEXES +@param[in,out] index index +@param[in] merge_threshold value to set */ +void +dict_index_set_merge_threshold( + dict_index_t* index, + ulint 
merge_threshold) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + btr_cur_t cursor; + + ut_ad(index != NULL); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + + mtr.start(); + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto func_exit; + } + + if (cursor.up_match == dtuple_get_n_fields(tuple) + && rec_get_n_fields_old(btr_cur_get_rec(&cursor)) + == DICT_NUM_FIELDS__SYS_INDEXES) { + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + + ut_ad(len == 4); + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_cur_get_block(&cursor), + field, merge_threshold); + } + +func_exit: + mtr_commit(&mtr); + mem_heap_free(heap); +} + +#ifdef UNIV_DEBUG +/** Sets merge_threshold for all indexes in the list of tables +@param[in] list pointer to the list of tables */ +inline +void +dict_set_merge_threshold_list_debug( + UT_LIST_BASE_NODE_T(dict_table_t)* list, + uint merge_threshold_all) +{ + for (dict_table_t* table = UT_LIST_GET_FIRST(*list); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + for (dict_index_t* index = 
UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold = merge_threshold_all + & ((1U << 6) - 1); + index->lock.x_unlock(); + } + } +} + +/** Sets merge_threshold for all indexes in dictionary cache for debug. +@param[in] merge_threshold_all value to set for all indexes */ +void +dict_set_merge_threshold_all_debug( + uint merge_threshold_all) +{ + dict_sys.freeze(SRW_LOCK_CALL); + + dict_set_merge_threshold_list_debug( + &dict_sys.table_LRU, merge_threshold_all); + dict_set_merge_threshold_list_debug( + &dict_sys.table_non_LRU, merge_threshold_all); + + dict_sys.unfreeze(); +} + +#endif /* UNIV_DEBUG */ + +/** Get an index by name. +@param[in] table the table where to look for the index +@param[in] name the index name to look for +@return index, NULL if does not exist */ +dict_index_t* +dict_table_get_index_on_name(dict_table_t* table, const char* name) +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (index->is_committed() && !strcmp(index->name, name)) { + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. 
+@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ +{ + bool found = true; + dict_foreign_t* foreign; + + ut_ad(index->to_be_dropped); + ut_ad(index->table == table); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + if (foreign->foreign_index == index) { + ut_ad(foreign->foreign_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->foreign_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->foreign_index = new_index; + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + if (foreign->referenced_index == index) { + ut_ad(foreign->referenced_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + /* There must exist an alternative index, + since this must have been checked earlier. 
*/ + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->referenced_index = new_index; + } + } + + return(found); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ +{ + /* Check for duplicates, ignoring indexes that are marked + as to be dropped */ + + const dict_index_t* index1; + const dict_index_t* index2; + + ut_ad(dict_sys.frozen()); + + /* The primary index _must_ exist */ + ut_a(UT_LIST_GET_LEN(table->indexes) > 0); + + index1 = UT_LIST_GET_FIRST(table->indexes); + + do { + if (!index1->is_committed()) { + ut_a(!dict_index_is_clust(index1)); + + switch (check) { + case CHECK_ALL_COMPLETE: + ut_error; + case CHECK_ABORTED_OK: + switch (dict_index_get_online_status(index1)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + ut_error; + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + /* fall through */ + case CHECK_PARTIAL_OK: + break; + } + } + + for (index2 = UT_LIST_GET_NEXT(indexes, index1); + index2 != NULL; + index2 = UT_LIST_GET_NEXT(indexes, index2)) { + ut_ad(index1->is_committed() + != index2->is_committed() + || strcmp(index1->name, index2->name) != 0); + } + + index1 = UT_LIST_GET_NEXT(indexes, index1); + } while (index1); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two +strings in UTF8 encoding (e.g. dцb and aюbØc). 
The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g. aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ +{ + char db[MAX_DATABASE_NAME_LEN + 1]; + ulint db_len; + uint errors; + + db_len = dict_get_db_name_len(db_and_table); + + ut_a(db_len <= sizeof(db)); + + memcpy(db, db_and_table, db_len); + db[db_len] = '\0'; + + strconvert( + &my_charset_filename, db, uint(db_len), system_charset_info, + db_utf8, uint(db_utf8_size), &errors); + + /* convert each # to @0023 in table name and store the result in buf */ + const char* table = dict_remove_db_name(db_and_table); + const char* table_p; + char buf[MAX_TABLE_NAME_LEN * 5 + 1]; + char* buf_p; + for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) { + if (table_p[0] != '#') { + buf_p[0] = table_p[0]; + buf_p++; + } else { + buf_p[0] = '@'; + buf_p[1] = '0'; + buf_p[2] = '0'; + buf_p[3] = '2'; + buf_p[4] = '3'; + buf_p += 5; + } + ut_a((size_t) (buf_p - buf) < sizeof(buf)); + } + buf_p[0] = '\0'; + + errors = 0; + strconvert( + &my_charset_filename, buf, (uint) (buf_p - buf), + system_charset_info, + table_utf8, uint(table_utf8_size), + &errors); + + if (errors != 0) { + snprintf(table_utf8, table_utf8_size, "%s%s", + srv_mysql50_table_name_prefix, table); + } +} + +/** Resize the hash tables based on the current buffer pool size. 
*/ +void dict_sys_t::resize() +{ + ut_ad(this == &dict_sys); + ut_ad(is_initialised()); + lock(SRW_LOCK_CALL); + + /* all table entries are in table_LRU and table_non_LRU lists */ + table_hash.free(); + table_id_hash.free(); + temp_id_hash.free(); + + const ulint hash_size = buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); + table_hash.create(hash_size); + table_id_hash.create(hash_size); + temp_id_hash.create(hash_size); + + for (dict_table_t *table= UT_LIST_GET_FIRST(table_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ut_ad(!table->is_temporary()); + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + HASH_INSERT(dict_table_t, id_hash, &table_id_hash, id_fold, table); + } + + for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + hash_table_t *id_hash= table->is_temporary() + ? &temp_id_hash : &table_id_hash; + + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); + } + + unlock(); +} + +/** Close the data dictionary cache on shutdown. */ +void dict_sys_t::close() +{ + ut_ad(this == &dict_sys); + if (!is_initialised()) return; + + lock(SRW_LOCK_CALL); + + /* Free the hash elements. We don't remove them from table_hash + because we are invoking table_hash.free() below. */ + for (ulint i= table_hash.n_cells; i--; ) + while (dict_table_t *table= static_cast<dict_table_t*> + (HASH_GET_FIRST(&table_hash, i))) + dict_sys.remove(table); + + table_hash.free(); + + /* table_id_hash contains the same elements as in table_hash, + therefore we don't delete the individual elements. 
*/ + table_id_hash.free(); + + /* No temporary tables should exist at this point. */ + temp_id_hash.free(); + + unlock(); + latch.destroy(); + + mysql_mutex_destroy(&dict_foreign_err_mutex); + + if (dict_foreign_err_file) + { + my_fclose(dict_foreign_err_file, MYF(MY_WME)); + dict_foreign_err_file = NULL; + } + + m_initialised= false; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if valid */ +static +ibool +dict_lru_validate(void) +/*===================*/ +{ + dict_table_t* table; + + ut_ad(dict_sys.frozen()); + + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(table->can_be_evicted); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(!table->can_be_evicted); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Check an index to see whether its first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return true if the index qualifies, otherwise false */ +bool +dict_foreign_qualify_index( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + if (dict_index_get_n_fields(index) < n_cols) { + return(false); + } + + if (!index->is_btree()) { + return false; + } + + if (index->online_status >= ONLINE_INDEX_ABORTED) { + return false; + } + + for (ulint i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + ulint col_no; + + field = dict_index_get_nth_field(index, i); + col_no = dict_col_get_no(field->col); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + if (error && err_col_no && err_index) { + *error = FK_IS_PREFIX_INDEX; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + if (error && err_col_no && err_index) { + *error = FK_COL_NOT_NULL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (field->col->is_virtual()) { + col_name = ""; + for (ulint j = 0; j < table->n_v_def; j++) { + col_name = dict_table_get_v_col_name(table, j); + if (innobase_strcasecmp(field->name,col_name) == 0) { + break; + } + } + } else { + col_name = col_names + ? 
col_names[col_no] + : dict_table_get_col_name(table, col_no); + } + + if (0 != innobase_strcasecmp(columns[i], col_name)) { + return(false); + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, i), + check_charsets)) { + if (error && err_col_no && err_index) { + *error = FK_COLS_NOT_EQUAL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + + return(false); + } + } + + return(true); +} + +/*********************************************************************//** +Update the state of compression failure padding heuristics. This is +called whenever a compression operation succeeds or fails. +The caller must be holding info->mutex */ +static +void +dict_index_zip_pad_update( +/*======================*/ + zip_pad_info_t* info, /*<! in/out: info to be updated */ + ulint zip_threshold) /*<! in: zip threshold value */ +{ + ulint total; + ulint fail_pct; + + ut_ad(info); + ut_ad(info->pad % ZIP_PAD_INCR == 0); + + total = info->success + info->failure; + + ut_ad(total > 0); + + if (zip_threshold == 0) { + /* User has just disabled the padding. */ + return; + } + + if (total < ZIP_PAD_ROUND_LEN) { + /* We are in middle of a round. Do nothing. */ + return; + } + + /* We are at a 'round' boundary. Reset the values but first + calculate fail rate for our heuristic. */ + fail_pct = (info->failure * 100) / total; + info->failure = 0; + info->success = 0; + + if (fail_pct > zip_threshold) { + /* Compression failures are more then user defined + threshold. Increase the pad size to reduce chances of + compression failures. + + Only do increment if it won't increase padding + beyond max pad size. */ + if (info->pad + ZIP_PAD_INCR + < (srv_page_size * zip_pad_max) / 100) { + info->pad.fetch_add(ZIP_PAD_INCR); + + MONITOR_INC(MONITOR_PAD_INCREMENTS); + } + + info->n_rounds = 0; + + } else { + /* Failure rate was OK. Another successful round + completed. 
*/ + ++info->n_rounds; + + /* If enough successful rounds are completed with + compression failure rate in control, decrease the + padding. */ + if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT + && info->pad > 0) { + info->pad.fetch_sub(ZIP_PAD_INCR); + + info->n_rounds = 0; + + MONITOR_INC(MONITOR_PAD_DECREMENTS); + } + } +} + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.success; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.failure; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +Return the optimal page size, for which page will likely compress. +@return page size beyond which page might not compress */ +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ +{ + ulint pad; + ulint min_sz; + ulint sz; + + if (!zip_failure_threshold_pct) { + /* Disabled by user. 
*/ + return(srv_page_size); + } + + pad = index->zip_pad.pad; + + ut_ad(pad < srv_page_size); + sz = srv_page_size - pad; + + /* Min size allowed by user. */ + ut_ad(zip_pad_max < 100); + min_sz = (srv_page_size * (100 - zip_pad_max)) / 100; + + return(ut_max(sz, min_sz)); +} + +/*************************************************************//** +Convert table flag to row format string. +@return row format name. */ +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag) /*!< in: row format setting */ +{ + switch (dict_tf_get_rec_format(table_flag)) { + case REC_FORMAT_REDUNDANT: + return("ROW_TYPE_REDUNDANT"); + case REC_FORMAT_COMPACT: + return("ROW_TYPE_COMPACT"); + case REC_FORMAT_COMPRESSED: + return("ROW_TYPE_COMPRESSED"); + case REC_FORMAT_DYNAMIC: + return("ROW_TYPE_DYNAMIC"); + } + + ut_error; + return(0); +} diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc new file mode 100644 index 00000000..f769839d --- /dev/null +++ b/storage/innobase/dict/dict0load.cc @@ -0,0 +1,3213 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0load.cc +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0load.h" + +#include "log.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0stats.h" +#include "fsp0file.h" +#include "fts0priv.h" +#include "mach0data.h" +#include "page0page.h" +#include "rem0cmp.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "fts0opt.h" +#include "row0vers.h" + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. + +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. +@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables); + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. 
+@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index); /*!< out,own: index, or NULL */ + +/** Load a table column definition from a SYS_COLUMNS record to dict_table_t. +@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col); + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** column, + table_id_t* 
table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec); + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec); /*!< in: SYS_FIELDS record */ + +#ifdef UNIV_DEBUG +/****************************************************************//** +Compare the name of an index column. +@return TRUE if the i'th column of index is 'name'. */ +static +ibool +name_of_col_is( +/*===========*/ + const dict_table_t* table, /*!< in: table */ + const dict_index_t* index, /*!< in: index */ + ulint i, /*!< in: index field offset */ + const char* name) /*!< in: name to compare to */ +{ + ulint tmp = dict_col_get_no(dict_field_get_col( + dict_index_get_nth_field( + index, i))); + + return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +This function gets the next system table record as it scans the table. 
+@return the next user record, or NULL once the end of the index is reached
+(the cursor is closed in that case) */
+static
+const rec_t*
+dict_getnext_system_low(
+/*====================*/
+	btr_pcur_t*	pcur,		/*!< in/out: persistent cursor to the
+					record*/
+	mtr_t*		mtr)		/*!< in: the mini-transaction */
+{
+	for (;;) {
+		btr_pcur_move_to_next_user_rec(pcur, mtr);
+
+		rec_t*	cur = btr_pcur_get_rec(pcur);
+
+		if (!btr_pcur_is_on_user_rec(pcur)) {
+			/* Ran off the end of the index: the scan is over. */
+			btr_pcur_close(pcur);
+			return(NULL);
+		}
+
+		if (cur) {
+			/* Remember where we are so that the caller can
+			resume the scan from this record. */
+			btr_pcur_store_position(pcur, mtr);
+			return(cur);
+		}
+	}
+}
+
+/********************************************************************//**
+Open a scan on the first index of a system table and fetch the first
+record that is not delete-marked.
+@return first record of the system table, or nullptr if the index could
+not be opened or holds no such record */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+	btr_pcur_t*	pcur,		/*!< out: persistent cursor to
+					the record */
+	mtr_t*		mtr,		/*!< in: the mini-transaction */
+	dict_table_t*	table)		/*!< in: system table */
+{
+	btr_pcur_init(pcur);
+
+	if (pcur->open_leaf(true, table->indexes.start, BTR_SEARCH_LEAF, mtr)
+	    == DB_SUCCESS) {
+		/* Step forward until a record that is not delete-marked
+		turns up, or the scan ends. */
+		for (;;) {
+			const rec_t*	rec = dict_getnext_system_low(pcur,
+								      mtr);
+			if (!rec || !rec_get_deleted_flag(rec, 0)) {
+				return rec;
+			}
+		}
+	}
+
+	return nullptr;
+}
+
+/********************************************************************//**
+This function gets the next system table record as it scans the table.
+@return the next record that is not delete-marked, or NULL at the end of
+the scan or when the cursor position cannot be restored */
+const rec_t*
+dict_getnext_system(
+/*================*/
+	btr_pcur_t*	pcur,		/*!< in/out: persistent cursor
+					to the record */
+	mtr_t*		mtr)		/*!< in: the mini-transaction */
+{
+	if (pcur->restore_position(BTR_SEARCH_LEAF, mtr)
+	    == btr_pcur_t::CORRUPTED) {
+		return nullptr;
+	}
+
+	/* Skip over delete-marked records. */
+	for (;;) {
+		const rec_t*	rec = dict_getnext_system_low(pcur, mtr);
+		if (!rec || !rec_get_deleted_flag(rec, 0)) {
+			return rec;
+		}
+	}
+}
+
+/********************************************************************//**
+Parse a SYS_INDEXES record into a caller-provided dict_index_t and report
+the id of the table the index belongs to. For detailed information
+about SYS_INDEXES fields, please refer to dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_INDEXES rec */
+	dict_index_t*	index,		/*!< out: index to be filled */
+	table_id_t*	table_id)	/*!< out: index table id */
+{
+	/* NOTE(review): id_buf is not initialised here; presumably
+	dict_load_index_low() fills it even when it reports an error -
+	confirm. */
+	byte	id_buf[8];
+
+	ut_d(index->is_dummy = true);
+	ut_d(index->in_instant_init = false);
+
+	/* No mini-transaction and no table are passed: the record is
+	parsed into the pre-allocated *index. */
+	const char* const	msg = dict_load_index_low(
+		id_buf, false, heap, rec, nullptr, nullptr, &index);
+
+	*table_id = mach_read_from_8(id_buf);
+	return msg;
+}
+
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populate a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */ +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + ulint* nth_v_col) /*!< out: if virtual col, this is + record's sequence number */ +{ + const char* err_msg; + + /* Parse the record, and get "dict_col_t" struct filled */ + err_msg = dict_load_column_low(NULL, 0, heap, column, + table_id, col_name, rec, nullptr, + nth_v_col); + + return(err_msg); +} + +/** This function parses a SYS_VIRTUAL record and extracts virtual column +information +@param[in] rec current SYS_COLUMNS rec +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@return error message, or NULL on success */ +const char* +dict_process_sys_virtual_rec( + const rec_t* rec, + table_id_t* table_id, + ulint* pos, + ulint* base_pos) +{ + return dict_load_virtual_low(nullptr, false, nullptr, table_id, + pos, base_pos, rec); +} + +/********************************************************************//** +This function parses a SYS_FIELDS record and populates a dict_field_t +structure with the information from the record. 
+@return error message, or NULL on success */ +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id) /*!< in: previous index id */ +{ + byte buf[8]; + byte last_index_id[8]; + const char* err_msg; + + mach_write_to_8(last_index_id, last_id); + + err_msg = dict_load_field_low(buf, false, nullptr, sys_field, + pos, last_index_id, heap, nullptr, rec); + + *index_id = mach_read_from_8(buf); + + return(err_msg); + +} + +/********************************************************************//** +This function parses a SYS_FOREIGN record and populate a dict_foreign_t +structure with the information from the record. For detail information +about SYS_FOREIGN fields, please refer to dict_load_foreign() function. +@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign) /*!< out: dict_foreign_t struct + to be filled */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) { + return("wrong number of columns in SYS_FOREIGN record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN"); + } + + /* This receives a dict_foreign_t* that points to a stack variable. + So dict_foreign_free(foreign) is not used as elsewhere. + Since the heap used here is freed elsewhere, foreign->heap + is not assigned. 
*/ + foreign->id = mem_heap_strdupl(heap, (const char*) field, len); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + /* The _lookup versions of the referenced and foreign table names + are not assigned since they are not used in this dict_foreign_t */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->foreign_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->referenced_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len); + if (len != 4) { + goto err_len; + } + uint32_t n_fields_and_type = mach_read_from_4(field); + + foreign->type = n_fields_and_type >> 24 & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extract necessary +information from the record and return to caller. 
+@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos) /*!< out: column position */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN_COLS"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) { + return("wrong number of columns in SYS_FOREIGN_COLS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN_COLS"); + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + if (len != 4) { + goto err_len; + } + *pos = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *for_col_name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *ref_col_name = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/** Check the validity of a SYS_TABLES record 
+Make sure the fields are the right length and that they +do not contain invalid contents. +@param[in] rec SYS_TABLES record +@return error message, or NULL on success */ +static +const char* +dict_sys_tables_rec_check( + const rec_t* rec) +{ + const byte* field; + ulint len; + + ut_ad(dict_sys.locked()); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) { + return("wrong number of columns in SYS_TABLES record"); + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_TABLES"); + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len); + if (len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__MIX_ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len); + if (len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + return(NULL); +} + +/** Check if SYS_TABLES.TYPE is valid +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return whether the SYS_TABLES.TYPE value is 
valid */ +static +bool +dict_sys_tables_type_valid(ulint type, bool not_redundant) +{ + /* The DATA_DIRECTORY flag can be assigned fully independently + of all other persistent table flags. */ + type &= ~DICT_TF_MASK_DATA_DIR; + + if (type == 1) { + return(true); /* ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT */ + } + + if (!(type & 1)) { + /* For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + SYS_TABLES.TYPE=1. Else, it is the same as + dict_table_t::flags, and the least significant bit + would be set. So, the bit never can be 0. */ + return(false); + } + + if (!not_redundant) { + /* SYS_TABLES.TYPE must be 1 or 1|DICT_TF_MASK_NO_ROLLBACK + for ROW_FORMAT=REDUNDANT. */ + return !(type & ~(1U | DICT_TF_MASK_NO_ROLLBACK)); + } + + if (type >= 1U << DICT_TF_POS_UNUSED) { + /* Some unknown bits are set. */ + return(false); + } + + return(dict_tf_is_valid_not_redundant(type)); +} + +/** Convert SYS_TABLES.TYPE to dict_table_t::flags. +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return table flags */ +static +uint32_t dict_sys_tables_type_to_tf(uint32_t type, bool not_redundant) +{ + ut_ad(dict_sys_tables_type_valid(type, not_redundant)); + uint32_t flags = not_redundant ? 1 : 0; + + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL are the same. */ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); + + ut_ad(dict_tf_is_valid(flags)); + return(flags); +} + +/** Outcome of dict_sys_tables_rec_read() */ +enum table_read_status { READ_OK= 0, READ_ERROR, READ_NOT_FOUND }; + +/** Read and return 5 integer fields from a SYS_TABLES record. 
+@param[in] rec A record of SYS_TABLES +@param[in] uncommitted true=use READ UNCOMMITTED, false=READ COMMITTED +@param[in] mtr mini-transaction +@param[out] table_id Pointer to the table_id for this table +@param[out] space_id Pointer to the space_id for this table +@param[out] n_cols Pointer to number of columns for this table. +@param[out] flags Pointer to table flags +@param[out] flags2 Pointer to table flags2 +@param[out] trx_id DB_TRX_ID of the committed SYS_TABLES record, + or nullptr to perform READ UNCOMMITTED +@return whether the record was read correctly */ +MY_ATTRIBUTE((warn_unused_result)) +static +table_read_status +dict_sys_tables_rec_read( + const rec_t* rec, + bool uncommitted, + mtr_t* mtr, + table_id_t* table_id, + uint32_t* space_id, + uint32_t* n_cols, + uint32_t* flags, + uint32_t* flags2, + trx_id_t* trx_id) +{ + const byte* field; + ulint len; + mem_heap_t* heap = nullptr; + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + ut_ad(len == 6 || len == UNIV_SQL_NULL); + trx_id_t id = len == 6 ? 
trx_read_trx_id(field) : 0; + if (id && !uncommitted && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr->get_savepoint(); + heap = mem_heap_create(1024); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_tables->indexes); + rec_offs* offsets = rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec) { + mem_heap_free(heap); + return READ_NOT_FOUND; + } + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (UNIV_UNLIKELY(len != 6)) { + mem_heap_free(heap); + return READ_ERROR; + } + id = trx_read_trx_id(field); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + if (trx_id) { + return READ_NOT_FOUND; + } + } + + if (trx_id) { + *trx_id = id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__ID, &len); + ut_ad(len == 8); + *table_id = static_cast<table_id_t>(mach_read_from_8(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + ut_ad(len == 4); + *space_id = mach_read_from_4(field); + + /* Read the 4 byte flags from the TYPE field */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_a(len == 4); + uint32_t type = mach_read_from_4(field); + + /* Handle MDEV-12873 InnoDB SYS_TABLES.TYPE incompatibility + for PAGE_COMPRESSED=YES in MariaDB 10.2.2 to 10.2.6. + + MariaDB 10.2.2 introduced the SHARED_SPACE flag from MySQL 5.7, + shifting the flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL, + ATOMIC_WRITES (repurposed to NO_ROLLBACK in 10.3.1) by one bit. + The SHARED_SPACE flag would always + be written as 0 by MariaDB, because MariaDB does not support + CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB. 
+ + So, instead of the bits AALLLLCxxxxxxx we would have + AALLLLC0xxxxxxx if the table was created with MariaDB 10.2.2 + to 10.2.6. (AA=ATOMIC_WRITES, LLLL=PAGE_COMPRESSION_LEVEL, + C=PAGE_COMPRESSED, xxxxxxx=7 bits that were not moved.) + + The case LLLLC=00000 is not a problem. The problem is the case + AALLLL10DB00001 where D is the (mostly ignored) DATA_DIRECTORY + flag and B is the ATOMIC_BLOBS flag (1 for ROW_FORMAT=DYNAMIC + and 0 for ROW_FORMAT=COMPACT in this case). Other low-order + bits must be so, because PAGE_COMPRESSED=YES is only allowed + for ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPACT, not for + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPRESSED. + + Starting with MariaDB 10.2.4, the flags would be + 00LLLL10DB00001, because ATOMIC_WRITES is always written as 0. + + We will concentrate on the PAGE_COMPRESSION_LEVEL and + PAGE_COMPRESSED=YES. PAGE_COMPRESSED=NO implies + PAGE_COMPRESSION_LEVEL=0, and in that case all the affected + bits will be 0. For PAGE_COMPRESSED=YES, the values 1..9 are + allowed for PAGE_COMPRESSION_LEVEL. That is, we must interpret + the bits AALLLL10DB00001 as AALLLL1DB00001. + + If someone created a table in MariaDB 10.2.2 or 10.2.3 with + the attribute ATOMIC_WRITES=OFF (value 2) and without + PAGE_COMPRESSED=YES or PAGE_COMPRESSION_LEVEL, that should be + rejected. The value ATOMIC_WRITES=ON (1) would look like + ATOMIC_WRITES=OFF, but it would be ignored starting with + MariaDB 10.2.4. */ + compile_time_assert(DICT_TF_POS_PAGE_COMPRESSION == 7); + compile_time_assert(DICT_TF_POS_UNUSED == 14); + + if ((type & 0x19f) != 0x101) { + /* The table cannot have been created with MariaDB + 10.2.2 to 10.2.6, because they would write the + low-order bits of SYS_TABLES.TYPE as 0b10xx00001 for + PAGE_COMPRESSED=YES. No adjustment is applicable. */ + } else if (type >= 3 << 13) { + /* 10.2.2 and 10.2.3 write ATOMIC_WRITES less than 3, + and no other flags above that can be set for the + SYS_TABLES.TYPE to be in the 10.2.2..10.2.6 format. 
+ This would in any case be invalid format for 10.2 and + earlier releases. */ + ut_ad(!dict_sys_tables_type_valid(type, true)); + } else { + /* SYS_TABLES.TYPE is of the form AALLLL10DB00001. We + must still validate that the LLLL bits are between 0 + and 9 before we can discard the extraneous 0 bit. */ + ut_ad(!DICT_TF_GET_PAGE_COMPRESSION(type)); + + if ((((type >> 9) & 0xf) - 1) < 9) { + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) & 1); + + type = (type & 0x7fU) | (type >> 1 & ~0x7fU); + + ut_ad(DICT_TF_GET_PAGE_COMPRESSION(type)); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) >= 1); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) <= 9); + } else { + ut_ad(!dict_sys_tables_type_valid(type, true)); + } + } + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in + dict_table_t::flags the low order bit is used to determine if the + ROW_FORMAT=REDUNDANT (0) or anything else (1). + Read the 4 byte N_COLS field and look at the high order bit. It + should be set for COMPACT and later. It should not be set for + REDUNDANT. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + ut_a(len == 4); + *n_cols = mach_read_from_4(field); + + const bool not_redundant = 0 != (*n_cols & DICT_N_COLS_COMPACT); + + if (!dict_sys_tables_type_valid(type, not_redundant)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary contains invalid flags." + " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.N_COLS=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), rec, + type, *n_cols); +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return READ_ERROR; + } + + *flags = dict_sys_tables_type_to_tf(type, not_redundant); + + /* For tables created before MySQL 4.1, there may be + garbage in SYS_TABLES.MIX_LEN where flags2 are found. Such tables + would always be in ROW_FORMAT=REDUNDANT which do not have the + high bit set in n_cols, and flags would be zero. 
+ MySQL 4.1 was the first version to support innodb_file_per_table, + that is, *space_id != 0. */ + if (not_redundant || *space_id != 0 || *n_cols & DICT_N_COLS_COMPACT + || fil_system.sys_space->full_crc32()) { + + /* Get flags2 from SYS_TABLES.MIX_LEN */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + *flags2 = mach_read_from_4(field); + + if (!dict_tf2_is_valid(*flags, *flags2)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary" + " contains invalid flags." + " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.MIX_LEN=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), + rec, + type, *flags2); + goto err_exit; + } + + /* DICT_TF2_FTS will be set when indexes are being loaded */ + *flags2 &= ~DICT_TF2_FTS; + + /* Now that we have used this bit, unset it. */ + *n_cols &= ~DICT_N_COLS_COMPACT; + } else { + *flags2 = 0; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return READ_OK; +} + +/** Check each tablespace found in the data dictionary. +Then look at each table defined in SYS_TABLES that has a space_id > 0 +to find all the file-per-table tablespaces. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. 
*/
void dict_check_tablespaces_and_store_max_id()
{
  uint32_t    max_space_id = 0;
  btr_pcur_t  pcur;
  mtr_t       mtr;

  DBUG_ENTER("dict_check_tablespaces_and_store_max_id");

  mtr.start();

  dict_sys.lock(SRW_LOCK_CALL);

  for (const rec_t *rec = dict_startscan_system(&pcur, &mtr,
                                                dict_sys.sys_tables);
       rec; rec = dict_getnext_system_low(&pcur, &mtr)) {
    ulint       len;
    table_id_t  table_id;
    uint32_t    space_id;
    uint32_t    n_cols;
    uint32_t    flags;
    uint32_t    flags2;

    /* If a table record is not useable, ignore it and continue
    on to the next record. dict_sys_tables_rec_check() only
    returns a message; nothing is logged for such records. */
    if (dict_sys_tables_rec_check(rec)) {
      continue;
    }

    /* The name is not NUL terminated; it is len bytes long. */
    const char *field = reinterpret_cast<const char*>(
      rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME,
                            &len));

    /* "%.*s" limits the output to len bytes. (The previous
    "%*.s" meant width=len with precision 0, which prints
    only padding and never the name.) */
    DBUG_PRINT("dict_check_sys_tables",
               ("name: %.*s", static_cast<int>(len), field));

    /* Skip records that cannot be read and tables that
    reside in the system tablespace. */
    if (dict_sys_tables_rec_read(rec, false,
                                 &mtr, &table_id, &space_id,
                                 &n_cols, &flags, &flags2, nullptr)
        != READ_OK
        || space_id == TRX_SYS_SPACE) {
      continue;
    }

    if (flags2 & DICT_TF2_DISCARDED) {
      sql_print_information("InnoDB: Ignoring tablespace"
                            " for %.*s because "
                            "the DISCARD flag is set",
                            static_cast<int>(len), field);
      continue;
    }

    /* For tables or partitions using .ibd files, the flag
    DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN
    before MySQL 5.6.5. The flag should not have been
    introduced in persistent storage. MariaDB will keep
    setting the flag when writing SYS_TABLES entries for
    newly created or rebuilt tables or partitions, but
    will otherwise ignore the flag. */

    if (fil_space_for_table_exists_in_mem(space_id, flags)) {
      continue;
    }

    const span<const char> name{field, len};

    char* filepath = fil_make_filepath(nullptr, name,
                                       IBD, false);

    const bool not_dropped{!rec_get_deleted_flag(rec, 0)};

    /* Check that the .ibd file exists. */
    if (fil_ibd_open(not_dropped, FIL_TYPE_TABLESPACE,
                     space_id, dict_tf_to_fsp_flags(flags),
                     name, filepath)) {
      /* The file was opened; nothing to report. */
    } else if (!not_dropped) {
      /* The record is delete-marked; stay silent. */
    } else if (srv_operation == SRV_OPERATION_NORMAL
               && srv_start_after_restore
               && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
               && dict_table_t::is_temporary_name(filepath)) {
      /* Mariabackup will not copy files whose
      names start with #sql-. This table ought to
      be dropped by drop_garbage_tables_after_restore()
      a little later. */
    } else {
      sql_print_warning("InnoDB: Ignoring tablespace for"
                        " %.*s because it"
                        " could not be opened.",
                        static_cast<int>(len), field);
    }

    max_space_id = ut_max(max_space_id, space_id);

    ut_free(filepath);
  }

  mtr.commit();

  fil_set_max_space_id_if_bigger(max_space_id);

  dict_sys.unlock();

  DBUG_VOID_RETURN;
}

/** Error message for a delete-marked record in dict_load_column_low() */
static const char *dict_load_column_del= "delete-marked record in SYS_COLUMNS";
/** Error message for a missing record in dict_load_column_low() */
static const char *dict_load_column_none= "SYS_COLUMNS record not found";
/** Message for incomplete instant ADD/DROP in dict_load_column_low() */
static const char *dict_load_column_instant= "incomplete instant ADD/DROP";

/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
+@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col) +{ + char* name; + const byte* field; + ulint len; + ulint mtype; + ulint prtype; + ulint col_len; + ulint pos; + ulint num_base; + + ut_ad(!table == !!column); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) { + return("wrong number of columns in SYS_COLUMNS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_COLUMNS"); + } + + if (table_id) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_column_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && mtr && use_uncommitted < 2 + && trx_sys.find(nullptr, trx_id, false)) { + if (use_uncommitted) { + return dict_load_column_instant; + } + const auto savepoint = mtr->get_savepoint(); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_columns->indexes); + rec_offs* offsets = 
rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers) { + return dict_load_column_none; + } + ut_ad(!rec_get_deleted_flag(rec, 0)); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(trx_id); + return dict_load_column_del; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + *col_name = name = mem_heap_strdupl(heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len); + if (len != 4) { + goto err_len; + } + + mtype = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len); + if (len != 4) { + goto err_len; + } + prtype = mach_read_from_4(field); + + if (dtype_get_charset_coll(prtype) == 0 + && dtype_is_string_type(mtype)) { + /* The table was created with < 4.1.2. */ + + if (dtype_is_binary_string_type(mtype, prtype)) { + /* Use the binary collation for + string columns of binary type. */ + + prtype = dtype_form_prtype( + prtype, + DATA_MYSQL_BINARY_CHARSET_COLL); + } else { + /* Use the default charset for + other than binary columns. 
*/ + + prtype = dtype_form_prtype( + prtype, + data_mysql_default_charset_coll); + } + } + + if (table && table->n_def != pos && !(prtype & DATA_VIRTUAL)) { + return("SYS_COLUMNS.POS mismatch"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__LEN, &len); + if (len != 4) { + goto err_len; + } + col_len = mach_read_from_4(field); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PREC, &len); + if (len != 4) { + goto err_len; + } + num_base = mach_read_from_4(field); + + if (table) { + if (prtype & DATA_VIRTUAL) { +#ifdef UNIV_DEBUG + dict_v_col_t* vcol = +#endif + dict_mem_table_add_v_col( + table, heap, name, mtype, + prtype, col_len, + dict_get_v_col_mysql_pos(pos), num_base); + ut_ad(vcol->v_pos == dict_get_v_col_pos(pos)); + } else { + ut_ad(num_base == 0); + dict_mem_table_add_col(table, heap, name, mtype, + prtype, col_len); + } + + if (trx_id > table->def_trx_id) { + table->def_trx_id = trx_id; + } + } else { + dict_mem_fill_column_struct(column, pos, mtype, + prtype, col_len); + } + + /* Report the virtual column number */ + if ((prtype & DATA_VIRTUAL) && nth_v_col != NULL) { + *nth_v_col = dict_get_v_col_pos(pos); + } + + return(NULL); +} + +/** Error message for a delete-marked record in dict_load_virtual_low() */ +static const char *dict_load_virtual_del= "delete-marked record in SYS_VIRTUAL"; +static const char *dict_load_virtual_none= "SYS_VIRTUAL record not found"; + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** 
column, + table_id_t* table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec) +{ + const byte* field; + ulint len; + ulint base; + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_VIRTUAL) { + return("wrong number of columns in SYS_VIRTUAL record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_VIRTUAL"); + } + + if (table_id != NULL) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_virtual_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__POS, &len); + if (len != 4) { + goto err_len; + } + + if (pos != NULL) { + *pos = mach_read_from_4(field); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__BASE_POS, &len); + if (len != 4) { + goto err_len; + } + + base = mach_read_from_4(field); + + if (base_pos != NULL) { + *base_pos = base; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && column && !uncommitted + && trx_sys.find(nullptr, trx_id, false)) { + if (!rec_get_deleted_flag(rec, 0)) { + return dict_load_virtual_none; + } + } else if (rec_get_deleted_flag(rec, 0)) { + ut_ad(trx_id != 0); + return dict_load_virtual_del; + } + + if (column != NULL) { + *column = dict_table_get_nth_col(table, base); + } + + return(NULL); +} + +/** Load the definitions for table columns. 
+@param table table +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@retval DB_SUCCESS on success +@retval DB_SUCCESS_LOCKED_REC on success if use_uncommitted=1 +and instant ADD/DROP/reorder was detected */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_columns(dict_table_t *table, unsigned use_uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + ulint n_skipped = 0; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_columns->indexes.start; + ut_ad(!dict_sys.sys_columns->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__PREC, "PREC")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + ut_ad(table->n_t_cols == static_cast<ulint>( + table->n_cols) + static_cast<ulint>(table->n_v_cols)); + + for (ulint i = 0; + i + DATA_N_SYS_COLS < table->n_t_cols + n_skipped; + i++) { + const char* err_msg; + const char* name = NULL; + ulint nth_v_col = ULINT_UNDEFINED; + const rec_t* rec = btr_pcur_get_rec(&pcur); + + err_msg = btr_pcur_is_on_user_rec(&pcur) + ? 
dict_load_column_low(table, use_uncommitted, + heap, NULL, NULL, + &name, rec, &mtr, &nth_v_col) + : dict_load_column_none; + + if (!err_msg) { + } else if (err_msg == dict_load_column_del) { + n_skipped++; + goto next_rec; + } else if (err_msg == dict_load_column_instant) { + err = DB_SUCCESS_LOCKED_REC; + goto func_exit; + } else if (err_msg == dict_load_column_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + goto func_exit; + } + + /* Note: Currently we have one DOC_ID column that is + shared by all FTS indexes on a table. And only non-virtual + column can be used for FULLTEXT index */ + if (innobase_strcasecmp(name, + FTS_DOC_ID_COL_NAME) == 0 + && nth_v_col == ULINT_UNDEFINED) { + dict_col_t* col; + /* As part of normal loading of tables the + table->flag is not set for tables with FTS + till after the FTS indexes are loaded. So we + create the fts_t instance here if there isn't + one already created. + + This case does not arise for table create as + the flag is set before the table is created. 
*/ + if (table->fts == NULL) { + table->fts = fts_create(table); + table->fts->cache = fts_cache_create(table); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + } + + ut_a(table->fts->doc_col == ULINT_UNDEFINED); + + col = dict_table_get_nth_col(table, i - n_skipped); + + ut_ad(col->len == sizeof(doc_id_t)); + + if (col->prtype & DATA_FTS_DOC_ID) { + DICT_TF2_FLAG_SET( + table, DICT_TF2_FTS_HAS_DOC_ID); + DICT_TF2_FLAG_UNSET( + table, DICT_TF2_FTS_ADD_DOC_ID); + } + + table->fts->doc_col = i - n_skipped; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads SYS_VIRTUAL info for one virtual column +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param nth_v_col virtual column position */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +dberr_t +dict_load_virtual_col(dict_table_t *table, bool uncommitted, ulint nth_v_col) +{ + const dict_v_col_t* v_col = dict_table_get_nth_v_col(table, nth_v_col); + + if (v_col->num_base == 0) { + return DB_SUCCESS; + } + + dict_index_t* sys_virtual_index; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_virtual_index = dict_sys.sys_virtual->indexes.start; + ut_ad(!dict_sys.sys_virtual->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_virtual, sys_virtual_index, + DICT_FLD__SYS_VIRTUAL__POS, "POS")); + + dfield_t dfield[2]; + dtuple_t tuple{ + 0,2,2,dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8], vcol_pos[4]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield[0], table_id, 8); + mach_write_to_4(vcol_pos, + dict_create_v_col_pos(nth_v_col, v_col->m_col.ind)); + dfield_set_data(&dfield[1], vcol_pos, 4); + + dict_index_copy_types(&tuple, sys_virtual_index, 2); + pcur.btr_cur.page_cur.index = sys_virtual_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err 
!= DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0, skipped = 0; + i < unsigned{v_col->num_base} + skipped; i++) { + ulint pos; + const char* err_msg + = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_virtual_low(table, uncommitted, + &v_col->base_col[i - skipped], + NULL, + &pos, NULL, + btr_pcur_get_rec(&pcur)) + : dict_load_virtual_none; + + if (!err_msg) { + ut_ad(pos == mach_read_from_4(vcol_pos)); + } else if (err_msg == dict_load_virtual_del) { + skipped++; + } else if (err_msg == dict_load_virtual_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads info from SYS_VIRTUAL for virtual columns. +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_virtual(dict_table_t *table, bool uncommitted) +{ + for (ulint i= 0; i < table->n_v_cols; i++) + if (dberr_t err= dict_load_virtual_col(table, uncommitted, i)) + return err; + return DB_SUCCESS; +} + +/** Error message for a delete-marked record in dict_load_field_low() */ +static const char *dict_load_field_del= "delete-marked record in SYS_FIELDS"; + +static const char *dict_load_field_none= "SYS_FIELDS record not found"; + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t. 
+@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec) /*!< in: SYS_FIELDS record */ +{ + const byte* field; + ulint len; + unsigned pos_and_prefix_len; + unsigned prefix_len; + bool descending; + bool first_field; + ulint position; + + /* Either index or sys_field is supplied, not both */ + ut_ad((!index) != (!sys_field)); + ut_ad((!index) == !mtr); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) { + return("wrong number of columns in SYS_FIELDS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_FIELDS"); + } + + if (!index) { + ut_a(last_index_id); + memcpy(index_id, (const char*) field, 8); + first_field = memcmp(index_id, last_index_id, 8); + } else { + first_field = (index->n_def == 0); + if (memcmp(field, index_id, 8)) { + return dict_load_field_none; + } + } + + /* The next field stores the field position in the index and a + possible column prefix length if the index field does not + contain the whole column. The storage format is like this: if + there is at least one prefix field in the index, then the HIGH + 2 bytes contain the field number (index->n_def) and the low 2 + bytes the prefix length for the field. 
Otherwise the field + number (index->n_def) is contained in the 2 LOW bytes. */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos_and_prefix_len = mach_read_from_4(field); + + if (index && UNIV_UNLIKELY + ((pos_and_prefix_len & 0xFFFFUL) != index->n_def + && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) { + return("SYS_FIELDS.POS mismatch"); + } + + if (first_field || pos_and_prefix_len > 0xFFFFUL) { + prefix_len = pos_and_prefix_len & 0x7FFFUL; + descending = (pos_and_prefix_len & 0x8000UL); + position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16; + } else { + prefix_len = 0; + descending = false; + position = pos_and_prefix_len & 0xFFFFUL; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_field = UT_LIST_GET_FIRST( + dict_sys.sys_fields->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_field, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_field, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_field_none; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + return(dict_load_field_del); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (index) 
{ + dict_mem_index_add_field( + index, mem_heap_strdupl(heap, (const char*) field, len), + prefix_len, descending); + } else { + sys_field->name = mem_heap_strdupl( + heap, (const char*) field, len); + sys_field->prefix_len = prefix_len & ((1U << 12) - 1); + sys_field->descending = descending; + *pos = position; + } + + return(NULL); +} + +/** +Load definitions for index fields. +@param index index whose fields are to be loaded +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@return DB_SUCCESS if the fields were loaded successfully */ +static dberr_t dict_load_fields(dict_index_t *index, bool uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_fields->indexes.start; + ut_ad(!dict_sys.sys_fields->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_fields, sys_index, + DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte index_id[8]; + mach_write_to_8(index_id, index->id); + dfield_set_data(&dfield, index_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0; i < index->n_fields; i++) { + const char *err_msg = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_field_low(index_id, uncommitted, index, + nullptr, nullptr, nullptr, + heap, &mtr, + btr_pcur_get_rec(&pcur)) + : dict_load_field_none; + + if (!err_msg) { + } else if (err_msg == dict_load_field_del) { + /* There could be delete marked records in + SYS_FIELDS because SYS_FIELDS.INDEX_ID can be + updated by ALTER TABLE ADD INDEX. 
*/ + } else { + if (err_msg != dict_load_field_none + || strstr(index->table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + ib::error() << err_msg << " for index " + << index->name + << " of table " + << index->table->name; + } + error = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Error message for a delete-marked record in dict_load_index_low() */ +static const char *dict_load_index_del= "delete-marked record in SYS_INDEXES"; +/** Error message for table->id mismatch in dict_load_index_low() */ +static const char *dict_load_index_none= "SYS_INDEXES record not found"; +/** Error message for SYS_TABLES flags mismatch in dict_load_table_low() */ +static const char *dict_load_table_flags= "incorrect flags in SYS_TABLES"; + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index) /*!< out,own: index, or NULL */ +{ + const byte* field; + ulint len; + index_id_t id; + ulint n_fields; + ulint type; + unsigned merge_threshold; + + if (mtr) { + *index = NULL; + } + + if (rec_get_n_fields_old(rec) == DICT_NUM_FIELDS__SYS_INDEXES) { + /* MERGE_THRESHOLD exists */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + switch (len) { + case 4: + merge_threshold = mach_read_from_4(field); + break; + case UNIV_SQL_NULL: + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + 
break; + default: + return("incorrect MERGE_THRESHOLD length" + " in SYS_INDEXES"); + } + } else if (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1) { + /* MERGE_THRESHOLD doesn't exist */ + + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } else { + return("wrong number of columns in SYS_INDEXES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_INDEXES"); + } + + if (!mtr) { + /* We are reading a SYS_INDEXES record. Copy the table_id */ + memcpy(table_id, (const char*) field, 8); + } else if (memcmp(field, table_id, 8)) { + /* Caller supplied table_id, verify it is the same + id as on the index record */ + return dict_load_index_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__ID, &len); + if (len != 8) { + goto err_len; + } + + id = mach_read_from_8(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_index = UT_LIST_GET_FIRST( + dict_sys.sys_indexes->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_index_none; + } + } else if 
(rec_get_deleted_flag(rec, 0) + && rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + != static_cast<byte>(*TEMP_INDEX_PREFIX_STR) + && table->def_trx_id < trx_id) { + table->def_trx_id = trx_id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len); + if (len != 4) { + goto err_len; + } + n_fields = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto err_len; + } + type = mach_read_from_4(field); + if (type & (~0U << DICT_IT_BITS)) { + return("unknown SYS_INDEXES.TYPE bits"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto err_len; + } + + ut_d(const auto name_offs =) + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_INDEXES__NAME, &len); + ut_ad(name_offs == 8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (rec_get_deleted_flag(rec, 0)) { + return dict_load_index_del; + } + + char* name = mem_heap_strdupl(heap, reinterpret_cast<const char*>(rec) + + (8 + 8 + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN), + len); + + if (mtr) { + *index = dict_mem_index_create(table, name, type, n_fields); + } else { + dict_mem_fill_index_struct(*index, nullptr, name, + type, n_fields); + } + + (*index)->id = id; + (*index)->page = mach_read_from_4(field); + ut_ad((*index)->page); + (*index)->merge_threshold = merge_threshold & ((1U << 6) - 1); + + return(NULL); +} + +/** Load definitions for table indexes. Adds them to the data dictionary cache. 
+@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param ignore_err errors to be ignored when loading the index definition +@return error code +@retval DB_SUCCESS if all indexes were successfully loaded +@retval DB_CORRUPTION if corruption of dictionary table +@retval DB_UNSUPPORTED if table has unknown index type */ +static MY_ATTRIBUTE((nonnull)) +dberr_t dict_load_indexes(dict_table_t *table, bool uncommitted, + mem_heap_t *heap, dict_err_ignore_t ignore_err) +{ + dict_index_t* sys_index; + btr_pcur_t pcur; + byte table_id[8]; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_index = dict_sys.sys_indexes->indexes.start; + ut_ad(!dict_sys.sys_indexes->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + while (btr_pcur_is_on_user_rec(&pcur)) { + dict_index_t* index = NULL; + const char* err_msg; + const rec_t* rec = btr_pcur_get_rec(&pcur); + if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. 
*/ + || rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1)) { + const byte* field; + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__NAME, &len); + + if (len != UNIV_SQL_NULL + && static_cast<char>(*field) + == static_cast<char>(*TEMP_INDEX_PREFIX_STR)) { + /* Skip indexes whose name starts with + TEMP_INDEX_PREFIX_STR, because they will + be dropped by row_merge_drop_temp_indexes() + during crash recovery. */ + goto next_rec; + } + } + + err_msg = dict_load_index_low(table_id, uncommitted, heap, rec, + &mtr, table, &index); + ut_ad(!index == !!err_msg); + + if (err_msg == dict_load_index_none) { + /* We have ran out of index definitions for + the table. */ + break; + } + + if (err_msg == dict_load_index_del) { + goto next_rec; + } else if (err_msg) { + ib::error() << err_msg; + if (ignore_err & DICT_ERR_IGNORE_INDEX) { + goto next_rec; + } + error = DB_CORRUPTION; + goto func_exit; + } else if (rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) { + dict_mem_index_free(index); + goto next_rec; + } else { + const trx_id_t id = trx_read_trx_id(rec + 8 + 8); + if (id > table->def_trx_id) { + table->def_trx_id = id; + } + } + + ut_ad(index); + ut_ad(!dict_index_is_online_ddl(index)); + + /* Check whether the index is corrupted */ + if (ignore_err != DICT_ERR_IGNORE_DROP + && index->is_corrupted() && index->is_clust()) { + dict_mem_index_free(index); + error = DB_TABLE_CORRUPT; + goto func_exit; + } + + if (index->type & DICT_FTS + && !dict_table_has_fts_index(table)) { + /* This should have been created by now. */ + ut_a(table->fts != NULL); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS); + } + + /* We check for unsupported types first, so that the + subsequent checks are relevant for the supported types. 
*/ + if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE + | DICT_CORRUPT | DICT_FTS + | DICT_SPATIAL | DICT_VIRTUAL)) { + + ib::error() << "Unknown type " << index->type + << " of index " << index->name + << " of table " << table->name; + + error = DB_UNSUPPORTED; + dict_mem_index_free(index); + goto func_exit; + } else if (index->page == FIL_NULL + && table->is_readable() + && (!(index->type & DICT_FTS))) { + if (!uncommitted + && ignore_err != DICT_ERR_IGNORE_DROP) { + ib::error_or_warn(!(ignore_err + & DICT_ERR_IGNORE_INDEX)) + << "Index " << index->name + << " for table " << table->name + << " has been freed!"; + } + + if (!(ignore_err & DICT_ERR_IGNORE_INDEX)) { +corrupted: + dict_mem_index_free(index); + error = DB_CORRUPTION; + goto func_exit; + } + /* If caller can tolerate this error, + we will continue to load the index and + let caller deal with this error. However + mark the index and table corrupted. We + only need to mark such in the index + dictionary cache for such metadata corruption, + since we would always be able to set it + when loading the dictionary cache */ + if (index->is_clust()) { + index->table->corrupted = true; + index->table->file_unreadable = true; + } + index->type |= DICT_CORRUPT; + } else if (!dict_index_is_clust(index) + && NULL == dict_table_get_first_index(table)) { + + ib::error() << "Trying to load index " << index->name + << " for table " << table->name + << ", but the first index is not clustered!"; + + goto corrupted; + } else if (dict_is_sys_table(table->id) + && (dict_index_is_clust(index) + || ((table == dict_sys.sys_tables) + && !strcmp("ID_IND", index->name)))) { + + /* The index was created in memory already at booting + of the database server */ + dict_mem_index_free(index); + } else { + error = dict_load_fields(index, uncommitted, heap); + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* The data dictionary tables should never contain + invalid index definitions. 
If we ignored this error + and simply did not load this index definition, the + .frm file would disagree with the index definitions + inside InnoDB. */ + if ((error = dict_index_add_to_cache(index, + index->page)) + != DB_SUCCESS) { + goto func_exit; + } + +#ifdef UNIV_DEBUG + // The following assertion doesn't hold for FTS indexes + // as it may have prefix_len=1 with any charset + if (index->type != DICT_FTS) { + for (uint i = 0; i < index->n_fields; i++) { + dict_field_t &f = index->fields[i]; + ut_ad(f.col->mbmaxlen == 0 + || f.prefix_len + % f.col->mbmaxlen == 0); + } + } +#endif /* UNIV_DEBUG */ + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + if (!dict_table_get_first_index(table) + && !(ignore_err & DICT_ERR_IGNORE_INDEX)) { + ib::warn() << "No indexes found for table " << table->name; + error = DB_CORRUPTION; + goto func_exit; + } + + ut_ad(table->fts_doc_id_index == NULL); + + if (table->fts != NULL) { + dict_index_t *idx = dict_table_get_index_on_name( + table, FTS_DOC_ID_INDEX_NAME); + if (idx && dict_index_is_unique(idx)) { + table->fts_doc_id_index = idx; + } + } + + /* If the table contains FTS indexes, populate table->fts->indexes */ + if (dict_table_has_fts_index(table)) { + ut_ad(table->fts_doc_id_index != NULL); + /* table->fts->indexes should have been created. */ + ut_a(table->fts->indexes != NULL); + dict_table_get_all_fts_indexes(table, table->fts->indexes); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Load a table definition from a SYS_TABLES record to dict_table_t. +Do not load any columns or indexes. 
+@param[in,out] mtr mini-transaction +@param[in] uncommitted whether to use READ UNCOMMITTED isolation level +@param[in] rec SYS_TABLES record +@param[out,own] table table, or nullptr +@return error message +@retval nullptr on success */ +const char *dict_load_table_low(mtr_t *mtr, bool uncommitted, + const rec_t *rec, dict_table_t **table) +{ + table_id_t table_id; + uint32_t space_id, t_num, flags, flags2; + ulint n_cols, n_v_col; + trx_id_t trx_id; + + if (const char* error_text = dict_sys_tables_rec_check(rec)) { + *table = NULL; + return(error_text); + } + + if (auto r = dict_sys_tables_rec_read(rec, uncommitted, mtr, + &table_id, &space_id, + &t_num, &flags, &flags2, + &trx_id)) { + *table = NULL; + return r == READ_ERROR ? dict_load_table_flags : nullptr; + } + + dict_table_decode_n_col(t_num, &n_cols, &n_v_col); + + *table = dict_table_t::create( + span<const char>(reinterpret_cast<const char*>(rec), + rec_get_field_start_offs(rec, 1)), + nullptr, n_cols + n_v_col, n_v_col, flags, flags2); + (*table)->space_id = space_id; + (*table)->id = table_id; + (*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED); + (*table)->def_trx_id = trx_id; + return(NULL); +} + +/** Make sure the data_file_name is saved in dict_table_t if needed. 
+@param[in,out] table Table object */ +void dict_get_and_save_data_dir_path(dict_table_t *table) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space || table->space->id == table->space_id); + + if (!table->data_dir_path && table->space_id && table->space) + { + const char *filepath= table->space->chain.start->name; + if (strncmp(fil_path_to_mysql_datadir, filepath, + strlen(fil_path_to_mysql_datadir))) + { + table->lock_mutex_lock(); + table->flags|= 1 << DICT_TF_POS_DATA_DIR & ((1U << DICT_TF_BITS) - 1); + table->data_dir_path= mem_heap_strdup(table->heap, filepath); + os_file_make_data_dir_path(table->data_dir_path); + table->lock_mutex_unlock(); + } + } +} + +/** Opens a tablespace for dict_load_table_one() +@param[in,out] table A table that refers to the tablespace to open +@param[in] ignore_err Whether to ignore an error. */ +UNIV_INLINE +void +dict_load_tablespace( + dict_table_t* table, + dict_err_ignore_t ignore_err) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_ad(fil_system.sys_space); + + if (table->space_id == TRX_SYS_SPACE) { + table->space = fil_system.sys_space; + return; + } + + if (table->flags2 & DICT_TF2_DISCARDED) { + ib::warn() << "Tablespace for table " << table->name + << " is set as discarded."; + table->file_unreadable = true; + return; + } + + /* The tablespace may already be open. */ + table->space = fil_space_for_table_exists_in_mem(table->space_id, + table->flags); + if (table->space) { + return; + } + + if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) { + table->file_unreadable = true; + return; + } + + if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { + ib::error() << "Failed to find tablespace for table " + << table->name << " in the cache. Attempting" + " to load the tablespace with space id " + << table->space_id; + } + + /* Use the remote filepath if needed. This parameter is optional + in the call to fil_ibd_open(). 
If not supplied, it will be built + from the table->name. */ + char* filepath = NULL; + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + /* This will set table->data_dir_path from fil_system */ + dict_get_and_save_data_dir_path(table); + + if (table->data_dir_path) { + filepath = fil_make_filepath( + table->data_dir_path, table->name, IBD, true); + } + } + + table->space = fil_ibd_open( + 2, FIL_TYPE_TABLESPACE, table->space_id, + dict_tf_to_fsp_flags(table->flags), + {table->name.m_name, strlen(table->name.m_name)}, filepath); + + if (!table->space) { + /* We failed to find a sensible tablespace file */ + table->file_unreadable = true; + } + + ut_free(filepath); +} + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. + +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. 
+@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables) +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_table_one"); + DBUG_PRINT("dict_load_table_one", + ("table: %.*s", int(name.size()), name.data())); + + ut_ad(dict_sys.locked()); + + dict_index_t *sys_index = dict_sys.sys_tables->indexes.start; + ut_ad(!dict_sys.sys_tables->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__ID, "ID")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__N_COLS, "N_COLS")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__TYPE, "TYPE")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__SPACE, "SPACE")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, name.data(), name.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + bool uncommitted = false; +reload: + mtr.start(); + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + + if (err != DB_SUCCESS || !btr_pcur_is_on_user_rec(&pcur)) { + /* Not found */ +err_exit: + mtr.commit(); + DBUG_RETURN(nullptr); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + + /* Check if the table name in record is the searched one */ + if (rec_get_field_start_offs(rec, 1) != name.size() + || memcmp(name.data(), rec, name.size())) { + goto err_exit; + } + + dict_table_t* table; + if (const char* err_msg = + dict_load_table_low(&mtr, uncommitted, rec, &table)) { + if (err_msg != dict_load_table_flags) { + ib::error() << err_msg; + } + goto err_exit; + } + 
if (!table) { + goto err_exit; + } + + const unsigned use_uncommitted = uncommitted + ? 2 + : table->id == mach_read_from_8( + rec + rec_get_field_start_offs( + rec, DICT_FLD__SYS_TABLES__ID)); + + mtr.commit(); + + mem_heap_t* heap = mem_heap_create(32000); + + dict_load_tablespace(table, ignore_err); + + switch (dict_load_columns(table, use_uncommitted, heap)) { + case DB_SUCCESS_LOCKED_REC: + ut_ad(!uncommitted); + uncommitted = true; + dict_mem_table_free(table); + mem_heap_free(heap); + goto reload; + case DB_SUCCESS: + if (!dict_load_virtual(table, uncommitted)) { + break; + } + /* fall through */ + default: + dict_mem_table_free(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + dict_table_add_system_columns(table, heap); + + table->can_be_evicted = true; + table->add_to_cache(); + + mem_heap_empty(heap); + + ut_ad(dict_tf2_is_valid(table->flags, table->flags2)); + + /* If there is no tablespace for the table then we only need to + load the index definitions. So that we can IMPORT the tablespace + later. When recovering table locks for resurrected incomplete + transactions, the tablespace should exist, because DDL operations + were not allowed while the table is being locked by a transaction. */ + dict_err_ignore_t index_load_err = + !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && !table->is_readable() + ? 
DICT_ERR_IGNORE_ALL + : ignore_err; + + err = dict_load_indexes(table, uncommitted, heap, index_load_err); + + if (err == DB_TABLE_CORRUPT) { + /* Refuse to load the table if the table has a corrupted + cluster index */ + ut_ad(index_load_err != DICT_ERR_IGNORE_DROP); + ib::error() << "Refusing to load corrupted table " + << table->name; +evict: + dict_sys.remove(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + if (err != DB_SUCCESS || !table->is_readable()) { + } else if (dict_index_t* pk = dict_table_get_first_index(table)) { + ut_ad(pk->is_primary()); + if (pk->is_corrupted() + || pk->page >= table->space->get_size()) { +corrupted: + table->corrupted = true; + table->file_unreadable = true; + err = DB_TABLE_CORRUPT; + } else if (table->space->id + && ignore_err == DICT_ERR_IGNORE_DROP) { + /* Do not bother to load data from .ibd files + only to delete the .ibd files. */ + goto corrupted; + } else { + const page_id_t page_id{table->space->id, pk->page}; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id, table->space->zip_size(), + RW_S_LATCH, &mtr); + const bool corrupted = !block + || page_get_space_id(block->page.frame) + != page_id.space() + || page_get_page_no(block->page.frame) + != page_id.page_no() + || (mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_INDEX + && mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_TYPE_INSTANT); + mtr.commit(); + if (corrupted) { + goto corrupted; + } + + if (table->supports_instant()) { + err = btr_cur_instant_init(table); + } + } + } else { + ut_ad(ignore_err & DICT_ERR_IGNORE_INDEX); + if (ignore_err != DICT_ERR_IGNORE_DROP) { + err = DB_CORRUPTION; + goto evict; + } + } + + /* Initialize table foreign_child value. Its value could be + changed when dict_load_foreigns() is called below */ + table->fk_max_recusive_level = 0; + + /* We will load the foreign key information only if + all indexes were loaded. 
*/ + if (!table->is_readable()) { + /* Don't attempt to load the indexes from disk. */ + } else if (err == DB_SUCCESS) { + err = dict_load_foreigns(table->name.m_name, nullptr, + 0, true, ignore_err, fk_tables); + + if (err != DB_SUCCESS) { + ib::warn() << "Load table " << table->name + << " failed, the table has missing" + " foreign key indexes. Turn off" + " 'foreign_key_checks' and try again."; + goto evict; + } else { + dict_mem_table_fill_foreign_vcol_set(table); + table->fk_max_recusive_level = 0; + } + } + + mem_heap_free(heap); + + ut_ad(!table + || (ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) + || !table->is_readable() + || !table->corrupted); + + if (table && table->fts) { + if (!(dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) { + /* the table->fts could be created in dict_load_column + when a user defined FTS_DOC_ID is present, but no + FTS */ + table->fts->~fts_t(); + table->fts = nullptr; + } else if (fts_optimize_wq) { + fts_optimize_add_table(table); + } else if (table->can_be_evicted) { + /* fts_optimize_thread is not started yet. + So make the table as non-evictable from cache. */ + dict_sys.prevent_eviction(table); + } + } + + ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table)); + + DBUG_RETURN(table); +} + +dict_table_t *dict_sys_t::load_table(const span<const char> &name, + dict_err_ignore_t ignore) +{ + if (dict_table_t *table= find_table(name)) + return table; + dict_names_t fk_list; + dict_table_t *table= dict_load_table_one(name, ignore, fk_list); + while (!fk_list.empty()) + { + const char *f= fk_list.front(); + const span<const char> name{f, strlen(f)}; + if (!find_table(name)) + dict_load_table_one(name, ignore, fk_list); + fk_list.pop_front(); + } + + return table; +} + +/***********************************************************************//** +Loads a table object based on the table id. 
+@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + /* NOTE that the operation of this function is protected by + dict_sys.latch, and therefore no deadlocks can occur + with other dictionary operations. */ + + mtr.start(); + /*---------------------------------------------------*/ + /* Get the secondary index based on ID for table SYS_TABLES */ + dict_index_t *sys_table_ids = + dict_sys.sys_tables->indexes.start->indexes.next; + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + /* Write the table id in byte format to id_buf */ + mach_write_to_8(id_buf, table_id); + dfield_set_data(&dfield, id_buf, 8); + dict_index_copy_types(&tuple, sys_table_ids, 1); + pcur.btr_cur.page_cur.index = sys_table_ids; + + dict_table_t* table = nullptr; + + if (btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr) + == DB_SUCCESS + && btr_pcur_is_on_user_rec(&pcur)) { + /*---------------------------------------------------*/ + /* Now we have the record in the secondary index + containing the table ID and NAME */ + const rec_t* rec = btr_pcur_get_rec(&pcur); +check_rec: + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLE_IDS__ID, &len); + ut_ad(len == 8); + + /* Check if the table id in record is the one searched for */ + if (table_id == mach_read_from_8(field)) { + field = rec_get_nth_field_old(rec, + DICT_FLD__SYS_TABLE_IDS__NAME, &len); + table = dict_sys.load_table( + {reinterpret_cast<const char*>(field), + len}, ignore_err); + if (table && table->id != table_id) { + ut_ad(rec_get_deleted_flag(rec, 0)); + table = nullptr; + } + if (!table) { + while (btr_pcur_move_to_next(&pcur, &mtr)) { + 
rec = btr_pcur_get_rec(&pcur); + + if (page_rec_is_user_rec(rec)) { + goto check_rec; + } + } + } + } + } + + mtr.commit(); + return table; +} + +/********************************************************************//** +This function is called when the database is booted. Loads system table +index definitions except for the clustered index which is added to the +dictionary cache at booting before calling this function. */ +void +dict_load_sys_table( +/*================*/ + dict_table_t* table) /*!< in: system table */ +{ + mem_heap_t* heap; + + ut_ad(dict_sys.locked()); + + heap = mem_heap_create(1000); + + dict_load_indexes(table, false, heap, DICT_ERR_IGNORE_NONE); + + mem_heap_free(heap); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/********************************************************************//** +Loads foreign key constraint col names (also for the referenced table). +Members that must be set (and valid) in foreign: +foreign->heap +foreign->n_fields +foreign->id ('\0'-terminated) +Members that will be created and set by this function: +foreign->foreign_col_names[i] +foreign->referenced_col_names[i] +(for i=0..foreign->n_fields-1) */ +static dberr_t dict_load_foreign_cols(dict_foreign_t *foreign, trx_id_t trx_id) +{ + btr_pcur_t pcur; + mtr_t mtr; + size_t id_len; + + ut_ad(dict_sys.locked()); + + id_len = strlen(foreign->id); + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_foreign_cols->indexes.start; + ut_ad(!dict_sys.sys_foreign_cols->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + dfield_set_data(&dfield, foreign->id, id_len); + dict_index_copy_types(&tuple, sys_index, 1); + 
pcur.btr_cur.page_cur.index = sys_index; + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + for (ulint i = 0; i < foreign->n_fields; i++) { + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + const rec_t* rec = btr_pcur_get_rec(&pcur); + ulint len; + const byte* field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_empty(heap); + } + + const trx_id_t id = trx_read_trx_id(field); + if (!id) { + } else if (id != trx_id && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, + &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec || rec_get_deleted_flag(rec, 0)) { + goto next; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + goto next; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + + if (len != id_len || memcmp(foreign->id, field, len)) { + const rec_t* pos; + ulint pos_len; + const rec_t* for_col_name; + ulint for_col_name_len; + const rec_t* ref_col_name; + ulint ref_col_name_len; + + pos = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, + &pos_len); + + for_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, + &for_col_name_len); + + ref_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, + &ref_col_name_len); + + ib::error sout; + + sout << "Unable to load column names for foreign" + " key '" << foreign->id + << "' because it was not found in" + " InnoDB internal table SYS_FOREIGN_COLS. 
The" + " closest entry we found is:" + " (ID='"; + sout.write(field, len); + sout << "', POS=" << mach_read_from_4(pos) + << ", FOR_COL_NAME='"; + sout.write(for_col_name, for_col_name_len); + sout << "', REF_COL_NAME='"; + sout.write(ref_col_name, ref_col_name_len); + sout << "')"; + + err = DB_CORRUPTION; + break; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + ut_a(len == 4); + ut_a(i == mach_read_from_4(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + foreign->foreign_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + foreign->referenced_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + +next: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } +func_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return err; +} + +/***********************************************************************//** +Loads a foreign key constraint to the dictionary cache. If the referenced +table is not yet loaded, it is added in the output parameter (fk_tables). 
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +dict_load_foreign( +/*==============*/ + const char* table_name, /*!< in: table name */ + bool uncommitted, /*!< in: use READ UNCOMMITTED + transaction isolation level */ + const char** col_names, + /*!< in: column names, or NULL + to use foreign->foreign_table->col_names */ + trx_id_t trx_id, + /*!< in: current transaction id, or 0 */ + bool check_recursive, + /*!< in: whether to record the foreign table + parent count to avoid unlimited recursive + load of chained foreign tables */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + span<const char> id, + /*!< in: foreign constraint id */ + dict_err_ignore_t ignore_err, + /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: the foreign key constraint is added + to the dictionary cache only if the referenced + table is already in cache. Otherwise, the + foreign key constraint is not added to cache, + and the referenced table is added to this + stack. 
*/ +{ + dict_foreign_t* foreign; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + dict_table_t* for_table; + dict_table_t* ref_table; + + DBUG_ENTER("dict_load_foreign"); + DBUG_PRINT("dict_load_foreign", + ("id: '%.*s', check_recursive: %d", + int(id.size()), id.data(), check_recursive)); + + ut_ad(dict_sys.locked()); + + dict_index_t* sys_index = dict_sys.sys_foreign->indexes.start; + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, id.data(), id.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + mtr.start(); + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { +not_found: + err = DB_NOT_FOUND; +err_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + DBUG_RETURN(err); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + static_assert(DICT_FLD__SYS_FOREIGN__ID == 0, "compatibility"); + field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len); + + /* Check if the id in record is the searched one */ + if (len != id.size() || memcmp(id.data(), field, id.size())) { + goto not_found; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + const trx_id_t tid = trx_read_trx_id(field); + + if (tid && tid != trx_id && !uncommitted + && trx_sys.find(nullptr, tid, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = 
old_vers; + if (!rec) { + goto not_found; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(tid); + goto not_found; + } + + /* Read the table names and the number of columns associated + with the constraint */ + + foreign = dict_mem_foreign_create(); + + uint32_t n_fields_and_type = mach_read_from_4( + rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len)); + + ut_a(len == 4); + + /* We store the type in the bits 24..29 of n_fields_and_type. */ + + foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + foreign->id = mem_heap_strdupl(foreign->heap, id.data(), id.size()); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + + foreign->foreign_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + const size_t foreign_table_name_len = len; + const size_t table_name_len = strlen(table_name); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + + if (!my_charset_latin1.strnncoll(table_name, table_name_len, + foreign->foreign_table_name, + foreign_table_name_len)) { + } else if (!check_recursive + && !my_charset_latin1.strnncoll(table_name, table_name_len, + (const char*) field, len)) { + } else { + dict_foreign_free(foreign); + goto not_found; + } + + foreign->referenced_table_name = mem_heap_strdupl( + foreign->heap, (const char*) field, len); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + err = dict_load_foreign_cols(foreign, trx_id); + if (err != DB_SUCCESS) { + goto load_error; + } + + ref_table = dict_sys.find_table( + {foreign->referenced_table_name_lookup, + strlen(foreign->referenced_table_name_lookup)}); + for_table = dict_sys.find_table( + {foreign->foreign_table_name_lookup, + strlen(foreign->foreign_table_name_lookup)}); + + if 
(!for_table) { + /* To avoid recursively loading the tables related through + the foreign key constraints, the child table name is saved + here. The child table will be loaded later, along with its + foreign key constraint. */ + + ut_a(ref_table != NULL); + fk_tables.push_back( + mem_heap_strdupl(ref_table->heap, + foreign->foreign_table_name_lookup, + foreign_table_name_len)); +load_error: + dict_foreign_remove_from_cache(foreign); + DBUG_RETURN(err); + } + + ut_a(for_table || ref_table); + + /* Note that there may already be a foreign constraint object in + the dictionary cache for this constraint: then the following + call only sets the pointers in it to point to the appropriate table + and index objects and frees the newly created object foreign. + Adding to the cache should always succeed since we are not creating + a new foreign key constraint but loading one from the data + dictionary. */ + + DBUG_RETURN(dict_foreign_add_to_cache(foreign, col_names, + check_charsets, + ignore_err)); +} + +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. + +The foreign key constraint is loaded only if the referenced table is also +in the dictionary cache. If the referenced table is not in dictionary +cache, then it is added to the output parameter (fk_tables). 
+ +@return DB_SUCCESS or error code */ +dberr_t +dict_load_foreigns( + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + trx_id_t trx_id, /*!< in: DDL transaction id, + or 0 to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err, /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: stack of table + names which must be loaded + subsequently to load all the + foreign key constraints. */ +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_foreigns"); + + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign || !dict_sys.sys_foreign_cols) { + if (ignore_err & DICT_ERR_IGNORE_FK_NOKEY) { + DBUG_RETURN(DB_SUCCESS); + } + sql_print_information("InnoDB: No foreign key system tables" + " in the database"); + DBUG_RETURN(DB_ERROR); + } + + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dict_index_t *sec_index = dict_table_get_next_index( + dict_table_get_first_index(dict_sys.sys_foreign)); + ut_ad(!strcmp(sec_index->fields[0].name, "FOR_NAME")); + bool check_recursive = !trx_id; + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + +start_load: + mtr.start(); + dfield_set_data(&dfield, table_name, strlen(table_name)); + dict_index_copy_types(&tuple, sec_index, 1); + pcur.btr_cur.page_cur.index = sec_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + DBUG_RETURN(err); + } +loop: + const rec_t* rec = btr_pcur_get_rec(&pcur); + const byte* field; + const auto maybe_deleted = rec_get_deleted_flag(rec, 0); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* End of index */ + + goto load_next_index; + } + + /* Now we have the record in the secondary index containing a table + name and a foreign constraint ID */ + + ulint len; + 
field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len); + + /* Check if the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + + if (cmp_data(dfield_get_type(&dfield)->mtype, + dfield_get_type(&dfield)->prtype, + false, + reinterpret_cast<const byte*>(table_name), + dfield_get_len(&dfield), + field, len)) { + goto load_next_index; + } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. If lower_case_table_names=2 then what is stored + may not be the same case, but the previous comparison showed that they + match with no-case. */ + + if (lower_case_table_names != 2 && memcmp(field, table_name, len)) { + goto next_rec; + } + + /* Now we get a foreign key constraint id */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len); + + /* Copy the string because the page may be modified or evicted + after mtr.commit() below. */ + char fk_id[MAX_TABLE_NAME_LEN + NAME_LEN]; + err = DB_SUCCESS; + if (UNIV_LIKELY(len < sizeof fk_id)) { + memcpy(fk_id, field, len); + } + + btr_pcur_store_position(&pcur, &mtr); + + mtr.commit(); + + /* Load the foreign constraint definition to the dictionary cache */ + + err = len < sizeof fk_id + ? 
dict_load_foreign(table_name, false, col_names, trx_id, + check_recursive, check_charsets, + {fk_id, len}, ignore_err, fk_tables) + : DB_CORRUPTION; + + switch (err) { + case DB_SUCCESS: + break; + case DB_NOT_FOUND: + if (maybe_deleted) { + break; + } + sql_print_error("InnoDB: Cannot load foreign constraint %.*s:" + " could not find the relevant record in " + "SYS_FOREIGN", int(len), fk_id); + /* fall through */ + default: +corrupted: + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); + } + + mtr.start(); + if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) + == btr_pcur_t::CORRUPTED) { + mtr.commit(); + goto corrupted; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; + +load_next_index: + mtr.commit(); + + if ((sec_index = dict_table_get_next_index(sec_index))) { + /* Switch to scan index on REF_NAME, fk_max_recusive_level + already been updated when scanning FOR_NAME index, no need to + update again */ + check_recursive = false; + goto start_load; + } + + ut_free(pcur.old_rec_buf); + DBUG_RETURN(DB_SUCCESS); +} diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc new file mode 100644 index 00000000..b8b2d583 --- /dev/null +++ b/storage/innobase/dict/dict0mem.cc @@ -0,0 +1,1379 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
/** System databases: name prefixes, including the '/' separator.
The list is terminated by a null pointer. */
static const char* innobase_system_databases[] = {
	"mysql/",
	"information_schema/",
	"performance_schema/",
	nullptr
};

/** Determine if a table belongs to innobase_system_databases[]
@param[in]	name	database_name/table_name
@return whether name has no '/' at all (names of the form SYS_*), or
is longer than, and starts with, one of innobase_system_databases[] */
static bool dict_mem_table_is_system(const char *name)
{
	/* A name without a database part is treated as a system table */
	if (strchr(name, '/') == nullptr) {
		return true;
	}

	const size_t	name_len = strlen(name);

	for (const char* const* db = innobase_system_databases; *db; ++db) {
		const size_t	prefix_len = strlen(*db);

		/* The prefix must be followed by a non-empty table name */
		if (name_len > prefix_len
		    && strncmp(name, *db, prefix_len) == 0) {
			return true;
		}
	}

	return false;
}
+@param[in,out] s output stream +@param[in] id_name SQL identifier (other than table name) +@return the output stream */ +std::ostream& +operator<<( + std::ostream& s, + const id_name_t& id_name) +{ + const char q = '`'; + const char* c = id_name; + s << q; + for (; *c != 0; c++) { + if (*c == q) { + s << *c; + } + s << *c; + } + s << q; + return(s); +} + +/** Display a table name. +@param[in,out] s output stream +@param[in] table_name table name +@return the output stream */ +std::ostream& +operator<<( + std::ostream& s, + const table_name_t& table_name) +{ + return(s << ut_get_name(NULL, table_name.m_name)); +} + +bool dict_col_t::same_encoding(uint16_t a, uint16_t b) +{ + if (const CHARSET_INFO *acs= get_charset(a, MYF(MY_WME))) + if (const CHARSET_INFO *bcs= get_charset(b, MYF(MY_WME))) + return Charset(bcs).encoding_allows_reinterpret_as(acs); + return false; +} + +/** Create metadata. +@param name table name +@param space tablespace +@param n_cols total number of columns (both virtual and non-virtual) +@param n_v_cols number of virtual columns +@param flags table flags +@param flags2 table flags2 +@return newly allocated table object */ +dict_table_t *dict_table_t::create(const span<const char> &name, + fil_space_t *space, + ulint n_cols, ulint n_v_cols, ulint flags, + ulint flags2) +{ + ut_ad(!space || space->purpose == FIL_TYPE_TABLESPACE || + space->purpose == FIL_TYPE_TEMPORARY || + space->purpose == FIL_TYPE_IMPORT); + ut_a(dict_tf2_is_valid(flags, flags2)); + ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK)); + + mem_heap_t *heap= mem_heap_create(DICT_HEAP_SIZE); + + dict_table_t *table= static_cast<dict_table_t*> + (mem_heap_zalloc(heap, sizeof(*table))); + + lock_table_lock_list_init(&table->locks); + UT_LIST_INIT(table->indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + table->heap= heap; + + ut_d(table->magic_n= DICT_TABLE_MAGIC_N); + + table->flags= 
static_cast<unsigned>(flags) & ((1U << DICT_TF_BITS) - 1); + table->flags2= static_cast<unsigned>(flags2) & ((1U << DICT_TF2_BITS) - 1); + table->name.m_name= mem_strdupl(name.data(), name.size()); + table->mdl_name.m_name= table->name.m_name; + table->is_system_db= dict_mem_table_is_system(table->name.m_name); + table->space= space; + table->space_id= space ? space->id : UINT32_MAX; + table->n_t_cols= static_cast<unsigned>(n_cols + DATA_N_SYS_COLS) & + dict_index_t::MAX_N_FIELDS; + table->n_v_cols= static_cast<unsigned>(n_v_cols) & + dict_index_t::MAX_N_FIELDS; + table->n_cols= static_cast<unsigned>(table->n_t_cols - table->n_v_cols) & + dict_index_t::MAX_N_FIELDS; + table->cols= static_cast<dict_col_t*> + (mem_heap_alloc(heap, table->n_cols * sizeof *table->cols)); + table->v_cols= static_cast<dict_v_col_t*> + (mem_heap_alloc(heap, n_v_cols * sizeof *table->v_cols)); + for (ulint i = n_v_cols; i--; ) + new (&table->v_cols[i]) dict_v_col_t(); + table->autoinc_lock= static_cast<ib_lock_t*> + (mem_heap_alloc(heap, sizeof *table->autoinc_lock)); + /* If the table has an FTS index or we are in the process + of building one, create the table->fts */ + if (dict_table_has_fts_index(table) || + DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID | + DICT_TF2_FTS_ADD_DOC_ID)) + { + table->fts= fts_create(table); + table->fts->cache= fts_cache_create(table); + } + + new (&table->foreign_set) dict_foreign_set(); + new (&table->referenced_set) dict_foreign_set(); + + return table; +} + +/****************************************************************//** +Free a table memory object. 
*/ +void +dict_mem_table_free( +/*================*/ + dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(UT_LIST_GET_LEN(table->freed_indexes) == 0); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(table->cached = FALSE); + + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + if (table->fts) { + table->fts->~fts_t(); + } + } + + dict_mem_table_free_foreign_vcol_set(table); + + table->foreign_set.~dict_foreign_set(); + table->referenced_set.~dict_foreign_set(); + + ut_free(table->name.m_name); + + /* Clean up virtual index info structures that are registered + with virtual columns */ + for (ulint i = 0; i < table->n_v_def; i++) { + dict_table_get_nth_v_col(table, i)->~dict_v_col_t(); + } + + UT_DELETE(table->s_cols); + + mem_heap_free(table->heap); +} + +/****************************************************************//** +Append 'name' to 'col_names'. @see dict_table_t::col_names +@return new column names array */ +static +const char* +dict_add_col_name( +/*==============*/ + const char* col_names, /*!< in: existing column names, or + NULL */ + ulint cols, /*!< in: number of existing columns */ + const char* name, /*!< in: new column name */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint old_len; + ulint new_len; + ulint total_len; + char* res; + + ut_ad(!cols == !col_names); + + /* Find out length of existing array. 
*/ + if (col_names) { + const char* s = col_names; + ulint i; + + for (i = 0; i < cols; i++) { + s += strlen(s) + 1; + } + + old_len = unsigned(s - col_names); + } else { + old_len = 0; + } + + new_len = strlen(name) + 1; + total_len = old_len + new_len; + + res = static_cast<char*>(mem_heap_alloc(heap, total_len)); + + if (old_len > 0) { + memcpy(res, col_names, old_len); + } + + memcpy(res + old_len, name, new_len); + + return(res); +} + +/**********************************************************************//** +Adds a column definition to a table. */ +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ +{ + dict_col_t* col; + unsigned i; + + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(!(prtype & DATA_VIRTUAL)); + + i = table->n_def++; + + table->n_t_def++; + + if (name) { + if (table->n_def == table->n_cols) { + heap = table->heap; + } + if (i && !table->col_names) { + /* All preceding column names are empty. */ + char* s = static_cast<char*>( + mem_heap_zalloc(heap, table->n_def)); + + table->col_names = s; + } + + table->col_names = dict_add_col_name(table->col_names, + i, name, heap); + } + + col = dict_table_get_nth_col(table, i); + + dict_mem_fill_column_struct(col, i, mtype, prtype, len); + + switch (prtype & DATA_VERSIONED) { + case DATA_VERS_START: + ut_ad(!table->vers_start); + table->vers_start = i & dict_index_t::MAX_N_FIELDS; + break; + case DATA_VERS_END: + ut_ad(!table->vers_end); + table->vers_end = i & dict_index_t::MAX_N_FIELDS; + } +} + +/** Adds a virtual column definition to a table. +@param[in,out] table table +@param[in,out] heap temporary memory heap, or NULL. 
It is + used to store name when we have not finished + adding all columns. When all columns are + added, the whole name will copy to memory from + table->heap +@param[in] name column name +@param[in] mtype main datatype +@param[in] prtype precise type +@param[in] len length +@param[in] pos position in a table +@param[in] num_base number of base columns +@return the virtual column definition */ +dict_v_col_t* +dict_mem_table_add_v_col( + dict_table_t* table, + mem_heap_t* heap, + const char* name, + ulint mtype, + ulint prtype, + ulint len, + ulint pos, + ulint num_base) +{ + dict_v_col_t* v_col; + + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(prtype & DATA_VIRTUAL); + + unsigned i = table->n_v_def++; + + table->n_t_def++; + + if (name != NULL) { + if (table->n_v_def == table->n_v_cols) { + heap = table->heap; + } + + if (i && !table->v_col_names) { + /* All preceding column names are empty. */ + char* s = static_cast<char*>( + mem_heap_zalloc(heap, table->n_v_def)); + + table->v_col_names = s; + } + + table->v_col_names = dict_add_col_name(table->v_col_names, + i, name, heap); + } + + v_col = &table->v_cols[i]; + + dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len); + v_col->v_pos = i & dict_index_t::MAX_N_FIELDS; + + if (num_base != 0) { + v_col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + table->heap, num_base * sizeof( + *v_col->base_col))); + } else { + v_col->base_col = NULL; + } + + v_col->num_base = static_cast<unsigned>(num_base) + & dict_index_t::MAX_N_FIELDS; + + /* Initialize the index list for virtual columns */ + ut_ad(v_col->v_indexes.empty()); + + return(v_col); +} + +/** Adds a stored column definition to a table. +@param[in] table table +@param[in] num_base number of base columns. 
*/ +void +dict_mem_table_add_s_col( + dict_table_t* table, + ulint num_base) +{ + unsigned i = unsigned(table->n_def) - 1; + dict_col_t* col = dict_table_get_nth_col(table, i); + dict_s_col_t s_col; + + ut_ad(col != NULL); + + if (table->s_cols == NULL) { + table->s_cols = UT_NEW_NOKEY(dict_s_col_list()); + } + + s_col.m_col = col; + s_col.s_pos = i + table->n_v_def; + + if (num_base != 0) { + s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + table->heap, num_base * sizeof(dict_col_t*))); + } else { + s_col.base_col = NULL; + } + + s_col.num_base = num_base; + table->s_cols->push_front(s_col); +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +static MY_ATTRIBUTE((nonnull)) +void +dict_mem_table_col_rename_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned i, /*!< in: column offset corresponding to s */ + const char* to, /*!< in: new column name */ + const char* s, /*!< in: pointer to table->col_names */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + char* t_col_names = const_cast<char*>( + is_virtual ? table->v_col_names : table->col_names); + ulint n_col = is_virtual ? table->n_v_def : table->n_def; + + size_t from_len = strlen(s), to_len = strlen(to); + + ut_ad(i < table->n_def || is_virtual); + ut_ad(i < table->n_v_def || !is_virtual); + + ut_ad(from_len <= NAME_LEN); + ut_ad(to_len <= NAME_LEN); + + char from[NAME_LEN + 1]; + strncpy(from, s, sizeof from - 1); + from[sizeof from - 1] = '\0'; + + if (from_len == to_len) { + /* The easy case: simply replace the column name in + table->col_names. */ + strcpy(const_cast<char*>(s), to); + } else { + /* We need to adjust all affected index->field + pointers, as in dict_index_add_col(). First, copy + table->col_names. 
*/ + ulint prefix_len = ulint(s - t_col_names); + + for (; i < n_col; i++) { + s += strlen(s) + 1; + } + + ulint full_len = ulint(s - t_col_names); + char* col_names; + + if (to_len > from_len) { + col_names = static_cast<char*>( + mem_heap_alloc( + table->heap, + full_len + to_len - from_len)); + + memcpy(col_names, t_col_names, prefix_len); + } else { + col_names = const_cast<char*>(t_col_names); + } + + memcpy(col_names + prefix_len, to, to_len); + memmove(col_names + prefix_len + to_len, + t_col_names + (prefix_len + from_len), + full_len - (prefix_len + from_len)); + + /* Replace the field names in every index. */ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + ulint n_fields = dict_index_get_n_fields(index); + + for (ulint i = 0; i < n_fields; i++) { + dict_field_t* field + = dict_index_get_nth_field( + index, i); + + ut_ad(!field->name + == field->col->is_dropped()); + if (!field->name) { + /* dropped columns lack a name */ + ut_ad(index->is_instant()); + continue; + } + + /* if is_virtual and that in field->col does + not match, continue */ + if ((!is_virtual) != + (!field->col->is_virtual())) { + continue; + } + + ulint name_ofs + = ulint(field->name - t_col_names); + if (name_ofs <= prefix_len) { + field->name = col_names + name_ofs; + } else { + ut_a(name_ofs < full_len); + field->name = col_names + + name_ofs + to_len - from_len; + } + } + } + + if (is_virtual) { + table->v_col_names = col_names; + } else { + table->col_names = col_names; + } + } + + /* Virtual columns are not allowed for foreign key */ + if (is_virtual) { + return; + } + + dict_foreign_t* foreign; + + /* Replace the field names in every foreign key constraint. 
*/ + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == NULL) { + /* We may go here when we set foreign_key_checks to 0, + and then try to rename a column and modify the + corresponding foreign key constraint. The index + would have been dropped, we have to find an equivalent + one */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (strcmp(foreign->foreign_col_names[f], from) + == 0) { + + char** rc = const_cast<char**>( + foreign->foreign_col_names + + f); + + if (to_len <= strlen(*rc)) { + memcpy(*rc, to, to_len + 1); + } else { + *rc = static_cast<char*>( + mem_heap_dup( + foreign->heap, + to, + to_len + 1)); + } + } + } + + /* New index can be null if InnoDB already dropped + the foreign index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->foreign_index = dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + + } else { + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* These can point straight to + table->col_names, because the foreign key + constraints will be freed at the same time + when the table object is freed. */ + foreign->foreign_col_names[f] + = dict_index_get_nth_field( + foreign->foreign_index, + f)->name; + } + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (!foreign->referenced_index) { + /* Referenced index could have been dropped + when foreign_key_checks is disabled. 
In that case, + rename the corresponding referenced_col_names and + find the equivalent referenced index also */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + + const char*& rc = + foreign->referenced_col_names[f]; + if (strcmp(rc, from)) { + continue; + } + + if (to_len <= strlen(rc)) { + memcpy(const_cast<char*>(rc), to, + to_len + 1); + } else { + rc = static_cast<char*>( + mem_heap_dup( + foreign->heap, + to, to_len + 1)); + } + } + + /* New index can be null if InnoDB already dropped + the referenced index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->referenced_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + return; + } + + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* foreign->referenced_col_names[] need to be + copies, because the constraint may become + orphan when foreign_key_checks=0 and the + parent table is dropped. */ + + const char* col_name = dict_index_get_nth_field( + foreign->referenced_index, f)->name; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + char** rc = const_cast<char**>( + foreign->referenced_col_names + f); + size_t col_name_len_1 = strlen(col_name) + 1; + + if (col_name_len_1 <= strlen(*rc) + 1) { + memcpy(*rc, col_name, col_name_len_1); + } else { + *rc = static_cast<char*>( + mem_heap_dup( + foreign->heap, + col_name, + col_name_len_1)); + } + } + } + } +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ulint nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to, /*!< in: new column name */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + const char* s = is_virtual ? 
table->v_col_names : table->col_names; + + ut_ad((!is_virtual && nth_col < table->n_def) + || (is_virtual && nth_col < table->n_v_def)); + + for (ulint i = 0; i < nth_col; i++) { + size_t len = strlen(s); + ut_ad(len > 0); + s += len + 1; + } + + ut_ad(!my_strcasecmp(system_charset_info, from, s)); + + dict_mem_table_col_rename_low(table, static_cast<unsigned>(nth_col), + to, s, is_virtual); +} + +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len) /*!< in: column length */ +{ + unsigned mbminlen, mbmaxlen; + + column->ind = static_cast<unsigned>(col_pos) + & dict_index_t::MAX_N_FIELDS; + column->ord_part = 0; + column->max_prefix = 0; + column->mtype = static_cast<uint8_t>(mtype); + column->prtype = static_cast<unsigned>(prtype); + column->len = static_cast<uint16_t>(col_len); + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + column->mbminlen = mbminlen & 7; + column->mbmaxlen = mbmaxlen & 7; + column->def_val.data = NULL; + column->def_val.len = UNIV_SQL_DEFAULT; + ut_ad(!column->is_dropped()); +} + +/**********************************************************************//** +Creates an index memory object. +@return own: index object */ +dict_index_t* +dict_mem_index_create( +/*==================*/ + dict_table_t* table, /*!< in: table */ + const char* index_name, /*!< in: index name */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... 
ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + dict_index_t* index; + mem_heap_t* heap; + + ut_ad(!table || table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index_name); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + index = static_cast<dict_index_t*>( + mem_heap_zalloc(heap, sizeof(*index))); + index->table = table; + + dict_mem_fill_index_struct(index, heap, index_name, type, n_fields); + + new (&index->zip_pad.mutex) std::mutex(); + + if (type & DICT_SPATIAL) { + index->rtr_track = new + (mem_heap_alloc(heap, sizeof *index->rtr_track)) + rtr_info_track_t(); + mysql_mutex_init(rtr_active_mutex_key, + &index->rtr_track->rtr_active_mutex, nullptr); + } + + return(index); +} + +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +dict_foreign_t* +dict_mem_foreign_create(void) +/*=========================*/ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + DBUG_ENTER("dict_mem_foreign_create"); + + heap = mem_heap_create(100); + + foreign = static_cast<dict_foreign_t*>( + mem_heap_zalloc(heap, sizeof(dict_foreign_t))); + + foreign->heap = heap; + + foreign->v_cols = NULL; + + DBUG_PRINT("dict_mem_foreign_create", ("heap: %p", heap)); + + DBUG_RETURN(foreign); +} + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. 
*/ +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->foreign_table_name) + 1; + + foreign->foreign_table_name_lookup = + static_cast<char*>( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->foreign_table_name_lookup, + foreign->foreign_table_name); + innobase_casedn_str(foreign->foreign_table_name_lookup); + } else { + foreign->foreign_table_name_lookup + = foreign->foreign_table_name; + } +} + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. */ +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->referenced_table_name) + 1; + + foreign->referenced_table_name_lookup = + static_cast<char*>( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->referenced_table_name_lookup, + foreign->referenced_table_name); + innobase_casedn_str(foreign->referenced_table_name_lookup); + } else { + foreign->referenced_table_name_lookup + = foreign->referenced_table_name; + } +} + +/** Fill the virtual column set with virtual column information +present in the given virtual index. +@param[in] index virtual index +@param[out] v_cols virtual column set. 
*/ +static +void +dict_mem_fill_vcol_has_index( + const dict_index_t* index, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < index->table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + if (!v_col->m_col.ord_part) { + continue; + } + + for (const auto& v_idx : v_col->v_indexes) { + if (v_idx.index != index) { + continue; + } + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } +} + +/** Fill the virtual column set with the virtual column of the index +if the index contains given column name. +@param[in] col_name column name +@param[in] table innodb table object +@param[out] v_cols set of virtual column information. */ +static +void +dict_mem_fill_vcol_from_v_indexes( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + /* virtual column can't be Primary Key, so start with + secondary index */ + for (dict_index_t* index = dict_table_get_next_index( + dict_table_get_first_index(table)); + index; + index = dict_table_get_next_index(index)) { + + /* Skip if the index have newly added + virtual column because field name is NULL. + Later virtual column set will be + refreshed during loading of table. */ + if (!dict_index_has_virtual(index) + || index->has_new_v_col()) { + continue; + } + + for (ulint i = 0; i < index->n_fields; i++) { + dict_field_t* field = + dict_index_get_nth_field(index, i); + + if (strcmp(field->name, col_name) == 0) { + dict_mem_fill_vcol_has_index( + index, v_cols); + } + } + } +} + +/** Fill the virtual column set with virtual columns which have base columns +as the given col_name +@param[in] col_name column name +@param[in] table table object +@param[out] v_cols set of virtual columns. 
*/ +static +void +dict_mem_fill_vcol_set_for_base_col( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i); + + if (!v_col->m_col.ord_part) { + continue; + } + + for (ulint j = 0; j < unsigned{v_col->num_base}; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + v_col->base_col[j]->ind)) == 0) { + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } + } +} + +/** Fills the dependent virtual columns in a set. +Reason for being dependent are +1) FK can be present on base column of virtual columns +2) FK can be present on column which is a part of virtual index +@param[in,out] foreign foreign key information. */ +void +dict_mem_foreign_fill_vcol_set( + dict_foreign_t* foreign) +{ + ulint type = foreign->type; + + if (type == 0) { + return; + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + /** FK can be present on base columns + of virtual columns. */ + dict_mem_fill_vcol_set_for_base_col( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + + /** FK can be present on the columns + which can be a part of virtual index. */ + dict_mem_fill_vcol_from_v_indexes( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + } +} + +/** Fill virtual columns set in each fk constraint present in the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_fill_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + foreign = *it; + + dict_mem_foreign_fill_vcol_set(foreign); + } +} + +/** Free the vcol_set from all foreign key constraint on the table. +@param[in,out] table innodb table object. 
*/ +void +dict_mem_table_free_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + + foreign = *it; + + if (foreign->v_cols != NULL) { + UT_DELETE(foreign->v_cols); + foreign->v_cols = NULL; + } + } +} + +/**********************************************************************//** +Frees an index memory object. */ +void +dict_mem_index_free( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->zip_pad.mutex.~mutex(); + + if (dict_index_is_spatial(index)) { + for (auto& rtr_info : index->rtr_track->rtr_active) { + rtr_info->index = NULL; + } + + mysql_mutex_destroy(&index->rtr_track->rtr_active_mutex); + index->rtr_track->~rtr_info_track_t(); + } + + index->detach_columns(); + mem_heap_free(index->heap); +} + +/** Create a temporary tablename like "#sql-ibNNN". +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id) +{ + size_t size; + char* name; + const char* dbend = strchr(dbtab, '/'); + ut_ad(dbend); + size_t dblen = size_t(dbend - dbtab) + 1; + + size = dblen + (sizeof(TEMP_FILE_PREFIX_INNODB) + 20); + name = static_cast<char*>(mem_heap_alloc(heap, size)); + memcpy(name, dbtab, dblen); + snprintf(name + dblen, size - dblen, + TEMP_FILE_PREFIX_INNODB UINT64PF, id); + + return(name); +} + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. 
*/ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set) +{ + dict_foreign_not_exists not_exists(fk_set); + + dict_foreign_set::const_iterator it = std::find_if( + fk_set.begin(), fk_set.end(), not_exists); + + if (it == fk_set.end()) { + return(true); + } + + dict_foreign_t* foreign = *it; + std::cerr << "Foreign key lookup failed: " << *foreign; + std::cerr << fk_set; + ut_ad(0); + return(false); +} + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). +@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table) +{ + return(dict_foreign_set_validate(table.foreign_set) + && dict_foreign_set_validate(table.referenced_set)); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign) +{ + out << "[dict_foreign_t: id='" << foreign.id << "'"; + + if (foreign.foreign_table_name != NULL) { + out << ",for: '" << foreign.foreign_table_name << "'"; + } + + out << "]"; + return(out); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set) +{ + out << "[dict_foreign_set:"; + std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out)); + out << "]" << std::endl; + return(out); +} + +/** Check whether fulltext index gets affected by foreign +key constraint. */ +bool dict_foreign_t::affects_fulltext() const +{ + if (foreign_table == referenced_table || !foreign_table->fts) + return false; + + for (ulint i= 0; i < n_fields; i++) + { + const dict_col_t *col= dict_index_get_nth_col(foreign_index, i); + if (dict_table_is_fts_column(foreign_table->fts->indexes, col->ind, + col->is_virtual()) != ULINT_UNDEFINED) + return true; + } + + return false; +} + +/** Reconstruct the clustered index fields. 
+@return whether metadata is incorrect */ +inline bool dict_index_t::reconstruct_fields() +{ + DBUG_ASSERT(is_primary()); + + const auto old_n_fields = n_fields; + + n_fields = (n_fields + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + n_def = (n_def + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + + const unsigned n_first = first_user_field(); + + dict_field_t* tfields = static_cast<dict_field_t*>( + mem_heap_zalloc(heap, n_fields * sizeof *fields)); + + memcpy(tfields, fields, n_first * sizeof *fields); + + n_nullable = 0; + ulint n_core_null = 0; + const bool comp = dict_table_is_comp(table); + const auto* field_map_it = table->instant->field_map; + for (unsigned i = n_first, j = 0; i < n_fields; ) { + dict_field_t& f = tfields[i++]; + auto c = *field_map_it++; + if (c.is_dropped()) { + f.col = &table->instant->dropped[j++]; + DBUG_ASSERT(f.col->is_dropped()); + f.fixed_len = dict_col_get_fixed_size(f.col, comp) + & ((1U << 10) - 1); + } else { + DBUG_ASSERT(!c.is_not_null()); + const auto old = std::find_if( + fields + n_first, fields + old_n_fields, + [c](const dict_field_t& o) + { return o.col->ind == c.ind(); }); + + if (old >= fields + old_n_fields + || old->prefix_len + || old->col != &table->cols[c.ind()]) { + return true; + } + + ut_ad(old >= &fields[n_first]); + f = *old; + } + + f.col->clear_instant(); + if (f.col->is_nullable()) { + n_nullable++; + n_core_null += i <= n_core_fields; + } + } + + fields = tfields; + n_core_null_bytes = static_cast<byte>(UT_BITS_IN_BYTES(n_core_null)); + + return false; +} + +/** Reconstruct dropped or reordered columns. 
+@param[in] metadata data from serialise_columns() +@param[in] len length of the metadata, in bytes +@return whether parsing the metadata failed */ +bool dict_table_t::deserialise_columns(const byte* metadata, ulint len) +{ + DBUG_ASSERT(!instant); + + unsigned num_non_pk_fields = mach_read_from_4(metadata); + metadata += 4; + + if (num_non_pk_fields >= REC_MAX_N_FIELDS - 3) { + return true; + } + + dict_index_t* index = UT_LIST_GET_FIRST(indexes); + + if (num_non_pk_fields < unsigned(index->n_fields) + - index->first_user_field()) { + return true; + } + + field_map_element_t* field_map = static_cast<field_map_element_t*>( + mem_heap_alloc(heap, + num_non_pk_fields * sizeof *field_map)); + + unsigned n_dropped_cols = 0; + + for (unsigned i = 0; i < num_non_pk_fields; i++) { + auto c = field_map[i] = mach_read_from_2(metadata); + metadata += 2; + + if (field_map[i].is_dropped()) { + if (c.ind() > DICT_MAX_FIXED_COL_LEN + 1) { + return true; + } + n_dropped_cols++; + } else if (c >= n_cols) { + return true; + } + } + + dict_col_t* dropped_cols = static_cast<dict_col_t*>(mem_heap_zalloc( + heap, n_dropped_cols * sizeof(dict_col_t))); + instant = new (mem_heap_alloc(heap, sizeof *instant)) dict_instant_t(); + instant->n_dropped = n_dropped_cols; + instant->dropped = dropped_cols; + instant->field_map = field_map; + + dict_col_t* col = dropped_cols; + for (unsigned i = 0; i < num_non_pk_fields; i++) { + if (field_map[i].is_dropped()) { + auto fixed_len = field_map[i].ind(); + DBUG_ASSERT(fixed_len <= DICT_MAX_FIXED_COL_LEN + 1); + (col++)->set_dropped(field_map[i].is_not_null(), + fixed_len == 1, + fixed_len > 1 ? fixed_len - 1 + : 0); + } + } + DBUG_ASSERT(col == &dropped_cols[n_dropped_cols]); + + return UT_LIST_GET_FIRST(indexes)->reconstruct_fields(); +} + +/** Check if record in clustered index is historical row. 
+@param[in] rec clustered row +@param[in] offsets offsets +@return true if row is historical */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + const rec_offs* offsets) +{ + ut_ad(is_primary()); + + ulint len; + dict_col_t& col= table->cols[table->vers_end]; + ut_ad(col.vers_sys_end()); + ulint nfield = dict_col_get_clust_pos(&col, this); + const byte *data = rec_get_nth_field(rec, offsets, nfield, &len); + if (col.vers_native()) { + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); +} + +/** Check if record in secondary index is historical row. +@param[in] rec record in a secondary index +@param[out] history_row true if row is historical +@return true on error */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + bool &history_row) +{ + ut_ad(!is_primary()); + + /* + Get row_end from clustered index + + TODO (optimization): row_end can be taken from unique secondary index + as well. For that dict_index_t::vers_end member should be added and + updated at index init (dict_index_build_internal_non_clust()). 
+ + Test case: + + create or replace table t1 (x int unique, y int unique, + foreign key r (y) references t1 (x)) + with system versioning engine innodb; + insert into t1 values (1, 1); + */ + bool error = false; + mem_heap_t* heap = NULL; + dict_index_t* clust_index = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + mtr_t mtr; + mtr.start(); + + rec_t* clust_rec = + row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index, &mtr); + if (clust_rec) { + offsets = rec_get_offsets(clust_rec, clust_index, offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + history_row = clust_index->vers_history_row(clust_rec, offsets); + } else { + ib::error() << "foreign constraints: secondary index is out of " + "sync"; + ut_ad("secondary index is out of sync" == 0); + error = true; + } + mtr.commit(); + if (heap) { + mem_heap_free(heap); + } + return(error); +} diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc new file mode 100644 index 00000000..40969335 --- /dev/null +++ b/storage/innobase/dict/dict0stats.cc @@ -0,0 +1,4724 @@ +/***************************************************************************** + +Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0stats.cc +Code used for calculating and manipulating table statistics. + +Created Jan 06, 2010 Vasil Dimov +*******************************************************/ + +#include "dict0stats.h" +#include "dyn0buf.h" +#include "row0sel.h" +#include "trx0trx.h" +#include "lock0lock.h" +#include "pars0pars.h" +#include <mysql_com.h> +#include "log.h" +#include "btr0btr.h" +#include "que0que.h" +#include "scope.h" +#include "debug_sync.h" + +#include <algorithm> +#include <map> +#include <vector> +#include <thread> + +/* Sampling algorithm description @{ + +The algorithm is controlled by one number - N_SAMPLE_PAGES(index), +let it be A, which is the number of leaf pages to analyze for a given index +for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be +analyzed). + +Let the total number of leaf pages in the table be T. +Level 0 - leaf pages, level H - root. + +Definition: N-prefix-boring record is a record on a non-leaf page that equals +the next (to the right, cross page boundaries, skipping the supremum and +infimum) record on the same level when looking at the first n-prefix columns. +The last (user) record on a level is not boring (it does not match the +non-existent user record to the right). We call the records boring because all +the records on the page below a boring record are equal to that boring record. + +We avoid diving below boring records when searching for a leaf page to +estimate the number of distinct records because we know that such a leaf +page will have number of distinct records == 1. 
+ +For each n-prefix: start from the root level and full scan subsequent lower +levels until a level that contains at least A*10 distinct records is found. +Let's call this level LA. +As an optimization the search is canceled if it has reached level 1 (never +descend to the level 0 (leaf)) and also if the next level to be scanned +would contain more than A pages. The latter is because the user has asked +to analyze A leaf pages and it does not make sense to scan much more than +A non-leaf pages with the sole purpose of finding a good sample of A leaf +pages. + +After finding the appropriate level LA with >A*10 distinct records (or less in +the exceptions described above), divide it into groups of equal records and +pick A such groups. Then pick the last record from each group. For example, +let the level be: + +index: 0,1,2,3,4,5,6,7,8,9,10 +record: 1,1,1,2,2,7,7,7,7,7,9 + +There are 4 groups of distinct records and if A=2 random ones are selected, +e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected. + +After selecting A records as described above, dive below them to find A leaf +pages and analyze them, finding the total number of distinct records. The +dive to the leaf level is performed by selecting a non-boring record from +each page and diving below it. + +This way, a total of A leaf pages are analyzed for the given n-prefix. + +Let the number of different key values found in each leaf page i be Pi (i=1..A). +Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A. +Let the number of different key values on level LA be N_DIFF_LA. +Let the total number of records on level LA be TOTAL_LA. +Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the +leaf level. +Let the number of leaf pages be N. +Then the total number of different key values on the leaf level is: +N * R * N_DIFF_AVG_LEAF. +See REF01 for the implementation. + +The above describes how to calculate the cardinality of an index. 
+This algorithm is executed for each n-prefix of a multi-column index +where n=1..n_uniq. +@} */ + +/* names of the tables from the persistent statistics storage */ +#define TABLE_STATS_NAME_PRINT "mysql.innodb_table_stats" +#define INDEX_STATS_NAME_PRINT "mysql.innodb_index_stats" + +#ifdef UNIV_STATS_DEBUG +#define DEBUG_PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__) +#else /* UNIV_STATS_DEBUG */ +#define DEBUG_PRINTF(fmt, ...) /* noop */ +#endif /* UNIV_STATS_DEBUG */ + +/* Gets the number of leaf pages to sample in persistent stats estimation */ +#define N_SAMPLE_PAGES(index) \ + static_cast<ib_uint64_t>( \ + (index)->table->stats_sample_pages != 0 \ + ? (index)->table->stats_sample_pages \ + : srv_stats_persistent_sample_pages) + +/* number of distinct records on a given level that are required to stop +descending to lower levels and fetch N_SAMPLE_PAGES(index) records +from that level */ +#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10) + +/* A dynamic array where we store the boundaries of each distinct group +of keys. For example if a btree level is: +index: 0,1,2,3,4,5,6,7,8,9,10,11,12 +data: b,b,b,b,b,b,g,g,j,j,j, x, y +then we would store 5,7,10,11,12 in the array. */ +typedef std::vector<ib_uint64_t, ut_allocator<ib_uint64_t> > boundaries_t; + +/** Allocator type used for index_map_t. */ +typedef ut_allocator<std::pair<const char* const, dict_index_t*> > + index_map_t_allocator; + +/** Auxiliary map used for sorting indexes by name in dict_stats_save(). 
*/ +typedef std::map<const char*, dict_index_t*, ut_strcmp_functor, + index_map_t_allocator> index_map_t; + +bool dict_table_t::is_stats_table() const +{ + return !strcmp(name.m_name, TABLE_STATS_NAME) || + !strcmp(name.m_name, INDEX_STATS_NAME); +} + +bool trx_t::has_stats_table_lock() const +{ + for (const lock_t *l : lock.table_locks) + if (l && l->un_member.tab_lock.table->is_stats_table()) + return true; + return false; +} + +/*********************************************************************//** +Checks whether an index should be ignored in stats manipulations: +* stats fetch +* stats recalc +* stats save +@return true if exists and all tables are ok */ +UNIV_INLINE +bool +dict_stats_should_ignore_index( +/*===========================*/ + const dict_index_t* index) /*!< in: index */ +{ + return !index->is_btree() || index->to_be_dropped || !index->is_committed(); +} + + +/** expected column definition */ +struct dict_col_meta_t +{ + /** column name */ + const char *name; + /** main type */ + unsigned mtype; + /** prtype mask; all these bits have to be set in prtype */ + unsigned prtype_mask; + /** column length in bytes */ + unsigned len; +}; + +/** For checking whether a table exists and has a predefined schema */ +struct dict_table_schema_t +{ + /** table name */ + span<const char> table_name; + /** table name in SQL */ + const char *table_name_sql; + /** number of columns */ + unsigned n_cols; + /** columns */ + const dict_col_meta_t columns[8]; +}; + +static const dict_table_schema_t table_stats_schema = +{ + {C_STRING_WITH_LEN(TABLE_STATS_NAME)}, TABLE_STATS_NAME_PRINT, 6, + { + {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, + {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597}, + /* + Don't check the DATA_UNSIGNED flag in last_update. + It presents if the server is running in a pure MariaDB installation, + because MariaDB's Field_timestampf::flags has UNSIGNED_FLAG. 
+ But DATA_UNSIGNED misses when the server starts on a MySQL-5.7 directory + (during a migration), because MySQL's Field_timestampf::flags does not + have UNSIGNED_FLAG. + This is fine not to check DATA_UNSIGNED, because Field_timestampf + in both MariaDB and MySQL support only non-negative time_t values. + */ + {"last_update", DATA_INT, DATA_NOT_NULL, 4}, + {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, + {"clustered_index_size", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, + {"sum_of_other_index_sizes", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, + } +}; + +static const dict_table_schema_t index_stats_schema = +{ + {C_STRING_WITH_LEN(INDEX_STATS_NAME)}, INDEX_STATS_NAME_PRINT, 8, + { + {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, + {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597}, + {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, + /* + Don't check the DATA_UNSIGNED flag in last_update. + See comments about last_update in table_stats_schema above. + */ + {"last_update", DATA_INT, DATA_NOT_NULL, 4}, + {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3}, + {"stat_value", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8}, + {"sample_size", DATA_INT, DATA_UNSIGNED, 8}, + {"stat_description", DATA_VARMYSQL, DATA_NOT_NULL, 1024*3} + } +}; + +/** Construct the type's SQL name (e.g. 
BIGINT UNSIGNED) +@param mtype InnoDB main type +@param prtype InnoDB precise type +@param len length of the column +@param name the SQL name +@param name_sz size of the name buffer +@return number of bytes written (excluding the terminating NUL byte) */ +static int dtype_sql_name(unsigned mtype, unsigned prtype, unsigned len, + char *name, size_t name_sz) +{ + const char *Unsigned= ""; + const char *Main= "UNKNOWN"; + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + Main= "TINYINT"; + break; + case 2: + Main= "SMALLINT"; + break; + case 3: + Main= "MEDIUMINT"; + break; + case 4: + Main= "INT"; + break; + case 8: + Main= "BIGINT"; + break; + } + + append_unsigned: + if (prtype & DATA_UNSIGNED) + Unsigned= " UNSIGNED"; + len= 0; + break; + case DATA_FLOAT: + Main= "FLOAT"; + goto append_unsigned; + case DATA_DOUBLE: + Main= "DOUBLE"; + goto append_unsigned; + case DATA_FIXBINARY: + Main= "BINARY"; + break; + case DATA_CHAR: + case DATA_MYSQL: + Main= "CHAR"; + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + Main= "VARCHAR"; + break; + case DATA_BINARY: + Main= "VARBINARY"; + break; + case DATA_GEOMETRY: + Main= "GEOMETRY"; + len= 0; + break; + case DATA_BLOB: + switch (len) { + case 9: + Main= "TINYBLOB"; + break; + case 10: + Main= "BLOB"; + break; + case 11: + Main= "MEDIUMBLOB"; + break; + case 12: + Main= "LONGBLOB"; + break; + } + len= 0; + } + + const char* Not_null= (prtype & DATA_NOT_NULL) ? " NOT NULL" : ""; + if (len) + return snprintf(name, name_sz, "%s(%u)%s%s", Main, len, Unsigned, + Not_null); + else + return snprintf(name, name_sz, "%s%s%s", Main, Unsigned, Not_null); +} + +static bool innodb_table_stats_not_found; +static bool innodb_index_stats_not_found; +static bool innodb_table_stats_not_found_reported; +static bool innodb_index_stats_not_found_reported; + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. 
+The table must have the same number of columns with the same names and +types. The order of the columns does not matter. +dict_table_schema_check() @{ +@return DB_SUCCESS if the table exists and contains the necessary columns */ +static +dberr_t +dict_table_schema_check( +/*====================*/ + const dict_table_schema_t* req_schema, /*!< in: required table + schema */ + char* errstr, /*!< out: human readable error + message if != DB_SUCCESS is + returned */ + size_t errstr_sz) /*!< in: errstr size */ +{ + const dict_table_t* table= dict_sys.load_table(req_schema->table_name); + + if (!table) { + if (opt_bootstrap) + return DB_TABLE_NOT_FOUND; + if (req_schema == &table_stats_schema) { + if (innodb_table_stats_not_found_reported) { + return DB_STATS_DO_NOT_EXIST; + } + innodb_table_stats_not_found = true; + innodb_table_stats_not_found_reported = true; + } else { + ut_ad(req_schema == &index_stats_schema); + if (innodb_index_stats_not_found_reported) { + return DB_STATS_DO_NOT_EXIST; + } + innodb_index_stats_not_found = true; + innodb_index_stats_not_found_reported = true; + } + + snprintf(errstr, errstr_sz, "Table %s not found.", + req_schema->table_name_sql); + return DB_TABLE_NOT_FOUND; + } + + if (!table->is_readable() && !table->space) { + /* missing tablespace */ + snprintf(errstr, errstr_sz, + "Tablespace for table %s is missing.", + req_schema->table_name_sql); + return DB_TABLE_NOT_FOUND; + } + + if (unsigned(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) { + /* the table has a different number of columns than required */ + snprintf(errstr, errstr_sz, + "%s has %d columns but should have %u.", + req_schema->table_name_sql, + table->n_def - DATA_N_SYS_COLS, + req_schema->n_cols); + return DB_ERROR; + } + + /* For each column from req_schema->columns[] search + whether it is present in table->cols[]. + The following algorithm is O(n_cols^2), but is optimized to + be O(n_cols) if the columns are in the same order in both arrays. 
*/ + + for (unsigned i = 0; i < req_schema->n_cols; i++) { + ulint j = dict_table_has_column( + table, req_schema->columns[i].name, i); + + if (j == table->n_def) { + snprintf(errstr, errstr_sz, + "required column %s" + " not found in table %s.", + req_schema->columns[i].name, + req_schema->table_name_sql); + + return(DB_ERROR); + } + + /* we found a column with the same name on j'th position, + compare column types and flags */ + + /* check length for exact match */ + if (req_schema->columns[i].len != table->cols[j].len) { + sql_print_warning("InnoDB: Table %s has" + " length mismatch in the" + " column name %s." + " Please run mariadb-upgrade", + req_schema->table_name_sql, + req_schema->columns[i].name); + } + + /* + check mtype for exact match. + This check is relaxed to allow use to use TIMESTAMP + (ie INT) for last_update instead of DATA_BINARY. + We have to test for both values as the innodb_table_stats + table may come from MySQL and have the old type. + */ + if (req_schema->columns[i].mtype != table->cols[j].mtype && + !(req_schema->columns[i].mtype == DATA_INT && + table->cols[j].mtype == DATA_FIXBINARY)) { + } else if ((~table->cols[j].prtype + & req_schema->columns[i].prtype_mask)) { + } else { + continue; + } + + int s = snprintf(errstr, errstr_sz, + "Column %s in table %s is ", + req_schema->columns[i].name, + req_schema->table_name_sql); + if (s < 0 || static_cast<size_t>(s) >= errstr_sz) { + return DB_ERROR; + } + errstr += s; + errstr_sz -= s; + s = dtype_sql_name(table->cols[j].mtype, table->cols[j].prtype, + table->cols[j].len, errstr, errstr_sz); + if (s < 0 || static_cast<size_t>(s) + sizeof " but should be " + >= errstr_sz) { + return DB_ERROR; + } + errstr += s; + memcpy(errstr, " but should be ", sizeof " but should be "); + errstr += (sizeof " but should be ") - 1; + errstr_sz -= s + (sizeof " but should be ") - 1; + s = dtype_sql_name(req_schema->columns[i].mtype, + req_schema->columns[i].prtype_mask, + req_schema->columns[i].len, + 
errstr, errstr_sz); + return DB_ERROR; + } + + if (size_t n_foreign = table->foreign_set.size()) { + snprintf(errstr, errstr_sz, + "Table %s has %zu foreign key(s) pointing" + " to other tables, but it must have 0.", + req_schema->table_name_sql, n_foreign); + return DB_ERROR; + } + + if (size_t n_referenced = table->referenced_set.size()) { + snprintf(errstr, errstr_sz, + "There are %zu foreign key(s) pointing to %s, " + "but there must be 0.", n_referenced, + req_schema->table_name_sql); + return DB_ERROR; + } + + return DB_SUCCESS; +} + +/*********************************************************************//** +Checks whether the persistent statistics storage exists and that all +tables have the proper structure. +@return true if exists and all tables are ok */ +static bool dict_stats_persistent_storage_check(bool dict_already_locked) +{ + char errstr[512]; + dberr_t ret; + + if (!dict_already_locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + + ut_ad(dict_sys.locked()); + + /* first check table_stats */ + ret = dict_table_schema_check(&table_stats_schema, errstr, + sizeof(errstr)); + if (ret == DB_SUCCESS) { + /* if it is ok, then check index_stats */ + ret = dict_table_schema_check(&index_stats_schema, errstr, + sizeof(errstr)); + } + + if (!dict_already_locked) { + dict_sys.unlock(); + } + + switch (ret) { + case DB_SUCCESS: + return true; + default: + if (!opt_bootstrap) { + ib::error() << errstr; + } + /* fall through */ + case DB_STATS_DO_NOT_EXIST: + return false; + } +} + +/** Executes a given SQL statement using the InnoDB internal SQL parser. +This function will free the pinfo object. 
+@param[in,out] pinfo pinfo to pass to que_eval_sql() must already +have any literals bound to it +@param[in] sql SQL string to execute +@param[in,out] trx transaction +@return DB_SUCCESS or error code */ +static +dberr_t dict_stats_exec_sql(pars_info_t *pinfo, const char* sql, trx_t *trx) +{ + ut_ad(dict_sys.locked()); + + if (!dict_stats_persistent_storage_check(true)) + { + pars_info_free(pinfo); + return DB_STATS_DO_NOT_EXIST; + } + + return que_eval_sql(pinfo, sql, trx); +} + +/*********************************************************************//** +Duplicate a table object and its indexes. +This function creates a dummy dict_table_t object and initializes the +following table and index members: +dict_table_t::id (copied) +dict_table_t::heap (newly created) +dict_table_t::name (copied) +dict_table_t::corrupted (copied) +dict_table_t::indexes<> (newly created) +dict_table_t::magic_n +for each entry in dict_table_t::indexes, the following are initialized: +(indexes that have DICT_FTS set in index->type are skipped) +dict_index_t::id (copied) +dict_index_t::name (copied) +dict_index_t::table_name (points to the copied table name) +dict_index_t::table (points to the above semi-initialized object) +dict_index_t::type (copied) +dict_index_t::to_be_dropped (copied) +dict_index_t::online_status (copied) +dict_index_t::n_uniq (copied) +dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name) +dict_index_t::indexes<> (newly created) +dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized) +dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized) +dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized) +dict_index_t::magic_n +The returned object should be freed with dict_stats_table_clone_free() +when no longer needed. 
+@return incomplete table object */ +static +dict_table_t* +dict_stats_table_clone_create( +/*==========================*/ + const dict_table_t* table) /*!< in: table whose stats to copy */ +{ + size_t heap_size; + dict_index_t* index; + + /* Estimate the size needed for the table and all of its indexes */ + + heap_size = 0; + heap_size += sizeof(dict_table_t); + heap_size += strlen(table->name.m_name) + 1; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + ulint n_uniq = dict_index_get_n_unique(index); + + heap_size += sizeof(dict_index_t); + heap_size += strlen(index->name) + 1; + heap_size += n_uniq * sizeof(index->fields[0]); + for (ulint i = 0; i < n_uniq; i++) { + heap_size += strlen(index->fields[i].name) + 1; + } + heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]); + heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]); + heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]); + } + + /* Allocate the memory and copy the members */ + + mem_heap_t* heap; + + heap = mem_heap_create(heap_size); + + dict_table_t* t; + + t = (dict_table_t*) mem_heap_zalloc(heap, sizeof(*t)); + + t->stats_mutex_init(); + + MEM_CHECK_DEFINED(&table->id, sizeof(table->id)); + t->id = table->id; + + t->heap = heap; + + t->name.m_name = mem_heap_strdup(heap, table->name.m_name); + t->mdl_name.m_name = t->name.m_name; + + t->corrupted = table->corrupted; + + UT_LIST_INIT(t->indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(t->freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + dict_index_t* idx; + + idx = (dict_index_t*) 
mem_heap_zalloc(heap, sizeof(*idx)); + + MEM_CHECK_DEFINED(&index->id, sizeof(index->id)); + idx->id = index->id; + + idx->name = mem_heap_strdup(heap, index->name); + + idx->table = t; + + idx->type = index->type; + + idx->to_be_dropped = 0; + + idx->online_status = ONLINE_INDEX_COMPLETE; + idx->set_committed(true); + + idx->n_uniq = index->n_uniq; + + idx->fields = (dict_field_t*) mem_heap_zalloc( + heap, idx->n_uniq * sizeof(idx->fields[0])); + + for (ulint i = 0; i < idx->n_uniq; i++) { + idx->fields[i].name = mem_heap_strdup( + heap, index->fields[i].name); + } + + /* hook idx into t->indexes */ + UT_LIST_ADD_LAST(t->indexes, idx); + + idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_zalloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0])); + + idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_zalloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0])); + + idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_zalloc( + heap, + idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); + ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + idx->stat_defrag_n_page_split = 0; + idx->stat_defrag_n_pages_freed = 0; + } + + ut_d(t->magic_n = DICT_TABLE_MAGIC_N); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_table_clone_create(). */ +static +void +dict_stats_table_clone_free( +/*========================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + t->stats_mutex_destroy(); + mem_heap_free(t->heap); +} + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into an index +statistics members. The resulting stats correspond to an empty index. 
*/ +static +void +dict_stats_empty_index( +/*===================*/ + dict_index_t* index, /*!< in/out: index */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ +{ + ut_ad(!(index->type & DICT_FTS)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(index->table->stats_mutex_is_owner()); + + ulint n_uniq = index->n_uniq; + + for (ulint i = 0; i < n_uniq; i++) { + index->stat_n_diff_key_vals[i] = 0; + index->stat_n_sample_sizes[i] = 1; + index->stat_n_non_null_key_vals[i] = 0; + } + + index->stat_index_size = 1; + index->stat_n_leaf_pages = 1; + + if (empty_defrag_stats) { + dict_stats_empty_defrag_stats(index); + dict_stats_empty_defrag_summary(index); + } +} + +/*********************************************************************//** +Write all zeros (or 1 where it makes sense) into a table and its indexes' +statistics members. The resulting stats correspond to an empty table. */ +static +void +dict_stats_empty_table( +/*===================*/ + dict_table_t* table, /*!< in/out: table */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ +{ + /* Initialize table/index level stats is now protected by + table level lock_mutex.*/ + table->stats_mutex_lock(); + + /* Zero the stats members */ + table->stat_n_rows = 0; + table->stat_clustered_index_size = 1; + /* 1 page for each index, not counting the clustered */ + table->stat_sum_of_other_index_sizes + = UT_LIST_GET_LEN(table->indexes) - 1; + table->stat_modified_counter = 0; + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->type & DICT_FTS) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + dict_stats_empty_index(index, empty_defrag_stats); + } + + table->stat_initialized = TRUE; + table->stats_mutex_unlock(); +} + +/*********************************************************************//** +Check whether index's stats are initialized (assert if they are not). 
*/ +static +void +dict_stats_assert_initialized_index( +/*================================*/ + const dict_index_t* index) /*!< in: index */ +{ + MEM_CHECK_DEFINED( + index->stat_n_diff_key_vals, + index->n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + + MEM_CHECK_DEFINED( + index->stat_n_sample_sizes, + index->n_uniq * sizeof(index->stat_n_sample_sizes[0])); + + MEM_CHECK_DEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + + MEM_CHECK_DEFINED( + &index->stat_index_size, + sizeof(index->stat_index_size)); + + MEM_CHECK_DEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); +} + +/*********************************************************************//** +Check whether table's stats are initialized (assert if they are not). */ +static +void +dict_stats_assert_initialized( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_a(table->stat_initialized); + + MEM_CHECK_DEFINED(&table->stats_last_recalc, + sizeof table->stats_last_recalc); + + MEM_CHECK_DEFINED(&table->stat_persistent, + sizeof table->stat_persistent); + + MEM_CHECK_DEFINED(&table->stats_auto_recalc, + sizeof table->stats_auto_recalc); + + MEM_CHECK_DEFINED(&table->stats_sample_pages, + sizeof table->stats_sample_pages); + + MEM_CHECK_DEFINED(&table->stat_n_rows, + sizeof table->stat_n_rows); + + MEM_CHECK_DEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + + MEM_CHECK_DEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + + MEM_CHECK_DEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!dict_stats_should_ignore_index(index)) { + dict_stats_assert_initialized_index(index); + } + } +} + +#define INDEX_EQ(i1, i2) \ + ((i1) != NULL \ + && (i2) != NULL \ + && 
/** Two index objects denote the same index if both are non-NULL and
agree in id and in name. Both arguments are evaluated more than once. */
#define INDEX_EQ(i1, i2) \
	((i1) != NULL \
	 && (i2) != NULL \
	 && (i1)->id == (i2)->id \
	 && strcmp((i1)->name, (i2)->name) == 0)

/*********************************************************************//**
Copy table and index statistics from one table to another, including index
stats. Extra indexes in src are ignored and extra indexes in dst are
initialized to correspond to an empty index.
Source and destination indexes are matched with INDEX_EQ(): the two index
lists are walked in parallel and, when the current pair does not match,
the source list is searched from its beginning.
The caller must hold the statistics mutex of both tables.
dst->stat_initialized is set on return. */
static
void
dict_stats_copy(
/*============*/
	dict_table_t*		dst,	/*!< in/out: destination table */
	const dict_table_t*	src,	/*!< in: source table */
	bool reset_ignored_indexes)	/*!< in: if true, set ignored indexes
					to have the same statistics as if
					the table was empty */
{
	ut_ad(src->stats_mutex_is_owner());
	ut_ad(dst->stats_mutex_is_owner());

	/* Table level members */
	dst->stats_last_recalc = src->stats_last_recalc;
	dst->stat_n_rows = src->stat_n_rows;
	dst->stat_clustered_index_size = src->stat_clustered_index_size;
	dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes;
	dst->stat_modified_counter = src->stat_modified_counter;

	dict_index_t*	dst_idx;
	dict_index_t*	src_idx;

	/* The increment expression advances src_idx only while it is
	non-NULL, so that a src list that is exhausted (or whose search
	below ended without a match) is not dereferenced. */
	for (dst_idx = dict_table_get_first_index(dst),
	     src_idx = dict_table_get_first_index(src);
	     dst_idx != NULL;
	     dst_idx = dict_table_get_next_index(dst_idx),
	     (src_idx != NULL
	      && (src_idx = dict_table_get_next_index(src_idx)))) {

		if (dict_stats_should_ignore_index(dst_idx)) {
			if (reset_ignored_indexes) {
				/* Reset index statistics for all ignored indexes,
				unless they are FT indexes (these have no statistics)*/
				if (dst_idx->type & DICT_FTS) {
					continue;
				}
				/* NOTE: there is no "continue" here, so
				after the reset the index is still matched
				against src and possibly copied below. */
				dict_stats_empty_index(dst_idx, true);
			} else {
				continue;
			}
		}

		ut_ad(!dict_index_is_ibuf(dst_idx));

		/* The lists are out of step: look for the counterpart of
		dst_idx in all of src. src_idx is NULL afterwards if there
		is none. */
		if (!INDEX_EQ(src_idx, dst_idx)) {
			for (src_idx = dict_table_get_first_index(src);
			     src_idx != NULL;
			     src_idx = dict_table_get_next_index(src_idx)) {

				if (INDEX_EQ(src_idx, dst_idx)) {
					break;
				}
			}
		}

		/* No counterpart in src: dst_idx gets "empty" stats */
		if (!INDEX_EQ(src_idx, dst_idx)) {
			dict_stats_empty_index(dst_idx, true);
			continue;
		}

		/* number of per-column array elements to copy */
		ulint	n_copy_el;

		if (dst_idx->n_uniq > src_idx->n_uniq) {
			n_copy_el = src_idx->n_uniq;
			/* Since src is smaller some elements in dst
			will remain untouched by the following memmove(),
			thus we init all of them here. */
			dict_stats_empty_index(dst_idx, true);
		} else {
			n_copy_el = dst_idx->n_uniq;
		}

		memmove(dst_idx->stat_n_diff_key_vals,
			src_idx->stat_n_diff_key_vals,
			n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0]));

		memmove(dst_idx->stat_n_sample_sizes,
			src_idx->stat_n_sample_sizes,
			n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0]));

		memmove(dst_idx->stat_n_non_null_key_vals,
			src_idx->stat_n_non_null_key_vals,
			n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0]));

		dst_idx->stat_index_size = src_idx->stat_index_size;

		dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;

		/* Defragmentation related members */
		dst_idx->stat_defrag_modified_counter =
			src_idx->stat_defrag_modified_counter;
		dst_idx->stat_defrag_n_pages_freed =
			src_idx->stat_defrag_n_pages_freed;
		dst_idx->stat_defrag_n_page_split =
			src_idx->stat_defrag_n_page_split;
	}

	dst->stat_initialized = TRUE;
}
In addition to the +members copied in dict_stats_table_clone_create() this function initializes +the following: +dict_table_t::stat_initialized +dict_table_t::stat_persistent +dict_table_t::stat_n_rows +dict_table_t::stat_clustered_index_size +dict_table_t::stat_sum_of_other_index_sizes +dict_table_t::stat_modified_counter +dict_index_t::stat_n_diff_key_vals[] +dict_index_t::stat_n_sample_sizes[] +dict_index_t::stat_n_non_null_key_vals[] +dict_index_t::stat_index_size +dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split +The returned object should be freed with dict_stats_snapshot_free() +when no longer needed. +@param[in] table table whose stats to copy +@return incomplete table object */ +static +dict_table_t* +dict_stats_snapshot_create( + dict_table_t* table) +{ + dict_sys.lock(SRW_LOCK_CALL); + + dict_stats_assert_initialized(table); + + dict_table_t* t; + + t = dict_stats_table_clone_create(table); + + table->stats_mutex_lock(); + ut_d(t->stats_mutex_lock()); + + dict_stats_copy(t, table, false); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + t->stat_persistent = table->stat_persistent; + t->stats_auto_recalc = table->stats_auto_recalc; + t->stats_sample_pages = table->stats_sample_pages; + + dict_sys.unlock(); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_snapshot_create(). */ +static +void +dict_stats_snapshot_free( +/*=====================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_stats_table_clone_free(t); +} + +/** Statistics for one field of an index. 
*/ +struct index_field_stats_t +{ + ib_uint64_t n_diff_key_vals; + ib_uint64_t n_sample_sizes; + ib_uint64_t n_non_null_key_vals; + + index_field_stats_t(ib_uint64_t n_diff_key_vals= 0, + ib_uint64_t n_sample_sizes= 0, + ib_uint64_t n_non_null_key_vals= 0) + : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes), + n_non_null_key_vals(n_non_null_key_vals) + { + } + + bool is_bulk_operation() const + { + return n_diff_key_vals == UINT64_MAX && + n_sample_sizes == UINT64_MAX && n_non_null_key_vals == UINT64_MAX; + } +}; + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */ +static +void +btr_record_not_null_field_in_rec( +/*=============================*/ + ulint n_unique, /*!< in: dict_index_get_n_unique(index), + number of columns uniquely determine + an index entry */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + its size could be for all fields or + that of "n_unique" */ + ib_uint64_t* n_not_null) /*!< in/out: array to record number of + not null rows for n-column prefix */ +{ + ulint i; + + ut_ad(rec_offs_n_fields(offsets) >= n_unique); + + if (n_not_null == NULL) { + return; + } + + for (i = 0; i < n_unique; i++) { + if (rec_offs_nth_sql_null(offsets, i)) { + break; + } + + n_not_null[i]++; + } +} + +inline dberr_t +btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr) +{ + ut_ad(!index()->is_spatial()); + ut_ad(!mtr.get_savepoint()); + + mtr_s_lock_index(index(), &mtr); + + if (index()->page == FIL_NULL) + return DB_CORRUPTION; + + dberr_t err; + auto offset= index()->page; + bool merge= false; + ulint height= ULINT_UNDEFINED; + + while (buf_block_t *block= + btr_block_get(*index(), offset, RW_S_LATCH, merge, &mtr, 
&err)) + { + page_cur.block= block; + + if (height == ULINT_UNDEFINED) + { + height= btr_page_get_level(block->page.frame); + if (height > BTR_MAX_LEVELS) + return DB_CORRUPTION; + + if (height == 0) + goto got_leaf; + } + + if (height == 0) + { + mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1); + got_leaf: + page_cur.rec= page_get_infimum_rec(block->page.frame); + return DB_SUCCESS; + } + + if (!--height) + merge= !index()->is_clust(); + + page_cur_open_on_rnd_user_rec(&page_cur); + + offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0, + ULINT_UNDEFINED, &heap); + + /* Go to the child node */ + offset= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); + } + + return err; +} + +/** Estimated table level stats from sampled value. +@param value sampled stats +@param index index being sampled +@param sample number of sampled rows +@param ext_size external stored data size +@param not_empty table not empty +@return estimated table wide stats from sampled value */ +#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \ + (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \ + + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size))) + +/** Estimates the number of different key values in a given index, for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed +0..n_uniq-1) and the number of pages that were sampled is saved in +result.n_sample_sizes[]. +If innodb_stats_method is nulls_ignored, we also record the number of +non-null values for each prefix and stored the estimates in +array result.n_non_null_key_vals. +@param index B-tree index +@param bulk_trx_id the value of index->table->bulk_trx_id at the start +@return vector with statistics information +empty vector if the index is unavailable. 
*/ +static +std::vector<index_field_stats_t> +btr_estimate_number_of_different_key_vals(dict_index_t* index, + trx_id_t bulk_trx_id) +{ + page_t* page; + rec_t* rec; + ulint n_cols; + ib_uint64_t* n_diff; + ib_uint64_t* n_not_null; + ibool stats_null_not_equal; + uintmax_t n_sample_pages=1; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + uintmax_t add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + rec_offs* offsets_rec = NULL; + rec_offs* offsets_next_rec = NULL; + + std::vector<index_field_stats_t> result; + + ut_ad(index->is_btree()); + + n_cols = dict_index_get_n_unique(index); + + heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null) + * n_cols + + dict_index_get_n_fields(index) + * (sizeof *offsets_rec + + sizeof *offsets_next_rec)); + + n_diff = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof(n_diff[0])); + + n_not_null = NULL; + + /* Check srv_innodb_stats_method setting, and decide whether we + need to record non-null value and also decide if NULL is + considered equal (by setting stats_null_not_equal value) */ + switch (srv_innodb_stats_method) { + case SRV_STATS_NULLS_IGNORED: + n_not_null = (ib_uint64_t*) mem_heap_zalloc( + heap, n_cols * sizeof *n_not_null); + /* fall through */ + + case SRV_STATS_NULLS_UNEQUAL: + /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL + case, we will treat NULLs as unequal value */ + stats_null_not_equal = TRUE; + break; + + case SRV_STATS_NULLS_EQUAL: + stats_null_not_equal = FALSE; + break; + + default: + ut_error; + } + + if (srv_stats_sample_traditional) { + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_transient_sample_pages > index->stat_index_size) { + if (index->stat_index_size > 0) { + n_sample_pages = index->stat_index_size; + } + } else { + n_sample_pages = srv_stats_transient_sample_pages; + } + } else { + /* New logaritmic number of pages that are 
estimated. + Number of pages estimated should be between 1 and + index->stat_index_size. + + If we have only 0 or 1 index pages then we can only take 1 + sample. We have already initialized n_sample_pages to 1. + + So taking index size as I and sample as S and log(I)*S as L + + requirement 1) we want the out limit of the expression to not exceed I; + requirement 2) we want the ideal pages to be at least S; + so the current expression is min(I, max( min(S,I), L) + + looking for simplifications: + + case 1: assume S < I + min(I, max( min(S,I), L) -> min(I , max( S, L)) + + but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L. + + so we have: min(I , L) + + case 2: assume I < S + min(I, max( min(S,I), L) -> min(I, max( I, L)) + + case 2a: L > I + min(I, max( I, L)) -> min(I, L) -> I + + case 2b: when L < I + min(I, max( I, L)) -> min(I, I ) -> I + + so taking all case2 paths is I, our expression is: + n_pages = S < I? min(I,L) : I + */ + if (index->stat_index_size > 1) { + n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) + ? ut_min(index->stat_index_size, + static_cast<ulint>( + log2(double(index->stat_index_size)) + * double(srv_stats_transient_sample_pages))) + : index->stat_index_size; + } + } + + /* Sanity check */ + ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size)); + + /* We sample some pages in the index to get an estimate */ + btr_cur_t cursor; + cursor.page_cur.index = index; + + for (ulint i = 0; i < n_sample_pages; i++) { + mtr.start(); + + if (cursor.open_random_leaf(offsets_rec, heap, mtr) != + DB_SUCCESS + || index->table->bulk_trx_id != bulk_trx_id) { + mtr.commit(); + goto exit_loop; + } + + /* Count the number of different key values for each prefix of + the key on this index page. 
If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. */ + + page = btr_cur_get_page(&cursor); + + rec = page_rec_get_next(cursor.page_cur.rec); + const ulint n_core = index->n_core_fields; + + if (rec && !page_rec_is_supremum(rec)) { + not_empty_flag = 1; + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + n_core, + ULINT_UNDEFINED, &heap); + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_rec, n_not_null); + } + } + + while (!page_rec_is_supremum(rec)) { + ulint matched_fields; + rec_t* next_rec = page_rec_get_next(rec); + if (!next_rec || page_rec_is_supremum(next_rec)) { + total_external_size += + btr_rec_get_externally_stored_len( + rec, offsets_rec); + break; + } + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + n_core, + ULINT_UNDEFINED, + &heap); + + cmp_rec_rec(rec, next_rec, + offsets_rec, offsets_next_rec, + index, stats_null_not_equal, + &matched_fields); + + for (ulint j = matched_fields; j < n_cols; j++) { + /* We add one if this index record has + a different prefix from the previous */ + + n_diff[j]++; + } + + if (n_not_null != NULL) { + btr_record_not_null_field_in_rec( + n_cols, offsets_next_rec, n_not_null); + } + + total_external_size + += btr_rec_get_externally_stored_len( + rec, offsets_rec); + + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. 
*/ + { + rec_offs* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } + } + + if (n_cols == dict_index_get_n_unique_in_tree(index) + && page_has_siblings(page)) { + + /* If there is more than one leaf page in the tree, + we add one because we know that the first record + on the page certainly had a different prefix than the + last record on the previous index page in the + alphabetical order. Before this fix, if there was + just one big record on each clustered index page, the + algorithm grossly underestimated the number of rows + in the table. */ + + n_diff[n_cols - 1]++; + } + + mtr.commit(); + } + +exit_loop: + /* If we saw k borders between different key values on + n_sample_pages leaf pages, we can estimate how many + there will be in index->stat_n_leaf_pages */ + + /* We must take into account that our sample actually represents + also the pages used for external storage of fields (those pages are + included in index->stat_n_leaf_pages) */ + + result.reserve(n_cols); + + for (ulint j = 0; j < n_cols; j++) { + index_field_stats_t stat; + + stat.n_diff_key_vals + = BTR_TABLE_STATS_FROM_SAMPLE( + n_diff[j], index, n_sample_pages, + total_external_size, not_empty_flag); + + /* If the tree is small, smaller than + 10 * n_sample_pages + total_external_size, then + the above estimate is ok. For bigger trees it is common that we + do not see any borders between key values in the few pages + we pick. But still there may be n_sample_pages + different key values, or even more. 
Let us try to approximate + that: */ + + add_on = index->stat_n_leaf_pages + / (10 * (n_sample_pages + + total_external_size)); + + if (add_on > n_sample_pages) { + add_on = n_sample_pages; + } + + stat.n_diff_key_vals += add_on; + + stat.n_sample_sizes = n_sample_pages; + + if (n_not_null != NULL) { + stat.n_non_null_key_vals = + BTR_TABLE_STATS_FROM_SAMPLE( + n_not_null[j], index, n_sample_pages, + total_external_size, not_empty_flag); + } + + result.push_back(stat); + } + + mem_heap_free(heap); + return result; +} + +/*********************************************************************//** +Calculates new estimates for index statistics. This function is +relatively quick and is used to calculate transient statistics that +are not saved on disk. This was the only way to calculate statistics +before the Persistent Statistics feature was introduced. +This function doesn't update the defragmentation related stats. +Only persistent statistics supports defragmentation stats. +@return error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ +static +dberr_t +dict_stats_update_transient_for_index( +/*==================================*/ + dict_index_t* index) /*!< in/out: index */ +{ + dberr_t err = DB_SUCCESS; + if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO + && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO + || !dict_index_is_clust(index))) { + /* If we have set a high innodb_force_recovery + level, do not calculate statistics, as a badly + corrupted index can cause a crash in it. + Initialize some bogus index cardinality + statistics, so that the data can be queried in + various means, also via secondary indexes. 
*/ +dummy_empty: + index->table->stats_mutex_lock(); + dict_stats_empty_index(index, false); + index->table->stats_mutex_unlock(); + return err; +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + } else if (ibuf_debug && !dict_index_is_clust(index)) { + goto dummy_empty; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + } else if (dict_index_is_online_ddl(index) || !index->is_committed() + || !index->table->space) { + goto dummy_empty; + } else { + mtr_t mtr; + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, + &mtr, &err); + if (!root) { +invalid: + mtr.commit(); + goto dummy_empty; + } + + const auto bulk_trx_id = index->table->bulk_trx_id; + if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { + err= DB_SUCCESS_LOCKED_REC; + goto invalid; + } + + mtr.x_lock_space(index->table->space); + + ulint dummy, size; + index->stat_index_size + = fseg_n_reserved_pages(*root, PAGE_HEADER + + PAGE_BTR_SEG_LEAF + + root->page.frame, &size, + &mtr) + + fseg_n_reserved_pages(*root, PAGE_HEADER + + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, + &mtr); + + mtr.commit(); + + index->stat_n_leaf_pages = size ? size : 1; + + /* Do not continue if table decryption has failed or + table is already marked as corrupted. */ + if (index->is_readable()) { + std::vector<index_field_stats_t> stats + = btr_estimate_number_of_different_key_vals( + index, bulk_trx_id); + + if (!stats.empty()) { + index->table->stats_mutex_lock(); + for (size_t i = 0; i < stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats[i].n_non_null_key_vals; + } + index->table->stats_mutex_unlock(); + } + } + } + + return err; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. 
This function +is relatively quick and is used to calculate transient statistics that +are not saved on disk. +This was the only way to calculate statistics before the +Persistent Statistics feature was introduced. +@return error code +@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ +static +dberr_t +dict_stats_update_transient( +/*========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + dict_index_t* index; + ulint sum_of_index_sizes = 0; + dberr_t err = DB_SUCCESS; + + /* Find out the sizes of the indexes and how many different values + for the key they approximately have */ + + index = dict_table_get_first_index(table); + + if (!table->space) { + /* Nothing to do. */ +empty_table: + dict_stats_empty_table(table, true); + return err; + } else if (index == NULL) { + /* Table definition is corrupt */ + + ib::warn() << "Table " << table->name + << " has no indexes. Cannot calculate statistics."; + goto empty_table; + } + + for (; index != NULL; index = dict_table_get_next_index(index)) { + + ut_ad(!dict_index_is_ibuf(index)); + + if (!index->is_btree()) { + continue; + } + + if (dict_stats_should_ignore_index(index) + || !index->is_readable() + || err == DB_SUCCESS_LOCKED_REC) { + index->table->stats_mutex_lock(); + dict_stats_empty_index(index, false); + index->table->stats_mutex_unlock(); + continue; + } + + err = dict_stats_update_transient_for_index(index); + + sum_of_index_sizes += index->stat_index_size; + } + + table->stats_mutex_lock(); + + index = dict_table_get_first_index(table); + + table->stat_n_rows = index->stat_n_diff_key_vals[ + dict_index_get_n_unique(index) - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + table->stat_sum_of_other_index_sizes = sum_of_index_sizes + - index->stat_index_size; + + table->stats_last_recalc = time(NULL); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; + + table->stats_mutex_unlock(); 
+ + return err; +} + +/** Open a cursor at the first page in a tree level. +@param page_cur cursor +@param level level to search for (0=leaf) +@param mtr mini-transaction +@return error code; DB_CORRUPTION if the root is below the requested level, +a visited page has an unexpected level or a left sibling, or +page_cur_move_to_next() fails */ +static dberr_t page_cur_open_level(page_cur_t *page_cur, ulint level, + mtr_t *mtr) +{ + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + dberr_t err; + + dict_index_t *const index= page_cur->index; + + rec_offs_init(offsets_); + ut_ad(level != ULINT_UNDEFINED); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_SX_LOCK)); + ut_ad(mtr->get_savepoint() == 1); + + uint32_t page= index->page; + + /* height stays ULINT_UNDEFINED until the root page has been read; + after that it is decremented once per level that we descend */ + for (ulint height = ULINT_UNDEFINED;; height--) + { + buf_block_t* block= + btr_block_get(*index, page, RW_S_LATCH, + !height && !index->is_clust(), mtr, &err); + if (!block) + break; + + const uint32_t l= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + ut_ad(!heap); + /* We are in the root node */ + height= l; + if (UNIV_UNLIKELY(height < level)) + return DB_CORRUPTION; + } + else if (UNIV_UNLIKELY(height != l) || page_has_prev(block->page.frame)) + { + err= DB_CORRUPTION; + break; + } + + page_cur_set_before_first(block, page_cur); + + if (height == level) + break; + + ut_ad(height); + + /* Step onto the first record of the page and follow its + child page number to the next lower level */ + if (!page_cur_move_to_next(page_cur)) + { + err= DB_CORRUPTION; + break; + } + + offsets= rec_get_offsets(page_cur->rec, index, offsets, 0, ULINT_UNDEFINED, + &heap); + page= btr_node_ptr_get_child_page_no(page_cur->rec, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + /* Release all page latches except the one on the desired page. */ + const auto end= mtr->get_savepoint(); + if (end > 1) + mtr->rollback_to_savepoint(1, end - 1); + + return err; +} + +/** Open a cursor at the first page in a tree level. 
+@param pcur cursor to position +@param level level to search for (0=leaf) +@param mtr mini-transaction +@param index index tree +@return error code returned by page_cur_open_level() */ +static dberr_t btr_pcur_open_level(btr_pcur_t *pcur, ulint level, mtr_t *mtr, + dict_index_t *index) +{ + pcur->latch_mode= BTR_SEARCH_LEAF; + pcur->search_mode= PAGE_CUR_G; + pcur->pos_state= BTR_PCUR_IS_POSITIONED; + pcur->btr_cur.page_cur.index= index; + return page_cur_open_level(&pcur->btr_cur.page_cur, level, mtr); +} + + +/* @{ Pseudo code about the relation between the following functions + +let N = N_SAMPLE_PAGES(index) + +dict_stats_analyze_index() + for each n_prefix + search for good enough level: + dict_stats_analyze_index_level() // only called if level has <= N pages + // full scan of the level in one mtr + collect statistics about the given level + if we are not satisfied with the level, search next lower level + we have found a good enough level here + dict_stats_analyze_index_for_n_prefix(that level, stats collected above) + // full scan of the level in one mtr + dive below some records and analyze the leaf page there: + dict_stats_analyze_index_below_cur() +@} */ + +/*********************************************************************//** +Find the total number and the number of distinct keys on a given level in +an index. Each of the 1..n_uniq prefixes are looked up and the results are +saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of +records on the level is saved in total_recs. +Also, the index of the last record in each group of equal records is saved +in n_diff_boundaries[0..n_uniq - 1], records indexing starts from the leftmost +record on the level and continues cross pages boundaries, counting from 0. 
*/ +static +void +dict_stats_analyze_index_level( +/*===========================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level */ + ib_uint64_t* n_diff, /*!< out: array for number of + distinct keys for all prefixes */ + ib_uint64_t* total_recs, /*!< out: total number of records */ + ib_uint64_t* total_pages, /*!< out: total number of pages */ + boundaries_t* n_diff_boundaries,/*!< out: boundaries of the groups + of distinct keys */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint n_uniq; + mem_heap_t* heap; + btr_pcur_t pcur; + const page_t* page; + const rec_t* rec; + const rec_t* prev_rec; + bool prev_rec_is_copied; + byte* prev_rec_buf = NULL; + ulint prev_rec_buf_size = 0; + rec_offs* rec_offsets; + rec_offs* prev_rec_offsets; + ulint i; + + DEBUG_PRINTF(" %s(table=%s, index=%s, level=" ULINTPF ")\n", + __func__, index->table->name, index->name, level); + + *total_recs = 0; + *total_pages = 0; + + n_uniq = dict_index_get_n_unique(index); + + /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */ + memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0])); + + /* Allocate space for the offsets header (the allocation size at + offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_uniq + 1, + so that this will never be less than the size calculated in + rec_get_offsets_func(). 
*/ + i = (REC_OFFS_HEADER_SIZE + 1 + 1) + n_uniq; + + heap = mem_heap_create((2 * sizeof *rec_offsets) * i); + rec_offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof *rec_offsets)); + prev_rec_offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, i * sizeof *prev_rec_offsets)); + rec_offs_set_n_alloc(rec_offsets, i); + rec_offs_set_n_alloc(prev_rec_offsets, i); + + /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */ + if (n_diff_boundaries != NULL) { + for (i = 0; i < n_uniq; i++) { + n_diff_boundaries[i].erase( + n_diff_boundaries[i].begin(), + n_diff_boundaries[i].end()); + } + } + + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ + + if (btr_pcur_open_level(&pcur, level, mtr, index) != DB_SUCCESS + || !btr_pcur_move_to_next_on_page(&pcur)) { + goto func_exit; + } + + page = btr_pcur_get_page(&pcur); + + /* The page must not be empty, except when + it is the root page (and the whole index is empty). */ + ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page)); + + prev_rec = NULL; + prev_rec_is_copied = false; + + if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + btr_pcur_get_rec(&pcur), page_is_comp(page))) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + if (level == 0) { + /* Skip the metadata pseudo-record */ + ut_ad(index->is_instant()); + btr_pcur_move_to_next_user_rec(&pcur, mtr); + } + } else if (UNIV_UNLIKELY(level != 0)) { + /* The first record on the leftmost page must be + marked as such on each level except the leaf level. 
*/ + goto func_exit; + } + + /* iterate over all user records on this level + and compare each two adjacent ones, even the last on page + X and the first on page X+1 */ + for (; + btr_pcur_is_on_user_rec(&pcur); + btr_pcur_move_to_next_user_rec(&pcur, mtr)) { + + bool rec_is_last_on_page; + + rec = btr_pcur_get_rec(&pcur); + + /* If rec and prev_rec are on different pages, then prev_rec + must have been copied, because we hold latch only on the page + where rec resides. */ + if (prev_rec != NULL + && page_align(rec) != page_align(prev_rec)) { + + ut_a(prev_rec_is_copied); + } + + rec_is_last_on_page = + page_rec_is_supremum(page_rec_get_next_const(rec)); + + /* increment the pages counter at the end of each page */ + if (rec_is_last_on_page) { + + (*total_pages)++; + } + + /* Skip delete-marked records on the leaf level. If we + do not skip them, then ANALYZE quickly after DELETE + could count them or not (purge may have already wiped + them away) which brings non-determinism. We skip only + leaf-level delete marks because delete marks on + non-leaf level do not make sense. */ + + if (level == 0 + && !srv_stats_include_delete_marked + && rec_get_deleted_flag(rec, page_rec_is_comp(rec))) { + if (rec_is_last_on_page + && !prev_rec_is_copied + && prev_rec != NULL) { + /* copy prev_rec: the skipped record is the + last one on its page, so the cursor moves to + the next page next, and prev_rec must not keep + pointing into the page we are leaving */ + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + index->n_core_fields, + n_uniq, &heap); + + prev_rec = rec_copy_prefix_to_buf( + prev_rec, index, n_uniq, + &prev_rec_buf, &prev_rec_buf_size); + + prev_rec_is_copied = true; + } + + continue; + } + rec_offsets = rec_get_offsets(rec, index, rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); + + (*total_recs)++; + + if (prev_rec != NULL) { + ulint matched_fields; + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + level ? 
0 : index->n_core_fields, + n_uniq, &heap); + + cmp_rec_rec(prev_rec, rec, + prev_rec_offsets, rec_offsets, index, + false, &matched_fields); + + for (i = matched_fields; i < n_uniq; i++) { + + if (n_diff_boundaries != NULL) { + /* push the index of the previous + record, that is - the last one from + a group of equal keys */ + + ib_uint64_t idx; + + /* the index of the current record + is total_recs - 1, the index of the + previous record is total_recs - 2; + we know that idx is not going to + become negative here because if we + are in this branch then there is a + previous record and thus + total_recs >= 2 */ + idx = *total_recs - 2; + + n_diff_boundaries[i].push_back(idx); + } + + /* increment the number of different keys + for n_prefix=i+1 (e.g. if i=0 then we increment + for n_prefix=1 which is stored in n_diff[0]) */ + n_diff[i]++; + } + } else { + /* this is the first non-delete marked record */ + for (i = 0; i < n_uniq; i++) { + n_diff[i] = 1; + } + } + + if (rec_is_last_on_page) { + /* end of a page has been reached */ + + /* we need to copy the record instead of assigning + like prev_rec = rec; because when we traverse the + records on this level at some point we will jump from + one page to the next and then rec and prev_rec will + be on different pages and + btr_pcur_move_to_next_user_rec() will release the + latch on the page that prev_rec is on */ + prev_rec = rec_copy_prefix_to_buf( + rec, index, n_uniq, + &prev_rec_buf, &prev_rec_buf_size); + prev_rec_is_copied = true; + + } else { + /* still on the same page, the next call to + btr_pcur_move_to_next_user_rec() will not jump + on the next page, we can simply assign pointers + instead of copying the records like above */ + + prev_rec = rec; + prev_rec_is_copied = false; + } + } + + /* if *total_pages is left untouched then the above loop was not + entered at all and there is one page in the whole tree which is + empty or the loop was entered but this is level 0, contains one page + and all records are 
delete-marked */ + if (*total_pages == 0) { + + ut_ad(level == 0); + ut_ad(*total_recs == 0); + + *total_pages = 1; + } + + /* if there are records on this level and boundaries + should be saved */ + if (*total_recs > 0 && n_diff_boundaries != NULL) { + + /* remember the index of the last record on the level as the + last one from the last group of equal keys; this holds for + all possible prefixes */ + for (i = 0; i < n_uniq; i++) { + ib_uint64_t idx; + + idx = *total_recs - 1; + + n_diff_boundaries[i].push_back(idx); + } + } + + /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers, + for i=0..n_uniq-1 */ + +#ifdef UNIV_STATS_DEBUG + for (i = 0; i < n_uniq; i++) { + + DEBUG_PRINTF(" %s(): total recs: " UINT64PF + ", total pages: " UINT64PF + ", n_diff[" ULINTPF "]: " UINT64PF "\n", + __func__, *total_recs, + *total_pages, + i, n_diff[i]); + +#if 0 + if (n_diff_boundaries != NULL) { + ib_uint64_t j; + + DEBUG_PRINTF(" %s(): boundaries[%lu]: ", + __func__, i); + + for (j = 0; j < n_diff[i]; j++) { + ib_uint64_t idx; + + idx = n_diff_boundaries[i][j]; + + DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ", + j, idx); + } + DEBUG_PRINTF("\n"); + } +#endif + } +#endif /* UNIV_STATS_DEBUG */ + +func_exit: + ut_free(prev_rec_buf); + mem_heap_free(heap); +} + + +/************************************************************//** +Gets the pointer to the next non delete-marked record on the page. +If all subsequent records are delete-marked, then this function +will return the supremum record. +@return pointer to next non delete-marked record or pointer to supremum */ +static +const rec_t* +page_rec_get_next_non_del_marked( +/*=============================*/ + const rec_t* rec) /*!< in: pointer to record */ +{ + const page_t *const page= page_align(rec); + + if (page_is_comp(page)) + { + for (rec= page_rec_get_next_low(rec, TRUE); + rec && rec_get_deleted_flag(rec, TRUE); + rec= page_rec_get_next_low(rec, TRUE)); + return rec ? 
rec : page + PAGE_NEW_SUPREMUM; + } + else + { + for (rec= page_rec_get_next_low(rec, FALSE); + rec && rec_get_deleted_flag(rec, FALSE); + rec= page_rec_get_next_low(rec, FALSE)); + return rec ? rec : page + PAGE_OLD_SUPREMUM; + } +} + +/** Scan a page, reading records from left to right and counting the number +of distinct records (looking only at the first n_prefix +columns) and the number of external pages pointed by records from this page. +If scan_method is QUIT_ON_FIRST_NON_BORING then the function +will return as soon as it finds a record that does not match its neighbor +to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the +returned n_diff can either be 0 (empty page), 1 (the whole page has all keys +equal) or 2 (the function found a non-boring record and returned). +@param[out] out_rec record, or NULL +@param[out] offsets1 rec_get_offsets() working space (must +be big enough) +@param[out] offsets2 rec_get_offsets() working space (must +be big enough) +@param[in] index index of the page +@param[in] page the page to scan +@param[in] n_prefix look at the first n_prefix columns +@param[in] n_core 0, or index->n_core_fields for leaf +@param[out] n_diff number of distinct records encountered +@param[out] n_external_pages if this is non-NULL then it will be set +to the number of externally stored pages which were encountered +@return offsets1 or offsets2 (the offsets of *out_rec), +or NULL if the page is empty and does not contain user records. */ +UNIV_INLINE +rec_offs* +dict_stats_scan_page( + const rec_t** out_rec, + rec_offs* offsets1, + rec_offs* offsets2, + const dict_index_t* index, + const page_t* page, + ulint n_prefix, + ulint n_core, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) +{ + rec_offs* offsets_rec = offsets1; + rec_offs* offsets_next_rec = offsets2; + const rec_t* rec; + const rec_t* next_rec; + /* A dummy heap, to be passed to rec_get_offsets(). 
+ Because offsets1,offsets2 should be big enough, + this memory heap should never be used. */ + mem_heap_t* heap = NULL; + ut_ad(!!n_core == page_is_leaf(page)); + const rec_t* (*get_next)(const rec_t*) + = !n_core || srv_stats_include_delete_marked + ? page_rec_get_next_const + : page_rec_get_next_non_del_marked; + + const bool should_count_external_pages = n_external_pages != NULL; + + if (should_count_external_pages) { + *n_external_pages = 0; + } + + rec = get_next(page_get_infimum_rec(page)); + + if (!rec || page_rec_is_supremum(rec)) { + /* the page is empty or contains only delete-marked records */ + *n_diff = 0; + *out_rec = NULL; + return(NULL); + } + + offsets_rec = rec_get_offsets(rec, index, offsets_rec, n_core, + ULINT_UNDEFINED, &heap); + + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + + next_rec = get_next(rec); + + *n_diff = 1; + + while (next_rec && !page_rec_is_supremum(next_rec)) { + + ulint matched_fields; + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, n_core, + ULINT_UNDEFINED, + &heap); + + /* check whether rec != next_rec when looking at + the first n_prefix fields */ + cmp_rec_rec(rec, next_rec, offsets_rec, offsets_next_rec, + index, false, &matched_fields); + + if (matched_fields < n_prefix) { + /* rec != next_rec, => rec is non-boring */ + + (*n_diff)++; + + if (!n_core) { + break; + } + } + + rec = next_rec; + /* Assign offsets_rec = offsets_next_rec so that + offsets_rec matches with rec which was just assigned + rec = next_rec above. Also need to point + offsets_next_rec to the place where offsets_rec was + pointing before because we have just 2 placeholders + where data is actually stored: offsets1 and offsets2 + and we are using them in circular fashion + (offsets[_next]_rec are just pointers to those + placeholders). 
*/ + std::swap(offsets_rec, offsets_next_rec); + + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + + next_rec = get_next(next_rec); + } + + /* offsets1,offsets2 should have been big enough */ + ut_a(heap == NULL); + *out_rec = rec; + return(offsets_rec); +} + +/** Dive below the current position of a cursor and calculate the number of +distinct records on the leaf page, when looking at the first n_prefix +columns. Also calculate the number of external pages pointed by records +on the leaf page. +The results are passed back through n_diff and n_external_pages; both are +set to 0 if the page below the cursor cannot be fetched. +@param[in] cur cursor +@param[in] n_prefix look at the first n_prefix columns +when comparing records +@param[out] n_diff number of distinct records +@param[out] n_external_pages number of external pages */ +static +void +dict_stats_analyze_index_below_cur( + const btr_cur_t* cur, + ulint n_prefix, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) +{ + dict_index_t* index; + buf_block_t* block; + const page_t* page; + mem_heap_t* heap; + const rec_t* rec; + rec_offs* offsets1; + rec_offs* offsets2; + rec_offs* offsets_rec; + ulint size; + mtr_t mtr; + + index = btr_cur_get_index(cur); + + /* Allocate offsets for the record and the node pointer, for + node pointer records. In a secondary index, the node pointer + record will consist of all index fields followed by a child + page number. + Allocate space for the offsets header (the allocation size at + offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1, + so that this will never be less than the size calculated in + rec_get_offsets_func(). 
*/ + size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index); + + heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2)); + + offsets1 = static_cast<rec_offs*>(mem_heap_alloc( + heap, size * sizeof *offsets1)); + + offsets2 = static_cast<rec_offs*>(mem_heap_alloc( + heap, size * sizeof *offsets2)); + + rec_offs_set_n_alloc(offsets1, size); + rec_offs_set_n_alloc(offsets2, size); + + rec = btr_cur_get_rec(cur); + page = page_align(rec); + ut_ad(!page_rec_is_leaf(rec)); + + offsets_rec = rec_get_offsets(rec, index, offsets1, 0, + ULINT_UNDEFINED, &heap); + + page_id_t page_id(index->table->space_id, + btr_node_ptr_get_child_page_no( + rec, offsets_rec)); + const ulint zip_size = index->table->space->zip_size(); + + /* assume no distinct records and no external pages by default - in + case we quit from this function without analyzing any leaf pages. + Without this, *n_diff would be left unassigned when the very first + buf_page_get_gen() below fails and we jump to func_exit, and the + caller would then read an indeterminate value. */ + *n_diff = 0; + *n_external_pages = 0; + + mtr_start(&mtr); + + /* descend to the leaf level on the B-tree */ + for (;;) { + dberr_t err; + + block = buf_page_get_gen(page_id, zip_size, + RW_S_LATCH, NULL, BUF_GET, + &mtr, &err, + !index->is_clust() + && 1 == btr_page_get_level(page)); + if (!block) { + goto func_exit; + } + + page = block->page.frame; + + if (page_is_leaf(page)) { + /* leaf level */ + break; + } + /* else */ + + /* search for the first non-boring record on the page */ + offsets_rec = dict_stats_scan_page( + &rec, offsets1, offsets2, index, page, n_prefix, + 0, n_diff, NULL); + + /* pages on level > 0 are not allowed to be empty */ + ut_a(offsets_rec != NULL); + /* if page is not empty (offsets_rec != NULL) then n_diff must + be > 0, otherwise there is a bug in dict_stats_scan_page() */ + ut_a(*n_diff > 0); + + if (*n_diff == 1) { + mtr_commit(&mtr); + + /* page has all keys equal and the end of the page + was reached by dict_stats_scan_page(), no need to + descend to the leaf level */ + mem_heap_free(heap); + /* can't get an estimate for n_external_pages here + because we do not dive to the leaf level, assume no 
+ external pages (*n_external_pages was assigned to 0 + above). */ + return; + } + /* else */ + + /* when we instruct dict_stats_scan_page() to quit on the + first non-boring record it finds, then the returned n_diff + can either be 0 (empty page), 1 (page has all keys equal) or + 2 (non-boring record was found) */ + ut_a(*n_diff == 2); + + /* we have a non-boring record in rec, descend below it */ + + page_id.set_page_no( + btr_node_ptr_get_child_page_no(rec, offsets_rec)); + } + + /* make sure we got a leaf page as a result from the above loop */ + ut_ad(page_is_leaf(page)); + + /* scan the leaf page and find the number of distinct keys, + when looking only at the first n_prefix columns; also estimate + the number of externally stored pages pointed by records on this + page */ + + offsets_rec = dict_stats_scan_page( + &rec, offsets1, offsets2, index, page, n_prefix, + index->n_core_fields, n_diff, + n_external_pages); + +#if 0 + DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n", + __func__, page_no, n_diff); +#endif + +func_exit: + mtr_commit(&mtr); + mem_heap_free(heap); +} + +/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[] +for each n-columns prefix (n from 1 to n_uniq). */ +struct n_diff_data_t { + /** Index of the level on which the descent through the btree + stopped. level 0 is the leaf level. This is >= 1 because we + avoid scanning the leaf level because it may contain too many + pages and doing so is useless when combined with the random dives - + if we are to scan the leaf level, this means a full scan and we can + simply do that instead of fiddling with picking random records higher + in the tree and to dive below them. At the start of the analyzing + we may decide to do full scan of the leaf level, but then this + structure is not used in that code path. */ + ulint level; + + /** Number of records on the level where the descend through the btree + stopped. 
When we scan the btree from the root, we stop at some mid + level, choose some records from it and dive below them towards a leaf + page to analyze. */ + ib_uint64_t n_recs_on_level; + + /** Number of different key values that were found on the mid level. */ + ib_uint64_t n_diff_on_level; + + /** Number of leaf pages that are analyzed. This is also the same as + the number of records that we pick from the mid level and dive below + them. */ + ib_uint64_t n_leaf_pages_to_analyze; + + /** Cumulative sum of the number of different key values that were + found on all analyzed pages. */ + ib_uint64_t n_diff_all_analyzed_pages; + + /** Cumulative sum of the number of external pages (stored outside of + the btree but in the same file segment). */ + ib_uint64_t n_external_pages_sum; +}; + +/** Estimate the number of different key values in an index when looking at +the first n_prefix columns. For a given level in an index select +n_diff_data->n_leaf_pages_to_analyze records from that level and dive below +them to the corresponding leaf pages, then scan those leaf pages and save the +sampling results in n_diff_data->n_diff_all_analyzed_pages. +@param[in] index index +@param[in] n_prefix look at first 'n_prefix' columns when +comparing records +@param[in] boundaries a vector that contains +n_diff_data->n_diff_on_level integers each of which represents the index (on +level 'level', counting from left/smallest to right/biggest from 0) of the +last record from each group of distinct keys +@param[in,out] n_diff_data n_diff_all_analyzed_pages and +n_external_pages_sum in this structure will be set by this function. 
The +members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the +caller in advance - they are used by some calculations inside this function +@param[in,out] mtr mini-transaction */ +static +void +dict_stats_analyze_index_for_n_prefix( + dict_index_t* index, + ulint n_prefix, + const boundaries_t* boundaries, + n_diff_data_t* n_diff_data, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const page_t* page; + ib_uint64_t rec_idx; + ib_uint64_t i; + +#if 0 + DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu," + " n_diff_on_level=" UINT64PF ")\n", + __func__, index->table->name, index->name, level, + n_prefix, n_diff_data->n_diff_on_level); +#endif + + ut_ad(n_diff_data->level); + + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ + + n_diff_data->n_diff_all_analyzed_pages = 0; + n_diff_data->n_external_pages_sum = 0; + + if (btr_pcur_open_level(&pcur, n_diff_data->level, mtr, index) + != DB_SUCCESS + || !btr_pcur_move_to_next_on_page(&pcur)) { + return; + } + + page = btr_pcur_get_page(&pcur); + + const rec_t* first_rec = btr_pcur_get_rec(&pcur); + + /* The page must not be empty, except when + it is the root page (and the whole index is empty). 
*/ + if (page_has_prev(page) + || !btr_pcur_is_on_user_rec(&pcur) + || btr_page_get_level(page) != n_diff_data->level + || first_rec != page_rec_get_next_const(page_get_infimum_rec(page)) + || !(rec_get_info_bits(first_rec, page_is_comp(page)) + & REC_INFO_MIN_REC_FLAG)) { + return; + } + + const ib_uint64_t last_idx_on_level = boundaries->at( + static_cast<unsigned>(n_diff_data->n_diff_on_level - 1)); + + rec_idx = 0; + + for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) { + /* there are n_diff_on_level elements + in 'boundaries' and we divide those elements + into n_leaf_pages_to_analyze segments, for example: + + let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then: + segment i=0: [0, 24] + segment i=1: [25, 49] + segment i=2: [50, 74] + segment i=3: [75, 99] or + + let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then: + segment i=0: [0, 0] or + + let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then: + segment i=0: [0, 0] + segment i=1: [1, 1] or + + let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then: + segment i=0: [0, 0] + segment i=1: [1, 2] + segment i=2: [3, 4] + segment i=3: [5, 6] + segment i=4: [7, 8] + segment i=5: [9, 10] + segment i=6: [11, 12] + + then we select a random record from each segment and dive + below it */ + const ib_uint64_t n_diff = n_diff_data->n_diff_on_level; + const ib_uint64_t n_pick + = n_diff_data->n_leaf_pages_to_analyze; + + const ib_uint64_t left = n_diff * i / n_pick; + const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1; + + ut_a(left <= right); + ut_a(right <= last_idx_on_level); + + const ulint rnd = ut_rnd_interval( + static_cast<ulint>(right - left)); + + const ib_uint64_t dive_below_idx + = boundaries->at(static_cast<unsigned>(left + rnd)); + +#if 0 + DEBUG_PRINTF(" %s(): dive below record with index=" + UINT64PF "\n", __func__, dive_below_idx); +#endif + + /* seek to the record with index dive_below_idx */ + while (rec_idx < dive_below_idx + && btr_pcur_is_on_user_rec(&pcur)) { + + 
btr_pcur_move_to_next_user_rec(&pcur, mtr); + rec_idx++; + } + + /* if the level has finished before the record we are + searching for, this means that the B-tree has changed in + the meantime, quit our sampling and use whatever stats + we have collected so far */ + if (rec_idx < dive_below_idx) { + + ut_ad(!btr_pcur_is_on_user_rec(&pcur)); + break; + } + + /* it could be that the tree has changed in such a way that + the record under dive_below_idx is the supremum record, in + this case rec_idx == dive_below_idx and pcur is positioned + on the supremum, we do not want to dive below it */ + if (!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + + ut_a(rec_idx == dive_below_idx); + + ib_uint64_t n_diff_on_leaf_page; + ib_uint64_t n_external_pages; + + dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur), + n_prefix, + &n_diff_on_leaf_page, + &n_external_pages); + + /* We adjust n_diff_on_leaf_page here to avoid counting + one value twice - once as the last on some page and once + as the first on another page. Consider the following example: + Leaf level: + page: (2,2,2,2,3,3) + ... many pages like (3,3,3,3,3,3) ... + page: (3,3,3,3,5,5) + ... many pages like (5,5,5,5,5,5) ... + page: (5,5,5,5,8,8) + page: (8,8,8,8,9,9) + our algo would (correctly) get an estimate that there are + 2 distinct records per page (average). Having 4 pages below + non-boring records, it would (wrongly) estimate the number + of distinct records to 8. 
*/ + if (n_diff_on_leaf_page > 0) { + n_diff_on_leaf_page--; + } + + n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page; + + n_diff_data->n_external_pages_sum += n_external_pages; + } +} + +/** statistics for an index */ +struct index_stats_t +{ + std::vector<index_field_stats_t> stats; + ulint index_size; + ulint n_leaf_pages; + + index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1) + { + stats.reserve(n_uniq); + for (ulint i= 0; i < n_uniq; ++i) + stats.push_back(index_field_stats_t{0, 1, 0}); + } + + void set_bulk_operation() + { + memset((void*) &stats[0], 0xff, stats.size() * sizeof stats[0]); + } + + bool is_bulk_operation() const + { + for (auto &s : stats) + if (!s.is_bulk_operation()) + return false; + return true; + } +}; + +/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. +@param[in] n_diff_data input data to use to derive the results +@param[in,out] index_stats index stats to set */ +UNIV_INLINE +void +dict_stats_index_set_n_diff( + const n_diff_data_t* n_diff_data, + index_stats_t& index_stats) +{ + for (ulint n_prefix = index_stats.stats.size(); + n_prefix >= 1; + n_prefix--) { + /* n_diff_all_analyzed_pages can be 0 here if + all the leaf pages sampled contained only + delete-marked records. In this case we should assign + 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. */ + + const n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + ut_ad(data->n_leaf_pages_to_analyze > 0); + ut_ad(data->n_recs_on_level > 0); + + ib_uint64_t n_ordinary_leaf_pages; + + if (data->level == 1) { + /* If we know the number of records on level 1, then + this number is the same as the number of pages on + level 0 (leaf). */ + n_ordinary_leaf_pages = data->n_recs_on_level; + } else { + /* If we analyzed D ordinary leaf pages and found E + external pages in total linked from those D ordinary + leaf pages, then this means that the ratio + ordinary/external is D/E. 
Then the ratio ordinary/total + is D / (D + E). Knowing that the total number of pages + is T (including ordinary and external) then we estimate + that the total number of ordinary leaf pages is + T * D / (D + E). */ + n_ordinary_leaf_pages + = index_stats.n_leaf_pages + * data->n_leaf_pages_to_analyze + / (data->n_leaf_pages_to_analyze + + data->n_external_pages_sum); + } + + /* See REF01 for an explanation of the algorithm */ + index_stats.stats[n_prefix - 1].n_diff_key_vals + = n_ordinary_leaf_pages + + * data->n_diff_on_level + / data->n_recs_on_level + + * data->n_diff_all_analyzed_pages + / data->n_leaf_pages_to_analyze; + + index_stats.stats[n_prefix - 1].n_sample_sizes + = data->n_leaf_pages_to_analyze; + + DEBUG_PRINTF(" %s(): n_diff=" UINT64PF + " for n_prefix=" ULINTPF + " (" ULINTPF + " * " UINT64PF " / " UINT64PF + " * " UINT64PF " / " UINT64PF ")\n", + __func__, + index_stats.stats[n_prefix - 1].n_diff_key_vals, + n_prefix, + index_stats.n_leaf_pages, + data->n_diff_on_level, + data->n_recs_on_level, + data->n_diff_all_analyzed_pages, + data->n_leaf_pages_to_analyze); + } +} + +/** Calculates new statistics for a given index and saves them to the index +members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and +stat_n_leaf_pages. This function can be slow. 
+@param[in] index index to analyze +@return index stats */ +static index_stats_t dict_stats_analyze_index(dict_index_t* index) +{ + bool level_is_analyzed; + ulint n_uniq; + ulint n_prefix; + ib_uint64_t total_recs; + ib_uint64_t total_pages; + mtr_t mtr; + index_stats_t result(index->n_uniq); + DBUG_ENTER("dict_stats_analyze_index"); + + DBUG_PRINT("info", ("index: %s, online status: %d", index->name(), + dict_index_get_online_status(index))); + + ut_ad(!index->table->stats_mutex_is_owner()); + ut_ad(index->table->get_ref_count()); + + if (!index->is_btree()) { + DBUG_RETURN(result); + } + + DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!root) { +empty_index: + mtr.commit(); + dict_stats_assert_initialized_index(index); + DBUG_RETURN(result); + } + + uint16_t root_level = btr_page_get_level(root->page.frame); + mtr.x_lock_space(index->table->space); + ulint dummy, size; + result.index_size + = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, &size, &mtr) + + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, &mtr); + result.n_leaf_pages = size ? size : 1; + + const auto bulk_trx_id = index->table->bulk_trx_id; + if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { + result.set_bulk_operation(); + goto empty_index; + } + + mtr.commit(); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + + n_uniq = dict_index_get_n_unique(index); + + /* If the tree has just one level (and one page) or if the user + has requested to sample too many pages then do full scan. + + For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index) + will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf + pages will be sampled. 
If that number is bigger than the total + number of leaf pages then do full scan of the leaf level instead + since it will be faster and will give better results. */ + + if (root_level == 0 + || N_SAMPLE_PAGES(index) * n_uniq > result.n_leaf_pages) { + + if (root_level == 0) { + DEBUG_PRINTF(" %s(): just one page," + " doing full scan\n", __func__); + } else { + DEBUG_PRINTF(" %s(): too many pages requested for" + " sampling, doing full scan\n", __func__); + } + + /* do full scan of level 0; save results directly + into the index */ + + dict_stats_analyze_index_level(index, + 0 /* leaf level */, + index->stat_n_diff_key_vals, + &total_recs, + &total_pages, + NULL /* boundaries not needed */, + &mtr); + + mtr.commit(); + + index->table->stats_mutex_lock(); + for (ulint i = 0; i < n_uniq; i++) { + result.stats[i].n_diff_key_vals = index->stat_n_diff_key_vals[i]; + result.stats[i].n_sample_sizes = total_pages; + result.stats[i].n_non_null_key_vals = index->stat_n_non_null_key_vals[i]; + } + result.n_leaf_pages = index->stat_n_leaf_pages; + index->table->stats_mutex_unlock(); + + DBUG_RETURN(result); + } + + /* For each level that is being scanned in the btree, this contains the + number of different key values for all possible n-column prefixes. */ + ib_uint64_t* n_diff_on_level = UT_NEW_ARRAY( + ib_uint64_t, n_uniq, mem_key_dict_stats_n_diff_on_level); + + /* For each level that is being scanned in the btree, this contains the + index of the last record from each group of equal records (when + comparing only the first n columns, n=1..n_uniq). */ + boundaries_t* n_diff_boundaries = UT_NEW_ARRAY_NOKEY(boundaries_t, + n_uniq); + + /* For each n-column prefix this array contains the input data that is + used to calculate dict_index_t::stat_n_diff_key_vals[]. 
*/ + n_diff_data_t* n_diff_data = UT_NEW_ARRAY_NOKEY(n_diff_data_t, n_uniq); + + /* total_recs is also used to estimate the number of pages on one + level below, so at the start we have 1 page (the root) */ + total_recs = 1; + + /* Here we use the following optimization: + If we find that level L is the first one (searching from the + root) that contains at least D distinct keys when looking at + the first n_prefix columns, then: + if we look at the first n_prefix-1 columns then the first + level that contains D distinct keys will be either L or a + lower one. + So if we find that the first level containing D distinct + keys (on n_prefix columns) is L, we continue from L when + searching for D distinct keys on n_prefix-1 columns. */ + auto level = root_level; + level_is_analyzed = false; + + for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) { + + DEBUG_PRINTF(" %s(): searching level with >=%llu " + "distinct records, n_prefix=" ULINTPF "\n", + __func__, N_DIFF_REQUIRED(index), n_prefix); + + /* Commit the mtr to release the tree S lock to allow + other threads to do some work too. */ + mtr.commit(); + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ut_ad(mtr.get_savepoint() == 1); + buf_block_t *root = btr_root_block_get(index, RW_S_LATCH, + &mtr, &err); + if (!root || root_level != btr_page_get_level(root->page.frame) + || index->table->bulk_trx_id != bulk_trx_id) { + /* Just quit if the tree has changed beyond + recognition here. The old stats from previous + runs will remain in the values that we have + not calculated yet. Initially when the index + object is created the stats members are given + some sensible values so leaving them untouched + here even the first time will not cause us to + read uninitialized memory later. 
*/ + break; + } + + mtr.rollback_to_savepoint(1); + + /* check whether we should pick the current level; + we pick level 1 even if it does not have enough + distinct records because we do not want to scan the + leaf level because it may contain too many records */ + if (level_is_analyzed + && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index) + || level == 1)) { + + goto found_level; + } + + /* search for a level that contains enough distinct records */ + + if (level_is_analyzed && level > 1) { + + /* if this does not hold we should be on + "found_level" instead of here */ + ut_ad(n_diff_on_level[n_prefix - 1] + < N_DIFF_REQUIRED(index)); + + level--; + level_is_analyzed = false; + } + + /* descend into the tree, searching for "good enough" level */ + for (;;) { + + /* make sure we do not scan the leaf level + accidentally, it may contain too many pages */ + ut_ad(level > 0); + + /* scanning the same level twice is an optimization + bug */ + ut_ad(!level_is_analyzed); + + /* Do not scan if this would read too many pages. + Here we use the following fact: + the number of pages on level L equals the number + of records on level L+1, thus we deduce that the + following call would scan total_recs pages, because + total_recs is left from the previous iteration when + we scanned one level upper or we have not scanned any + levels yet in which case total_recs is 1. 
*/ + if (total_recs > N_SAMPLE_PAGES(index)) { + + /* if the above cond is true then we are + not at the root level since on the root + level total_recs == 1 (set before we + enter the n-prefix loop) and cannot + be > N_SAMPLE_PAGES(index) */ + ut_a(level != root_level); + + /* step one level back and be satisfied with + whatever it contains */ + level++; + level_is_analyzed = true; + + break; + } + + mtr.rollback_to_savepoint(1); + dict_stats_analyze_index_level(index, + level, + n_diff_on_level, + &total_recs, + &total_pages, + n_diff_boundaries, + &mtr); + mtr.rollback_to_savepoint(1); + level_is_analyzed = true; + + if (level == 1 + || n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index)) { + /* we have reached the last level we could scan + or we found a good level with many distinct + records */ + break; + } + + level--; + level_is_analyzed = false; + } +found_level: + + DEBUG_PRINTF(" %s(): found level " ULINTPF + " that has " UINT64PF + " distinct records for n_prefix=" ULINTPF "\n", + __func__, level, n_diff_on_level[n_prefix - 1], + n_prefix); + /* here we are either on level 1 or the level that we are on + contains >= N_DIFF_REQUIRED distinct keys or we did not scan + deeper levels because they would contain too many pages */ + + ut_ad(level > 0); + + ut_ad(level_is_analyzed); + + /* if any of these is 0 then there is exactly one page in the + B-tree and it is empty and we should have done full scan and + should not be here */ + ut_ad(total_recs > 0); + ut_ad(n_diff_on_level[n_prefix - 1] > 0); + + ut_ad(N_SAMPLE_PAGES(index) > 0); + + n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + data->level = level; + + data->n_recs_on_level = total_recs; + + data->n_diff_on_level = n_diff_on_level[n_prefix - 1]; + + data->n_leaf_pages_to_analyze = std::min( + N_SAMPLE_PAGES(index), + n_diff_on_level[n_prefix - 1]); + + /* pick some records from this level and dive below them for + the given n_prefix */ + + dict_stats_analyze_index_for_n_prefix( + index, 
n_prefix, &n_diff_boundaries[n_prefix - 1], + data, &mtr); + } + + mtr.commit(); + + UT_DELETE_ARRAY(n_diff_boundaries); + + UT_DELETE_ARRAY(n_diff_on_level); + + /* n_prefix == 0 means that the above loop did not end up prematurely + due to tree being changed and so n_diff_data[] is set up. */ + if (n_prefix == 0) { + dict_stats_index_set_n_diff(n_diff_data, result); + } + + UT_DELETE_ARRAY(n_diff_data); + + DBUG_RETURN(result); +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively slow and is used to calculate persistent statistics that +will be saved on disk. +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ +static +dberr_t +dict_stats_update_persistent( +/*=========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + + DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); + + DEBUG_SYNC_C("dict_stats_update_persistent"); + + /* analyze the clustered index first */ + + index = dict_table_get_first_index(table); + + if (index == NULL + || index->is_corrupted() + || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { + + /* Table definition is corrupt */ + dict_stats_empty_table(table, true); + + return(DB_CORRUPTION); + } + + ut_ad(!dict_index_is_ibuf(index)); + table->stats_mutex_lock(); + dict_stats_empty_index(index, false); + table->stats_mutex_unlock(); + + index_stats_t stats = dict_stats_analyze_index(index); + + if (stats.is_bulk_operation()) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] = stats.stats[i].n_sample_sizes; + 
index->stat_n_non_null_key_vals[i] = stats.stats[i].n_non_null_key_vals; + } + + ulint n_unique = dict_index_get_n_unique(index); + + table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + /* analyze other indexes from the table, if any */ + + table->stat_sum_of_other_index_sizes = 0; + + for (index = dict_table_get_next_index(index); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!index->is_btree()) { + continue; + } + + dict_stats_empty_index(index, false); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + table->stats_mutex_unlock(); + stats = dict_stats_analyze_index(index); + table->stats_mutex_lock(); + + if (stats.is_bulk_operation()) { + table->stats_mutex_unlock(); + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + + table->stat_sum_of_other_index_sizes + += index->stat_index_size; + } + + table->stats_last_recalc = time(NULL); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; + + dict_stats_assert_initialized(table); + + table->stats_mutex_unlock(); + + return(DB_SUCCESS); +} + +#include "mysql_com.h" +/** Save an individual index's statistic into the persistent statistics +storage. 
+@param[in] index index to be updated +@param[in] last_update timestamp of the stat +@param[in] stat_name name of the stat +@param[in] stat_value value of the stat +@param[in] sample_size n pages sampled or NULL +@param[in] stat_description description of the stat +@param[in,out] trx transaction +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_index_stat( + dict_index_t* index, + time_t last_update, + const char* stat_name, + ib_uint64_t stat_value, + ib_uint64_t* sample_size, + const char* stat_description, + trx_t* trx) +{ + dberr_t ret; + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + ut_ad(dict_sys.locked()); + + dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_str_literal(pinfo, "index_name", index->name); + MEM_CHECK_DEFINED(&last_update, 4); + pars_info_add_int4_literal(pinfo, "last_update", uint32(last_update)); + MEM_CHECK_DEFINED(stat_name, strlen(stat_name)); + pars_info_add_str_literal(pinfo, "stat_name", stat_name); + MEM_CHECK_DEFINED(&stat_value, 8); + pars_info_add_ull_literal(pinfo, "stat_value", stat_value); + if (sample_size != NULL) { + MEM_CHECK_DEFINED(sample_size, 8); + pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); + } else { + pars_info_add_literal(pinfo, "sample_size", NULL, + UNIV_SQL_NULL, DATA_FIXBINARY, 0); + } + pars_info_add_str_literal(pinfo, "stat_description", + stat_description); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" INDEX_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name AND\n" + "stat_name = :stat_name;\n" + + "INSERT INTO \"" INDEX_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + 
":database_name,\n" + ":table_name,\n" + ":index_name,\n" + ":last_update,\n" + ":stat_name,\n" + ":stat_value,\n" + ":sample_size,\n" + ":stat_description\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + ib::error() << "Cannot save index statistics for table " + << index->table->name + << ", index " << index->name + << ", stat name \"" << stat_name << "\": " + << ret; + index->stats_error_printed = true; + } + } + + return(ret); +} + +/** Report an error if updating table statistics failed because +.ibd file is missing, table decryption failed or table is corrupted. +@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment) +{ + dberr_t err; + + const char* df = defragment ? " defragment" : ""; + + if (!table->space) { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because the .ibd file is missing. " + << TROUBLESHOOTING_MSG; + err = DB_TABLESPACE_DELETED; + } else { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because file " + << table->space->chain.start->name + << (table->corrupted + ? " is corrupted." + : " cannot be decrypted."); + err = table->corrupted ? DB_CORRUPTION : DB_DECRYPTION_FAILED; + } + + dict_stats_empty_table(table, defragment); + return err; +} + +/** Save the table's statistics into the persistent statistics storage. 
+@param[in] table_orig table whose stats to save +@param[in] only_for_index if this is non-NULL, then stats for indexes +that are not equal to it will not be saved, if NULL, then all indexes' stats +are saved +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_save( + dict_table_t* table_orig, + const index_id_t* only_for_index) +{ + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("dict_stats_save_exit_notify", + SCOPE_EXIT([] { + debug_sync_set_action(current_thd, + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + }); + ); +#endif /* ENABLED_DEBUG_SYNC */ + + if (high_level_read_only) { + return DB_READ_ONLY; + } + + if (!table_orig->is_readable()) { + return (dict_stats_report_error(table_orig)); + } + + THD* thd = current_thd; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_table_t* table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared<false>(table_stats, thd, + &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats + || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { +release_and_exit: + if (table_stats) { + dict_table_close(table_stats, false, thd, mdl_table); + } + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t* index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared<false>(index_stats, thd, + &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) { + goto release_and_exit; + } + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + dict_table_t* table = dict_stats_snapshot_create(table_orig); + + dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), + 
table_utf8, sizeof(table_utf8)); + const time_t now = time(NULL); + trx_t* trx = trx_create(); + trx->mysql_thd = thd; + trx_start_internal(trx); + dberr_t ret = trx->read_only + ? DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) { + ret = lock_table_for_trx(index_stats, trx, LOCK_X); + } + if (ret != DB_SUCCESS) { + if (trx->state != TRX_STATE_NOT_STARTED) { + trx->commit(); + } + goto unlocked_free_and_exit; + } + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_int4_literal(pinfo, "last_update", uint32(now)); + pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); + pars_info_add_ull_literal(pinfo, "clustered_index_size", + table->stat_clustered_index_size); + pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes", + table->stat_sum_of_other_index_sizes); + + dict_sys.lock(SRW_LOCK_CALL); + trx->dict_operation_lock_mode = true; + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" TABLE_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + + "INSERT INTO \"" TABLE_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":last_update,\n" + ":n_rows,\n" + ":clustered_index_size,\n" + ":sum_of_other_index_sizes\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + ib::error() << "Cannot save table statistics for table " + << table->name << ": " << ret; +rollback_and_exit: + trx->rollback(); +free_and_exit: + trx->dict_operation_lock_mode = false; + dict_sys.unlock(); +unlocked_free_and_exit: + trx->free(); + dict_stats_snapshot_free(table); + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + return ret; + } + + dict_index_t* index; + index_map_t indexes( + (ut_strcmp_functor()), 
+ index_map_t_allocator(mem_key_dict_stats_index_map_t)); + + /* Below we do all the modifications in innodb_index_stats in a single + transaction for performance reasons. Modifying more than one row in a + single transaction may deadlock with other transactions if they + lock the rows in different order. Other transaction could be for + example when we DROP a table and do + DELETE FROM innodb_index_stats WHERE database_name = '...' + AND table_name = '...'; which will affect more than one row. To + prevent deadlocks we always lock the rows in the same order - the + order of the PK, which is (database_name, table_name, index_name, + stat_name). This is why below we sort the indexes by name and then + for each index, do the mods ordered by stat_name. */ + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + indexes[index->name] = index; + } + + index_map_t::const_iterator it; + + for (it = indexes.begin(); it != indexes.end(); ++it) { + + index = it->second; + + if (only_for_index != NULL && index->id != *only_for_index) { + continue; + } + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + for (unsigned i = 0; i < index->n_uniq; i++) { + + char stat_name[16]; + char stat_description[1024]; + + snprintf(stat_name, sizeof(stat_name), + "n_diff_pfx%02u", i + 1); + + /* craft a string that contains the column names */ + snprintf(stat_description, sizeof(stat_description), + "%s", index->fields[0].name()); + for (unsigned j = 1; j <= i; j++) { + size_t len; + + len = strlen(stat_description); + + snprintf(stat_description + len, + sizeof(stat_description) - len, + ",%s", index->fields[j].name()); + } + + ret = dict_stats_save_index_stat( + index, now, stat_name, + index->stat_n_diff_key_vals[i], + &index->stat_n_sample_sizes[i], + stat_description, trx); + + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret = 
dict_stats_save_index_stat(index, now, "n_leaf_pages", + index->stat_n_leaf_pages, + NULL, + "Number of leaf pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + ret = dict_stats_save_index_stat(index, now, "size", + index->stat_index_size, + NULL, + "Number of pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret= trx->bulk_insert_apply(); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + trx->commit(); + goto free_and_exit; +} + +/*********************************************************************//** +Called for the row that is selected by +SELECT ... FROM mysql.innodb_table_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to it. +@return non-NULL dummy */ +static +ibool +dict_stats_fetch_table_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* table_void) /*!< out: table */ +{ + sel_node_t* node = (sel_node_t*) node_void; + dict_table_t* table = (dict_table_t*) table_void; + que_common_t* cnode; + int i; + + /* this should loop exactly 3 times - for + n_rows,clustered_index_size,sum_of_other_index_sizes */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_table_stats.n_rows */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_n_rows = mach_read_from_8(data); + + break; + + case 1: /* mysql.innodb_table_stats.clustered_index_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_clustered_index_size + = (ulint) mach_read_from_8(data); + + break; 
+ + case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_sum_of_other_index_sizes + = (ulint) mach_read_from_8(data); + + break; + + default: + + /* someone changed SELECT + n_rows,clustered_index_size,sum_of_other_index_sizes + to select more columns from innodb_table_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 3 this means someone changed the + SELECT n_rows,clustered_index_size,sum_of_other_index_sizes + to select less columns from innodb_table_stats without adjusting here; + if i > 3 we would have ut_error'ed earlier */ + ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/); + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/** Aux struct used to pass a table and a boolean to +dict_stats_fetch_index_stats_step(). */ +struct index_fetch_t { + dict_table_t* table; /*!< table whose indexes are to be modified */ + bool stats_were_modified; /*!< will be set to true if at + least one index stats were modified */ +}; + +/*********************************************************************//** +Called for the rows that are selected by +SELECT ... FROM mysql.innodb_index_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to its indexes. +Let a table has N indexes and each index has Ui unique columns for i=1..N, +then mysql.innodb_index_stats will have SUM(Ui) i=1..N rows for that table. +So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude +N*AVG(Ui). In each call it searches for the currently fetched index into +table->indexes linearly, assuming this list is not sorted. Thus, overall, +fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N +is the number of indexes. +This can be improved if we sort table->indexes in a temporary area just once +and then search in that sorted list. 
Then the complexity will be O(N*log(N)). +We assume a table will not have more than 100 indexes, so we go with the +simpler N^2 algorithm. +@return non-NULL dummy */ +static +ibool +dict_stats_fetch_index_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* arg_void) /*!< out: table + a flag that tells if we + modified anything */ +{ + sel_node_t* node = (sel_node_t*) node_void; + index_fetch_t* arg = (index_fetch_t*) arg_void; + dict_table_t* table = arg->table; + dict_index_t* index = NULL; + que_common_t* cnode; + const char* stat_name = NULL; + ulint stat_name_len = ULINT_UNDEFINED; + ib_uint64_t stat_value = UINT64_UNDEFINED; + ib_uint64_t sample_size = UINT64_UNDEFINED; + int i; + + /* this should loop exactly 4 times - for the columns that + were selected: index_name,stat_name,stat_value,sample_size */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_index_stats.index_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + /* search for index in table's indexes whose name + matches data; the fetched index name is in data, + has no terminating '\0' and has length len */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->is_committed() + && strlen(index->name) == len + && memcmp(index->name, data, len) == 0) { + /* the corresponding index was found */ + break; + } + } + + /* if index is NULL here this means that + mysql.innodb_index_stats contains more rows than the + number of indexes in the table; this is ok, we just + return ignoring those extra rows; in other words + 
dict_stats_fetch_index_stats_step() has been called + for a row from index_stats with unknown index_name + column */ + if (index == NULL) { + + return(TRUE); + } + + break; + + case 1: /* mysql.innodb_index_stats.stat_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + ut_a(index != NULL); + + stat_name = (const char*) data; + stat_name_len = len; + + break; + + case 2: /* mysql.innodb_index_stats.stat_value */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + + stat_value = mach_read_from_8(data); + + break; + + case 3: /* mysql.innodb_index_stats.sample_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8 || len == UNIV_SQL_NULL); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + + if (len == UNIV_SQL_NULL) { + break; + } + /* else */ + + sample_size = mach_read_from_8(data); + + break; + + default: + + /* someone changed + SELECT index_name,stat_name,stat_value,sample_size + to select more columns from innodb_index_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 4 this means someone changed the + SELECT index_name,stat_name,stat_value,sample_size + to select less columns from innodb_index_stats without adjusting here; + if i > 4 we would have ut_error'ed earlier */ + ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + /* sample_size could be UINT64_UNDEFINED here, if it is NULL */ + +#define PFX "n_diff_pfx" +#define PFX_LEN 10 + + if (stat_name_len == 4 /* strlen("size") */ + && strncasecmp("size", stat_name, stat_name_len) == 0) { + index->stat_index_size = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ + && 
strncasecmp("n_leaf_pages", stat_name, stat_name_len) + == 0) { + index->stat_n_leaf_pages = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ + && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { + + const char* num_ptr; + unsigned long n_pfx; + + /* point num_ptr into "1" from "n_diff_pfx12..." */ + num_ptr = stat_name + PFX_LEN; + + /* stat_name should have exactly 2 chars appended to PFX + and they should be digits */ + if (stat_name_len != PFX_LEN + 2 + || num_ptr[0] < '0' || num_ptr[0] > '9' + || num_ptr[1] < '0' || num_ptr[1] > '9') { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ib::info out; + out << "Ignoring strange row from " + << INDEX_STATS_NAME_PRINT << " WHERE" + " database_name = '" << db_utf8 + << "' AND table_name = '" << table_utf8 + << "' AND index_name = '" << index->name() + << "' AND stat_name = '"; + out.write(stat_name, stat_name_len); + out << "'; because stat_name is malformed"; + return(TRUE); + } + /* else */ + + /* extract 12 from "n_diff_pfx12..." 
/*********************************************************************//**
Read table's statistics from the persistent statistics storage.
Opens the tables named TABLE_STATS_NAME and INDEX_STATS_NAME, acquires
an MDL on each through dict_acquire_mdl_shared(), and evaluates an
internal SQL procedure whose two cursors hand every matching row to
dict_stats_fetch_table_stats_step() and
dict_stats_fetch_index_stats_step() respectively.
@return the result of que_eval_sql() (DB_SUCCESS or error code)
@retval DB_STATS_DO_NOT_EXIST if either statistics table could not be
opened under its expected name, or if no index statistics row changed
the in-memory statistics (this also hides a que_eval_sql() error) */
static
dberr_t
dict_stats_fetch_from_ps(
/*=====================*/
	dict_table_t*	table)	/*!< in/out: table */
{
	index_fetch_t	index_fetch_arg;
	trx_t*		trx;
	pars_info_t*	pinfo;
	dberr_t		ret;
	char		db_utf8[MAX_DB_UTF8_LEN];
	char		table_utf8[MAX_TABLE_UTF8_LEN];

	/* Initialize all stats to dummy values before fetching because if
	the persistent storage contains incomplete stats (e.g. missing stats
	for some index) then we would end up with (partially) uninitialized
	stats. */
	dict_stats_empty_table(table, true);

	THD*		thd = current_thd;
	MDL_ticket	*mdl_table = nullptr, *mdl_index = nullptr;

	/* Open the table statistics table. dict_sys is frozen only for
	the duration of the MDL acquisition. */
	dict_table_t*	table_stats = dict_table_open_on_name(
		TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
	if (table_stats) {
		dict_sys.freeze(SRW_LOCK_CALL);
		table_stats = dict_acquire_mdl_shared<false>(table_stats, thd,
							     &mdl_table);
		dict_sys.unfreeze();
	}
	/* The pointer and the name are re-checked after the MDL call.
	NOTE(review): presumably the table can be dropped or replaced
	while the MDL is being acquired -- confirm against
	dict_acquire_mdl_shared(). */
	if (!table_stats
	    || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) {
		/* Shared failure exit; it is also entered by goto from
		the index_stats checks below, which close index_stats
		themselves when that is needed. */
release_and_exit:
		if (table_stats) {
			dict_table_close(table_stats, false, thd, mdl_table);
		}
		return DB_STATS_DO_NOT_EXIST;
	}

	/* Same procedure for the index statistics table. */
	dict_table_t*	index_stats = dict_table_open_on_name(
		INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
	if (index_stats) {
		dict_sys.freeze(SRW_LOCK_CALL);
		index_stats = dict_acquire_mdl_shared<false>(index_stats, thd,
							     &mdl_index);
		dict_sys.unfreeze();
	}
	if (!index_stats) {
		goto release_and_exit;
	}
	if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) {
		dict_table_close(index_stats, false, thd, mdl_index);
		goto release_and_exit;
	}

	trx = trx_create();

	trx_start_internal_read_only(trx);

	/* The SQL below filters on the UTF-8 database and table names. */
	dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8),
		     table_utf8, sizeof(table_utf8));

	pinfo = pars_info_create();

	pars_info_add_str_literal(pinfo, "database_name", db_utf8);

	pars_info_add_str_literal(pinfo, "table_name", table_utf8);

	/* Callback for the table_stats_cur row; receives the table. */
	pars_info_bind_function(pinfo,
			       "fetch_table_stats_step",
			       dict_stats_fetch_table_stats_step,
			       table);

	/* Callback for each index_stats_cur row; it reports through
	stats_were_modified whether it stored anything. */
	index_fetch_arg.table = table;
	index_fetch_arg.stats_were_modified = false;
	pars_info_bind_function(pinfo,
			        "fetch_index_stats_step",
			        dict_stats_fetch_index_stats_step,
			        &index_fetch_arg);
	dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */
	ret = que_eval_sql(pinfo,
			   "PROCEDURE FETCH_STATS () IS\n"
			   "found INT;\n"
			   "DECLARE FUNCTION fetch_table_stats_step;\n"
			   "DECLARE FUNCTION fetch_index_stats_step;\n"
			   "DECLARE CURSOR table_stats_cur IS\n"
			   " SELECT\n"
			   /* if you change the selected fields, be
			   sure to adjust
			   dict_stats_fetch_table_stats_step() */
			   " n_rows,\n"
			   " clustered_index_size,\n"
			   " sum_of_other_index_sizes\n"
			   " FROM \"" TABLE_STATS_NAME "\"\n"
			   " WHERE\n"
			   " database_name = :database_name AND\n"
			   " table_name = :table_name;\n"
			   "DECLARE CURSOR index_stats_cur IS\n"
			   " SELECT\n"
			   /* if you change the selected fields, be
			   sure to adjust
			   dict_stats_fetch_index_stats_step() */
			   " index_name,\n"
			   " stat_name,\n"
			   " stat_value,\n"
			   " sample_size\n"
			   " FROM \"" INDEX_STATS_NAME "\"\n"
			   " WHERE\n"
			   " database_name = :database_name AND\n"
			   " table_name = :table_name;\n"

			   "BEGIN\n"

			   /* a missing table row ends the procedure before
			   any index row is read */
			   "OPEN table_stats_cur;\n"
			   "FETCH table_stats_cur INTO\n"
			   " fetch_table_stats_step();\n"
			   "IF (SQL % NOTFOUND) THEN\n"
			   " CLOSE table_stats_cur;\n"
			   " RETURN;\n"
			   "END IF;\n"
			   "CLOSE table_stats_cur;\n"

			   "OPEN index_stats_cur;\n"
			   "found := 1;\n"
			   "WHILE found = 1 LOOP\n"
			   " FETCH index_stats_cur INTO\n"
			   " fetch_index_stats_step();\n"
			   " IF (SQL % NOTFOUND) THEN\n"
			   " found := 0;\n"
			   " END IF;\n"
			   "END LOOP;\n"
			   "CLOSE index_stats_cur;\n"

			   "END;", trx);
	/* pinfo is freed by que_eval_sql() */
	dict_sys.unlock();

	dict_table_close(table_stats, false, thd, mdl_table);
	dict_table_close(index_stats, false, thd, mdl_index);

	trx_commit_for_mysql(trx);

	trx->free();

	/* No index row was applied: report the statistics as missing,
	whatever que_eval_sql() returned. */
	if (!index_fetch_arg.stats_were_modified) {
		return(DB_STATS_DO_NOT_EXIST);
	}

	return(ret);
}
*/ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ +{ + DBUG_ENTER("dict_stats_update_for_index"); + + if (dict_stats_is_persistent_enabled(index->table)) { + + if (dict_stats_persistent_storage_check(false)) { + index_stats_t stats = dict_stats_analyze_index(index); + index->table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + index->table->stat_sum_of_other_index_sizes + += index->stat_index_size; + index->table->stats_mutex_unlock(); + + dict_stats_save(index->table, &index->id); + DBUG_VOID_RETURN; + } + /* else */ + + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + ib::info() << "Recalculation of persistent statistics" + " requested for table " << index->table->name + << " index " << index->name + << " but the required" + " persistent statistics storage is not present or is" + " corrupted. 
Using transient stats instead."; + index->stats_error_printed = false; + } + } + + dict_stats_update_transient_for_index(index); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option) + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent statistics + storage */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + if (!table->is_readable()) { + return (dict_stats_report_error(table)); + } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + /* If we have set a high innodb_force_recovery level, do + not calculate statistics, as a badly corrupted index can + cause a crash in it. */ + dict_stats_empty_table(table, false); + return(DB_SUCCESS); + } + + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + } + + switch (stats_upd_option) { + case DICT_STATS_RECALC_PERSISTENT: + + if (srv_read_only_mode) { + goto transient; + } + + /* Persistent recalculation requested, called from + 1) ANALYZE TABLE, or + 2) the auto recalculation background thread, or + 3) open table if stats do not exist on disk and auto recalc + is enabled */ + + /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + /* check if the persistent statistics storage exists + before calling the potentially slow function + dict_stats_update_persistent(); that is a + prerequisite for dict_stats_save() succeeding */ + if (dict_stats_persistent_storage_check(false)) { + + dberr_t err; + + err = dict_stats_update_persistent(table); + + if (err != DB_SUCCESS) { + return(err); + } + + err = dict_stats_save(table, NULL); + + return(err); + } + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::warn() << "Recalculation of persistent statistics" + " requested for table " + << table->name + << " but the required persistent" + " statistics storage is not present or is corrupted." + " Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + + case DICT_STATS_RECALC_TRANSIENT: + + goto transient; + + case DICT_STATS_EMPTY_TABLE: + + dict_stats_empty_table(table, true); + + /* If table is using persistent stats, + then save the stats on disk */ + + if (dict_stats_is_persistent_enabled(table)) { + + if (dict_stats_persistent_storage_check(false)) { + + return(dict_stats_save(table, NULL)); + } + + return(DB_STATS_DO_NOT_EXIST); + } + + return(DB_SUCCESS); + + case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: + + /* fetch requested, either fetch from persistent statistics + storage or use the old method */ + + if (table->stat_initialized) { + return(DB_SUCCESS); + } + + /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + if (!dict_stats_persistent_storage_check(false)) { + /* persistent statistics storage does not exist + or is corrupted, calculate the transient stats */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false && + !opt_bootstrap) { + ib::error() << "Fetch of persistent statistics" + " requested for table " + << table->name + << " but the required system tables " + << TABLE_STATS_NAME_PRINT + << " and " << INDEX_STATS_NAME_PRINT + << " are not present or have unexpected" + " structure. Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + } + + dict_table_t* t; + + /* Create a dummy table object with the same name and + indexes, suitable for fetching the stats into it. */ + t = dict_stats_table_clone_create(table); + + dberr_t err = dict_stats_fetch_from_ps(t); + + t->stats_last_recalc = table->stats_last_recalc; + t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); + + switch (err) { + case DB_SUCCESS: + + table->stats_mutex_lock(); + /* t is localized to this thread so no need to + take stats mutex lock (limiting it to debug only) */ + ut_d(t->stats_mutex_lock()); + + /* Pass reset_ignored_indexes=true as parameter + to dict_stats_copy. 
This will cause statictics + for corrupted indexes to be set to empty values */ + dict_stats_copy(table, t, true); + + dict_stats_assert_initialized(table); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + dict_stats_table_clone_free(t); + + return(DB_SUCCESS); + case DB_STATS_DO_NOT_EXIST: + + dict_stats_table_clone_free(t); + + if (srv_read_only_mode) { + goto transient; + } + + if (dict_stats_auto_recalc_is_enabled(table)) { + return(dict_stats_update( + table, + DICT_STATS_RECALC_PERSISTENT)); + } + + ib::info() << "Trying to use table " << table->name + << " which has persistent statistics enabled," + " but auto recalculation turned off and the" + " statistics do not exist in " + TABLE_STATS_NAME_PRINT + " and " INDEX_STATS_NAME_PRINT + ". Please either run \"ANALYZE TABLE " + << table->name << ";\" manually or enable the" + " auto recalculation with \"ALTER TABLE " + << table->name << " STATS_AUTO_RECALC=1;\"." + " InnoDB will now use transient statistics for " + << table->name << "."; + + goto transient; + default: + + dict_stats_table_clone_free(t); + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::error() << "Error fetching persistent statistics" + " for table " + << table->name + << " from " TABLE_STATS_NAME_PRINT " and " + INDEX_STATS_NAME_PRINT ": " << err + << ". 
Using transient stats method instead."; + } + + goto transient; + } + /* no "default:" in order to produce a compilation warning + about unhandled enumeration value */ + } + +transient: + return dict_stats_update_transient(table); +} + +/** Execute DELETE FROM mysql.innodb_table_stats +@param database_name database name +@param table_name table name +@param trx transaction (nullptr=start and commit a new one) +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_table_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param index_name name of the index +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t 
dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, + const char *index_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + pars_info_add_str_literal(pinfo, "index_name", index_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name;\n" + "END;\n", trx); +} + +/** Rename a table in InnoDB persistent stats storage. +@param old_name old table name +@param new_name new table name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_rename_table(const char *old_name, const char *new_name, + trx_t *trx) +{ + /* skip the statistics tables themselves */ + if (!strcmp(old_name, TABLE_STATS_NAME) || + !strcmp(old_name, INDEX_STATS_NAME) || + !strcmp(new_name, TABLE_STATS_NAME) || + !strcmp(new_name, INDEX_STATS_NAME)) + return DB_SUCCESS; + + char old_db[MAX_DB_UTF8_LEN]; + char new_db[MAX_DB_UTF8_LEN]; + char old_table[MAX_TABLE_UTF8_LEN]; + char new_table[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(old_name, old_db, sizeof old_db, old_table, sizeof old_table); + dict_fs2utf8(new_name, new_db, sizeof new_db, new_table, sizeof new_table); + + if (dict_table_t::is_temporary_name(old_name) || + dict_table_t::is_temporary_name(new_name)) + { + if (dberr_t e= dict_stats_delete_from_table_stats(old_db, old_table, trx)) + return e; + return dict_stats_delete_from_index_stats(old_db, old_table, trx); + } + + pars_info_t *pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "old_db", old_db); + pars_info_add_str_literal(pinfo, "old_table", old_table); + pars_info_add_str_literal(pinfo, "new_db", new_db); + 
pars_info_add_str_literal(pinfo, "new_table", new_table); + + static const char sql[]= + "PROCEDURE RENAME_TABLE_IN_STATS() IS\n" + "BEGIN\n" + "UPDATE \"" TABLE_STATS_NAME "\" SET\n" + "database_name=:new_db, table_name=:new_table\n" + "WHERE database_name=:old_db AND table_name=:old_table;\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET\n" + "database_name=:new_db, table_name=:new_table\n" + "WHERE database_name=:old_db AND table_name=:old_table;\n" + "END;\n"; + + return dict_stats_exec_sql(pinfo, sql, trx); +} + +/** Rename an index in InnoDB persistent statistics. +@param db database name +@param table table name +@param old_name old table name +@param new_name new table name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_rename_index(const char *db, const char *table, + const char *old_name, const char *new_name, + trx_t *trx) +{ + if (!dict_stats_persistent_storage_check(true)) + return DB_STATS_DO_NOT_EXIST; + pars_info_t *pinfo= pars_info_create(); + + pars_info_add_str_literal(pinfo, "db", db); + pars_info_add_str_literal(pinfo, "table", table); + pars_info_add_str_literal(pinfo, "old", old_name); + pars_info_add_str_literal(pinfo, "new", new_name); + + static const char sql[]= + "PROCEDURE RENAME_INDEX_IN_STATS() IS\n" + "BEGIN\n" + "UPDATE \"" INDEX_STATS_NAME "\" SET index_name=:new\n" + "WHERE database_name=:db AND table_name=:table AND index_name=:old;\n" + "END;\n"; + + return dict_stats_exec_sql(pinfo, sql, trx); +} + +/** Delete all persistent statistics for a database. 
+@param db database name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete(const char *db, trx_t *trx) +{ + static const char sql[] = + "PROCEDURE DROP_DATABASE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE database_name=:db;\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE database_name=:db;\n" + "END;\n"; + + pars_info_t *pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "db", db); + return dict_stats_exec_sql(pinfo, sql, trx); +} + +/* tests @{ */ +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +/* save/fetch aux macros @{ */ +#define TEST_DATABASE_NAME "foobardb" +#define TEST_TABLE_NAME "test_dict_stats" + +#define TEST_N_ROWS 111 +#define TEST_CLUSTERED_INDEX_SIZE 222 +#define TEST_SUM_OF_OTHER_INDEX_SIZES 333 + +#define TEST_IDX1_NAME "tidx1" +#define TEST_IDX1_COL1_NAME "tidx1_col1" +#define TEST_IDX1_INDEX_SIZE 123 +#define TEST_IDX1_N_LEAF_PAGES 234 +#define TEST_IDX1_N_DIFF1 50 +#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500 + +#define TEST_IDX2_NAME "tidx2" +#define TEST_IDX2_COL1_NAME "tidx2_col1" +#define TEST_IDX2_COL2_NAME "tidx2_col2" +#define TEST_IDX2_COL3_NAME "tidx2_col3" +#define TEST_IDX2_COL4_NAME "tidx2_col4" +#define TEST_IDX2_INDEX_SIZE 321 +#define TEST_IDX2_N_LEAF_PAGES 432 +#define TEST_IDX2_N_DIFF1 60 +#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600 +#define TEST_IDX2_N_DIFF2 61 +#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610 +#define TEST_IDX2_N_DIFF3 62 +#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620 +#define TEST_IDX2_N_DIFF4 63 +#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630 +/* @} */ + +/* test_dict_stats_save() @{ */ +void +test_dict_stats_save() +{ + dict_table_t table; + dict_index_t index1; + dict_field_t index1_fields[1]; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + dict_field_t index2_fields[4]; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* 
craft a dummy dict_table_t */ + table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + table.stat_n_rows = TEST_N_ROWS; + table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE; + table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES; + UT_LIST_INIT(table.indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + UT_LIST_ADD_LAST(table.indexes, &index1); + UT_LIST_ADD_LAST(table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + index1.table = &table; + index1.cached = 1; + index1.n_uniq = 1; + index1.fields = index1_fields; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + index1.stat_index_size = TEST_IDX1_INDEX_SIZE; + index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES; + index1_fields[0].name = TEST_IDX1_COL1_NAME; + index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1; + index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.name = TEST_IDX2_NAME; + index2.table = &table; + index2.cached = 1; + index2.n_uniq = 4; + index2.fields = index2_fields; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + index2.stat_index_size = TEST_IDX2_INDEX_SIZE; + index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES; + index2_fields[0].name = TEST_IDX2_COL1_NAME; + index2_fields[1].name = TEST_IDX2_COL2_NAME; + index2_fields[2].name = TEST_IDX2_COL3_NAME; + index2_fields[3].name = TEST_IDX2_COL4_NAME; + index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1; + index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2; + index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3; + index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4; + index2_stat_n_sample_sizes[0] = 
TEST_IDX2_N_DIFF1_SAMPLE_SIZE; + index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; + index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; + index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; + + ret = dict_stats_save(&table, NULL); + + ut_a(ret == DB_SUCCESS); + + printf("\nOK: stats saved successfully, now go ahead and read" + " what's inside %s and %s:\n\n", + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + + printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "n_rows = %d AND\n" + "clustered_index_size = %d AND\n" + "sum_of_other_index_sizes = %d;\n" + "\n", + TABLE_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_N_ROWS, + TEST_CLUSTERED_INDEX_SIZE, + TEST_SUM_OF_OTHER_INDEX_SIZES); + + printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX1_NAME, + TEST_IDX1_INDEX_SIZE, + TEST_IDX1_N_LEAF_PAGES, + TEST_IDX1_N_DIFF1, + TEST_IDX1_N_DIFF1_SAMPLE_SIZE, + TEST_IDX1_COL1_NAME); + + printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND 
/* test_dict_stats_fetch_from_ps() @{ */
/** Unit test (compiled only with UNIV_ENABLE_UNIT_TEST_DICT_STATS):
builds a stack-allocated dummy table with two indexes, fetches their
statistics with dict_stats_fetch_from_ps() and asserts that every value
equals the TEST_* constant. The expected rows are the ones that
test_dict_stats_save() writes.
NOTE(review): the fixture never sets index->table or
index->stat_n_non_null_key_vals, although
dict_stats_fetch_index_stats_step() stores into
stat_n_non_null_key_vals[]; confirm the fixture before enabling. */
void
test_dict_stats_fetch_from_ps()
{
	dict_table_t	table;
	dict_index_t	index1;
	ib_uint64_t	index1_stat_n_diff_key_vals[1];
	ib_uint64_t	index1_stat_n_sample_sizes[1];
	dict_index_t	index2;
	ib_uint64_t	index2_stat_n_diff_key_vals[4];
	ib_uint64_t	index2_stat_n_sample_sizes[4];
	dberr_t		ret;

	/* craft a dummy dict_table_t */
	table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
	UT_LIST_INIT(table.indexes, &dict_index_t::indexes);
#ifdef BTR_CUR_HASH_ADAPT
	UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes);
#endif /* BTR_CUR_HASH_ADAPT */
	UT_LIST_ADD_LAST(table.indexes, &index1);
	UT_LIST_ADD_LAST(table.indexes, &index2);
	ut_d(table.magic_n = DICT_TABLE_MAGIC_N);

	/* first index: one unique column, stats stored in local arrays */
	index1.name = TEST_IDX1_NAME;
	ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
	index1.cached = 1;
	index1.n_uniq = 1;
	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;

	/* second index: four unique columns */
	index2.name = TEST_IDX2_NAME;
	ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
	index2.cached = 1;
	index2.n_uniq = 4;
	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;

	ret = dict_stats_fetch_from_ps(&table);

	ut_a(ret == DB_SUCCESS);

	/* table-level statistics */
	ut_a(table.stat_n_rows == TEST_N_ROWS);
	ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
	ut_a(table.stat_sum_of_other_index_sizes
	     == TEST_SUM_OF_OTHER_INDEX_SIZES);

	/* statistics of the first index */
	ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE);
	ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
	ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1);
	ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);

	/* statistics of the second index, one pair per column prefix */
	ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE);
	ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
	ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1);
	ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
	ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2);
	ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
	ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3);
	ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
	ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4);
	ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);

	printf("OK: fetch successful\n");
}
/* @} */
+Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0stats_bg.cc +Code used for background table and index stats gathering. + +Created Apr 25, 2012 Vasil Dimov +*******************************************************/ + +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "dict0defrag_bg.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "mysqld.h" +#ifdef WITH_WSREP +# include "trx0trx.h" +# include "mysql/service_wsrep.h" +# include "wsrep.h" +# include "log.h" +#endif + +#include <vector> + +/** Minimum time interval between stats recalc for a given table */ +#define MIN_RECALC_INTERVAL 10 /* seconds */ +static void dict_stats_schedule(int ms); + +/** Protects recalc_pool */ +static mysql_mutex_t recalc_pool_mutex; + +/** for signaling recalc::state */ +static pthread_cond_t recalc_pool_cond; + +/** Work item of the recalc_pool; protected by recalc_pool_mutex */ +struct recalc +{ + /** identifies a table with persistent statistics */ + table_id_t id; + /** state of the entry */ + enum { IDLE, IN_PROGRESS, IN_PROGRESS_DELETING, DELETING} 
state; +}; + +/** The multitude of tables whose stats are to be automatically recalculated */ +typedef std::vector<recalc, ut_allocator<recalc>> recalc_pool_t; + +/** Pool where we store information on which tables are to be processed +by background statistics gathering. */ +static recalc_pool_t recalc_pool; +/** Whether the global data structures have been initialized */ +static bool stats_initialised; + +/*****************************************************************//** +Free the resources occupied by the recalc pool, called once during +thread de-initialization. */ +static void dict_stats_recalc_pool_deinit() +{ + ut_ad(!srv_read_only_mode); + + recalc_pool.clear(); + defrag_pool.clear(); + /* + recalc_pool may still have its buffer allocated. It will free it when + its destructor is called. + The problem is, memory leak detector is run before the recalc_pool's + destructor is invoked, and will report recalc_pool's buffer as leaked + memory. To avoid that, we force recalc_pool to surrender its buffer + to empty_pool object, which will free it when leaving this function: + */ + recalc_pool_t recalc_empty_pool; + defrag_pool_t defrag_empty_pool; + recalc_pool.swap(recalc_empty_pool); + defrag_pool.swap(defrag_empty_pool); +} + +/*****************************************************************//** +Add a table to the recalc pool, which is processed by the +background stats gathering thread. Only the table id is added to the +list, so the table can be closed after being enqueued and it will be +opened when needed. If the table does not exist later (has been DROPped), +then it will be removed from the pool and skipped. 
#ifdef WITH_WSREP
/** Update the table modification counter and if necessary,
schedule new estimates for table and index statistics to be calculated.
@param[in,out]	table	persistent or temporary table
@param[in]	trx	transaction; its is_wsrep() and mysql_thd are
			consulted before a background recalculation
			is scheduled */
void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
#else
/** Update the table modification counter and if necessary,
schedule new estimates for table and index statistics to be calculated.
@param[in,out]	table	persistent or temporary table */
void dict_stats_update_if_needed_func(dict_table_t *table)
#endif
{
	if (UNIV_UNLIKELY(!table->stat_initialized)) {
		/* The table may have been evicted from dict_sys
		and reloaded internally by InnoDB for FOREIGN KEY
		processing, but not reloaded by the SQL layer.

		We can (re)compute the transient statistics when the
		table is actually loaded by the SQL layer.

		Note: If InnoDB persistent statistics are enabled,
		we will skip the updates. We must do this, because
		dict_table_get_n_rows() below assumes that the
		statistics have been initialized. The DBA may have
		to execute ANALYZE TABLE. */
		return;
	}

	/* counter is the value before this call's increment */
	ulonglong	counter = table->stat_modified_counter++;
	ulonglong	n_rows = dict_table_get_n_rows(table);

	if (dict_stats_is_persistent_enabled(table)) {
		if (table->name.is_temporary()) {
			return;
		}
		/* Persistent statistics: hand the table to the background
		recalculation pool once more than a tenth of the rows
		have been modified, if auto recalc is enabled. */
		if (counter > n_rows / 10 /* 10% */
		    && dict_stats_auto_recalc_is_enabled(table)) {

#ifdef WITH_WSREP
			/* Do not add table to background
			statistic calculation if this thread is not a
			applier (as all DDL, which is replicated (i.e
			is binlogged in master node), will be executed
			with high priority (a.k.a BF) in slave nodes)
			and is BF. This could again lead BF lock
			waits in applier node but it is better than
			no persistent index/table statistics at
			applier nodes. TODO: allow BF threads
			wait for these InnoDB internal SQL-parser
			generated row locks and allow BF thread
			lock waits to be enqueued at head of waiting
			queue. */
			if (trx.is_wsrep()
			    && !wsrep_thd_is_applying(trx.mysql_thd)
			    && wsrep_thd_is_BF(trx.mysql_thd, 0)) {
				WSREP_DEBUG("Avoiding background statistics"
					    " calculation for table %s.",
					    table->name.m_name);
				return;
			}
#endif /* WITH_WSREP */

			dict_stats_recalc_pool_add(table->id);
			table->stat_modified_counter = 0;
		}
		return;
	}

	/* Calculate new statistics if 1 / 16 of table has been modified
	since the last time a statistics batch was run.
	We calculate statistics at most every 16th round, since we may have
	a counter table which is very small and updated very often. */
	ulonglong threshold = 16 + n_rows / 16; /* 6.25% */

	/* a nonzero srv_stats_modified_counter can only lower the
	threshold */
	if (srv_stats_modified_counter) {
		threshold = std::min(srv_stats_modified_counter, threshold);
	}

	if (counter > threshold) {
		/* this will reset table->stat_modified_counter to 0 */
		dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
	}
}
*/ +void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive) +{ + ut_ad(!srv_read_only_mode); + ut_ad(id); + + mysql_mutex_lock(&recalc_pool_mutex); + + auto end= recalc_pool.end(); + auto i= std::find_if(recalc_pool.begin(), end, + [&](const recalc &r){return r.id == id;}); + if (i != end) + { + switch (i->state) { + case recalc::IN_PROGRESS: + if (!have_mdl_exclusive) /* when true, skip the wait and erase the entry right away */ + { + i->state= recalc::IN_PROGRESS_DELETING; + do + { + my_cond_wait(&recalc_pool_cond, &recalc_pool_mutex.m_mutex); /* after waking, the entry is looked up again: it may be gone */ + end= recalc_pool.end(); + i= std::find_if(recalc_pool.begin(), end, + [&](const recalc &r){return r.id == id;}); + if (i == end) + goto done; + } + while (i->state == recalc::IN_PROGRESS_DELETING); + } + /* fall through */ + case recalc::IDLE: + recalc_pool.erase(i); + break; + case recalc::IN_PROGRESS_DELETING: + case recalc::DELETING: + /* another thread will delete the entry in dict_stats_recalc_pool_del() */ + break; + } + } + +done: + mysql_mutex_unlock(&recalc_pool_mutex); +} + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats_thread() is started. +NOTE(review): no dict_stats_thread() is visible in this part of the file; the work appears to run from dict_stats_func() on a timer - confirm and rename. */ +void dict_stats_init() +{ + ut_ad(!srv_read_only_mode); + mysql_mutex_init(recalc_pool_mutex_key, &recalc_pool_mutex, nullptr); + pthread_cond_init(&recalc_pool_cond, nullptr); + dict_defrag_pool_init(); + stats_initialised= true; /* dict_stats_deinit() is a no-op unless this is set */ +} + +/*****************************************************************//** +Free resources allocated by dict_stats_init(), must be called +after dict_stats task has exited.
*/ +void dict_stats_deinit() +{ + if (!stats_initialised) { + return; + } + + ut_ad(!srv_read_only_mode); + stats_initialised = false; + + dict_stats_recalc_pool_deinit(); + dict_defrag_pool_deinit(); + + mysql_mutex_destroy(&recalc_pool_mutex); + pthread_cond_destroy(&recalc_pool_cond); +} + +/** +Get the first table that has been added for auto recalc and eventually +update its stats. +@param thd session passed to dict_table_open_on_id() and dict_table_close() +@return whether dict_stats_update() was invoked for a table; false if no IDLE entry exists or table->stats_last_recalc is less than MIN_RECALC_INTERVAL in the past */ +static bool dict_stats_process_entry_from_recalc_pool(THD *thd) +{ + ut_ad(!srv_read_only_mode); + table_id_t table_id; + mysql_mutex_lock(&recalc_pool_mutex); +next_table_id_with_mutex: + for (auto &r : recalc_pool) + { + if ((table_id= r.id) && r.state == recalc::IDLE) + { + r.state= recalc::IN_PROGRESS; + mysql_mutex_unlock(&recalc_pool_mutex); + goto process; + } + } + mysql_mutex_unlock(&recalc_pool_mutex); + return false; + +process: + MDL_ticket *mdl= nullptr; + dict_table_t *table= dict_table_open_on_id(table_id, false, + DICT_TABLE_OP_NORMAL, thd, &mdl); + if (!table) + { +invalid_table_id: + mysql_mutex_lock(&recalc_pool_mutex); + auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(), + [&](const recalc &r){return r.id == table_id;}); + if (i == recalc_pool.end()); /* entry is already gone: nothing to clean up */ + else if (UNIV_LIKELY(i->state == recalc::IN_PROGRESS)) + recalc_pool.erase(i); + else + { + ut_ad(i->state == recalc::IN_PROGRESS_DELETING); + i->state= recalc::DELETING; + pthread_cond_broadcast(&recalc_pool_cond); + } + goto next_table_id_with_mutex; /* recalc_pool_mutex is still held at this jump */ + } + + ut_ad(!table->is_temporary()); + + if (!mdl || !table->is_accessible()) + { + dict_table_close(table, false, thd, mdl); + goto invalid_table_id; + } + + /* time() could be expensive, the current function + is called once every time a table has been changed more than 10% and + on a system with lots of small tables, this could become hot.
If we + find out that this is a problem, then the check below could eventually + be replaced with something else, though a time interval is the natural + approach. */ + const bool update_now= + difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL; + + const dberr_t err= update_now + ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT) + : DB_SUCCESS_LOCKED_REC; /* used as a marker for "skipped": the table is queued again below */ + + dict_table_close(table, false, thd, mdl); + + mysql_mutex_lock(&recalc_pool_mutex); + auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(), + [&](const recalc &r){return r.id == table_id;}); + if (i == recalc_pool.end()) + goto done; + else if (i->state == recalc::IN_PROGRESS_DELETING) + { + i->state= recalc::DELETING; + pthread_cond_broadcast(&recalc_pool_cond); +done: + mysql_mutex_unlock(&recalc_pool_mutex); + } + else + { + ut_ad(i->state == recalc::IN_PROGRESS); + recalc_pool.erase(i); + const bool reschedule= !update_now && recalc_pool.empty(); /* evaluated before the entry is queued again */ + if (err == DB_SUCCESS_LOCKED_REC) + recalc_pool.emplace_back(recalc{table_id, recalc::IDLE}); + mysql_mutex_unlock(&recalc_pool_mutex); + if (reschedule) + dict_stats_schedule(MIN_RECALC_INTERVAL * 1000); /* the parameter of dict_stats_schedule() is named ms */ + } + + return update_now; +} + +static tpool::timer* dict_stats_timer; /* created in dict_stats_start(), deleted in dict_stats_shutdown() */ +static std::mutex dict_stats_mutex; /* held around every access to dict_stats_timer in this file section */ + +static void dict_stats_func(void*) +{ + THD *thd= innobase_create_background_thd("InnoDB statistics"); + set_current_thd(thd); + while (dict_stats_process_entry_from_recalc_pool(thd)) {} /* repeat until the call returns false */ + dict_defrag_process_entries_from_defrag_pool(thd); + set_current_thd(nullptr); + destroy_background_thd(thd); +} + + +void dict_stats_start() +{ + std::lock_guard<std::mutex> lk(dict_stats_mutex); + if (!dict_stats_timer) + dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func); +} + + +static void dict_stats_schedule(int ms) +{ + std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock); + /* + Use try_lock() to avoid deadlock in dict_stats_shutdown(), which + uses dict_stats_mutex too.
If there is simultaneous timer reschedule, + the first one will win, which is fine. + */ + if (!lk.try_lock()) + { + return; + } + if (dict_stats_timer) + dict_stats_timer->set_time(ms,0); +} + +void dict_stats_schedule_now() +{ + dict_stats_schedule(0); +} + +/** Shut down the statistics task: delete dict_stats_timer while holding dict_stats_mutex. */ +void dict_stats_shutdown() +{ + std::lock_guard<std::mutex> lk(dict_stats_mutex); + delete dict_stats_timer; + dict_stats_timer= 0; +} diff --git a/storage/innobase/dict/drop.cc b/storage/innobase/dict/drop.cc new file mode 100644 index 00000000..dce71974 --- /dev/null +++ b/storage/innobase/dict/drop.cc @@ -0,0 +1,297 @@ +/***************************************************************************** + +Copyright (c) 2021, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** +@file dict/drop.cc +Data Definition Language operations that delete .ibd files */ + +/* We implement atomic data dictionary operations as follows. + +1. A data dictionary transaction is started. +2. We acquire exclusive lock on all the tables that are to be dropped +during the execution of the transaction. +3. We lock the data dictionary cache. +4. All metadata tables will be updated within the single DDL transaction, +including deleting or renaming InnoDB persistent statistics. +4b.
If any lock wait would occur while we are holding the dict_sys latches, +we will instantly report a timeout error and roll back the transaction. +5. The transaction metadata is marked as committed. +6. If any files were deleted, we will durably write FILE_DELETE +to the redo log and start deleting the files. +6b. Also purge after a commit may perform file deletion. This is also the +recovery mechanism if the server was killed between step 5 and 6. +7. We unlock the data dictionary cache. +8. The file handles of the unlinked files will be closed. This will actually +reclaim the space in the file system (delete-on-close semantics). + +Notes: + +(a) Purge will be locked out by MDL. For internal tables related to +FULLTEXT INDEX, purge will not acquire MDL on the user table name, +and therefore, when we are dropping any FTS_ tables, we must suspend +and resume purge to prevent a race condition. + +(b) If a transaction needs to both drop and create a table by some +name, it must rename the table in between. This is used by +ha_innobase::truncate() and fts_drop_common_tables(). + +(c) No data is ever destroyed before the transaction is committed, +so we can trivially roll back the transaction at any time. +Lock waits during a DDL operation are no longer a fatal error +that would cause the InnoDB to hang or to intentionally crash. +(Only ALTER TABLE...DISCARD TABLESPACE may discard data before commit.) + +(d) The only changes to the data dictionary cache that are performed +before transaction commit and must be rolled back explicitly are as follows: +(d1) fts_optimize_add_table() to undo fts_optimize_remove_table() +*/ + +#include "trx0purge.h" +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" + +#include "dict0defrag_bg.h" +#include "btr0defragment.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" + +#include "que0que.h" +#include "pars0pars.h" + +/** Try to drop the foreign key constraints for a persistent table. 
+@param name name of persistent table +@return error code */ +dberr_t trx_t::drop_table_foreign(const table_name_t &name) +{ + ut_ad(dict_sys.locked()); + ut_ad(state == TRX_STATE_ACTIVE); + ut_ad(dict_operation); + ut_ad(dict_operation_lock_mode); + + if (!dict_sys.sys_foreign || dict_sys.sys_foreign->corrupted) + return DB_SUCCESS; /* SYS_FOREIGN is missing or marked corrupted: nothing is deleted */ + + if (!dict_sys.sys_foreign_cols || dict_sys.sys_foreign_cols->corrupted) + return DB_SUCCESS; /* same for SYS_FOREIGN_COLS */ + + pars_info_t *info= pars_info_create(); + pars_info_add_str_literal(info, "name", name.m_name); + return que_eval_sql(info, + "PROCEDURE DROP_FOREIGN() IS\n" + "fid CHAR;\n" + + "DECLARE CURSOR fk IS\n" + "SELECT ID FROM SYS_FOREIGN\n" + "WHERE FOR_NAME=:name\n" + "AND TO_BINARY(FOR_NAME)=TO_BINARY(:name)\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "OPEN fk;\n" + "WHILE 1=1 LOOP\n" + " FETCH fk INTO fid;\n" + " IF (SQL % NOTFOUND)THEN RETURN;END IF;\n" + " DELETE FROM SYS_FOREIGN_COLS" + " WHERE ID=fid;\n" + " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n" + "END LOOP;\n" + "CLOSE fk;\n" + "END;\n", this); +} + +/** Try to drop the statistics for a persistent table. +@param name name of persistent table +@return error code; DB_STATS_DO_NOT_EXIST is reported as DB_SUCCESS */ +dberr_t trx_t::drop_table_statistics(const table_name_t &name) +{ + ut_ad(dict_sys.locked()); + ut_ad(dict_operation_lock_mode); + + if (strstr(name.m_name, "/" TEMP_FILE_PREFIX_INNODB) || + !strcmp(name.m_name, TABLE_STATS_NAME) || + !strcmp(name.m_name, INDEX_STATS_NAME)) + return DB_SUCCESS; /* names with the temporary prefix and the two statistics tables themselves are skipped */ + + char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN]; + dict_fs2utf8(name.m_name, db, sizeof db, table, sizeof table); + + dberr_t err= dict_stats_delete_from_table_stats(db, table, this); + if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST) + { + err= dict_stats_delete_from_index_stats(db, table, this); + if (err == DB_STATS_DO_NOT_EXIST) + err= DB_SUCCESS; + } + return err; +} + +/** Try to drop a persistent table.
+@param table persistent table +@return error code */ +dberr_t trx_t::drop_table(const dict_table_t &table) +{ + ut_ad(dict_sys.locked()); + ut_ad(state == TRX_STATE_ACTIVE); + ut_ad(dict_operation); + ut_ad(dict_operation_lock_mode); + ut_ad(!table.is_temporary()); + /* The table must be exclusively locked by this transaction. */ + ut_ad(table.get_ref_count() <= 1); + ut_ad(table.n_lock_x_or_s == 1); + ut_ad(UT_LIST_GET_LEN(table.locks) >= 1); +#ifdef UNIV_DEBUG + bool found_x= false; + for (lock_t *lock= UT_LIST_GET_FIRST(table.locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + { + ut_ad(lock->trx == this); + switch (lock->type_mode) { + case LOCK_TABLE | LOCK_X: + found_x= true; + break; + case LOCK_TABLE | LOCK_IX: + case LOCK_TABLE | LOCK_AUTO_INC: + break; + default: + ut_ad("unexpected lock type" == 0); + } + } + ut_ad(found_x); +#endif + + if (dict_sys.sys_virtual && !dict_sys.sys_virtual->corrupted) + { + pars_info_t *info= pars_info_create(); + pars_info_add_ull_literal(info, "id", table.id); + if (dberr_t err= que_eval_sql(info, + "PROCEDURE DROP_VIRTUAL() IS\n" + "BEGIN\n" + "DELETE FROM SYS_VIRTUAL" + " WHERE TABLE_ID=:id;\n" + "END;\n", this)) + return err; + } + + /* Once DELETE FROM SYS_INDEXES is committed, purge may invoke + dict_drop_index_tree(). */ + + if (!(table.flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS))); /* neither FTS flag is set: fts_drop_tables() is skipped */ + else if (dberr_t err= fts_drop_tables(this, table)) + { + ib::error() << "Unable to remove FTS tables for " + << table.name << ": " << err; + return err; + } + + mod_tables.emplace(const_cast<dict_table_t*>(&table), undo_no).
+ first->second.set_dropped(); /* trx_t::commit() acts on entries for which is_dropped() holds */ + + pars_info_t *info= pars_info_create(); + pars_info_add_ull_literal(info, "id", table.id); + return que_eval_sql(info, + "PROCEDURE DROP_TABLE() IS\n" + "iid CHAR;\n" + + "DECLARE CURSOR idx IS\n" + "SELECT ID FROM SYS_INDEXES\n" + "WHERE TABLE_ID=:id FOR UPDATE;\n" + + "BEGIN\n" + + "DELETE FROM SYS_TABLES WHERE ID=:id;\n" + "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=:id;\n" + + "OPEN idx;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n" + "END LOOP;\n" + "CLOSE idx;\n" + + "END;\n", this); +} + +/** Commit the transaction, possibly after drop_table(). +@param deleted handles of data files that were deleted (appended to; handles equal to OS_FILE_CLOSED are not added) */ +void trx_t::commit(std::vector<pfs_os_file_t> &deleted) +{ + ut_ad(dict_operation); + flush_log_later= true; + commit_persist(); + flush_log_later= false; + if (dict_operation) + { + std::vector<uint32_t> space_ids; + space_ids.reserve(mod_tables.size()); + ut_ad(dict_sys.locked()); + lock_sys.wr_lock(SRW_LOCK_CALL); + mutex_lock(); + lock_release_on_drop(this); + ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0); + ut_ad(ib_vector_is_empty(autoinc_locks)); + mem_heap_empty(lock.lock_heap); + lock.table_locks.clear(); + /* commit_persist() already reset this.
*/ + ut_ad(!lock.was_chosen_as_deadlock_victim); + lock.n_rec_locks= 0; + while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables)) + { + UT_LIST_REMOVE(lock.evicted_tables, table); + dict_mem_table_free(table); + } + dict_operation= false; + id= 0; + mutex_unlock(); + + for (const auto &p : mod_tables) + { + if (p.second.is_dropped()) + { + dict_table_t *table= p.first; + dict_stats_recalc_pool_del(table->id, true); + dict_stats_defrag_pool_del(table, nullptr); + if (btr_defragment_active) + btr_defragment_remove_table(table); + const fil_space_t *space= table->space; /* read before dict_sys.remove(table) below */ + ut_ad(!p.second.is_aux_table() || purge_sys.must_wait_FTS()); + dict_sys.remove(table); + if (const auto id= space ? space->id : 0) /* a null space or a zero id is skipped */ + { + space_ids.emplace_back(id); + pfs_os_file_t d= fil_delete_tablespace(id); + if (d != OS_FILE_CLOSED) + deleted.emplace_back(d); + } + } + } + + lock_sys.wr_unlock(); + + mysql_mutex_lock(&lock_sys.wait_mutex); + lock_sys.deadlock_check(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + + for (const auto id : space_ids) /* runs after lock_sys.wr_unlock() */ + ibuf_delete_for_discarded_space(id); + } + commit_cleanup(); +} |