summaryrefslogtreecommitdiffstats
path: root/storage/innobase
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase')
-rw-r--r--storage/innobase/CMakeLists.txt28
-rw-r--r--storage/innobase/btr/btr0btr.cc183
-rw-r--r--storage/innobase/btr/btr0bulk.cc2
-rw-r--r--storage/innobase/btr/btr0cur.cc107
-rw-r--r--storage/innobase/btr/btr0pcur.cc209
-rw-r--r--storage/innobase/buf/buf0block_hint.cc59
-rw-r--r--storage/innobase/buf/buf0buf.cc264
-rw-r--r--storage/innobase/buf/buf0flu.cc208
-rw-r--r--storage/innobase/buf/buf0lru.cc184
-rw-r--r--storage/innobase/buf/buf0rea.cc2
-rw-r--r--storage/innobase/dict/dict0dict.cc136
-rw-r--r--storage/innobase/dict/dict0stats.cc4
-rw-r--r--storage/innobase/fil/fil0crypt.cc2
-rw-r--r--storage/innobase/fil/fil0fil.cc86
-rw-r--r--storage/innobase/fsp/fsp0file.cc7
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc299
-rw-r--r--storage/innobase/fts/fts0fts.cc62
-rw-r--r--storage/innobase/fts/fts0opt.cc2
-rw-r--r--storage/innobase/fts/fts0que.cc47
-rw-r--r--storage/innobase/fut/fut0lst.cc135
-rw-r--r--storage/innobase/gis/gis0sea.cc117
-rw-r--r--storage/innobase/handler/ha_innodb.cc806
-rw-r--r--storage/innobase/handler/ha_innodb.h7
-rw-r--r--storage/innobase/handler/handler0alter.cc71
-rw-r--r--storage/innobase/ibuf/ibuf0ibuf.cc61
-rw-r--r--storage/innobase/include/btr0btr.h17
-rw-r--r--storage/innobase/include/btr0pcur.h8
-rw-r--r--storage/innobase/include/buf0block_hint.h76
-rw-r--r--storage/innobase/include/buf0buf.h50
-rw-r--r--storage/innobase/include/buf0buf.inl14
-rw-r--r--storage/innobase/include/buf0flu.h10
-rw-r--r--storage/innobase/include/cache.h33
-rw-r--r--storage/innobase/include/data0data.h13
-rw-r--r--storage/innobase/include/data0data.inl30
-rw-r--r--storage/innobase/include/db0err.h12
-rw-r--r--storage/innobase/include/dict0dict.h67
-rw-r--r--storage/innobase/include/dict0mem.h5
-rw-r--r--storage/innobase/include/dict0mem.inl1
-rw-r--r--storage/innobase/include/dyn0buf.h9
-rw-r--r--storage/innobase/include/dyn0types.h3
-rw-r--r--storage/innobase/include/fil0fil.h60
-rw-r--r--storage/innobase/include/fsp0fsp.h20
-rw-r--r--storage/innobase/include/fts0fts.h6
-rw-r--r--storage/innobase/include/fts0priv.h21
-rw-r--r--storage/innobase/include/fts0priv.inl44
-rw-r--r--storage/innobase/include/fts0types.h38
-rw-r--r--storage/innobase/include/fts0types.inl47
-rw-r--r--storage/innobase/include/fut0lst.h50
-rw-r--r--storage/innobase/include/gis0type.h6
-rw-r--r--storage/innobase/include/lock0lock.h74
-rw-r--r--storage/innobase/include/log0crypt.h3
-rw-r--r--storage/innobase/include/log0log.h171
-rw-r--r--storage/innobase/include/mtr0mtr.h28
-rw-r--r--storage/innobase/include/os0file.h89
-rw-r--r--storage/innobase/include/os0file.inl8
-rw-r--r--storage/innobase/include/row0merge.h11
-rw-r--r--storage/innobase/include/row0row.h6
-rw-r--r--storage/innobase/include/row0sel.h4
-rw-r--r--storage/innobase/include/srv0mon.h2
-rw-r--r--storage/innobase/include/srv0srv.h4
-rw-r--r--storage/innobase/include/srw_lock.h50
-rw-r--r--storage/innobase/include/trx0purge.h181
-rw-r--r--storage/innobase/include/trx0rseg.h12
-rw-r--r--storage/innobase/include/trx0trx.h31
-rw-r--r--storage/innobase/include/trx0undo.inl3
-rw-r--r--storage/innobase/include/ut0new.h3
-rw-r--r--storage/innobase/include/ut0ut.h14
-rw-r--r--storage/innobase/include/ut0vec.h9
-rw-r--r--storage/innobase/include/ut0vec.inl13
-rw-r--r--storage/innobase/lock/lock0lock.cc437
-rw-r--r--storage/innobase/log/log0log.cc38
-rw-r--r--storage/innobase/log/log0recv.cc32
-rw-r--r--storage/innobase/mtr/mtr0mtr.cc525
-rw-r--r--storage/innobase/os/os0file.cc805
-rw-r--r--storage/innobase/page/page0zip.cc3
-rw-r--r--storage/innobase/rem/rem0rec.cc2
-rw-r--r--storage/innobase/row/row0import.cc511
-rw-r--r--storage/innobase/row/row0ins.cc23
-rw-r--r--storage/innobase/row/row0merge.cc92
-rw-r--r--storage/innobase/row/row0mysql.cc5
-rw-r--r--storage/innobase/row/row0quiesce.cc5
-rw-r--r--storage/innobase/row/row0sel.cc25
-rw-r--r--storage/innobase/row/row0umod.cc2
-rw-r--r--storage/innobase/srv/srv0mon.cc10
-rw-r--r--storage/innobase/srv/srv0start.cc165
-rw-r--r--storage/innobase/sync/cache.cc160
-rw-r--r--storage/innobase/sync/srw_lock.cc121
-rw-r--r--storage/innobase/trx/trx0purge.cc247
-rw-r--r--storage/innobase/trx/trx0rseg.cc62
-rw-r--r--storage/innobase/trx/trx0trx.cc24
-rw-r--r--storage/innobase/trx/trx0undo.cc42
-rw-r--r--storage/innobase/unittest/CMakeLists.txt4
-rw-r--r--storage/innobase/unittest/innodb_rbt-t.cc83
-rw-r--r--storage/innobase/ut/ut0ut.cc47
94 files changed, 4290 insertions, 3624 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index 32c0a437..b3125ca9 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -48,6 +48,13 @@ IF(UNIX)
IF(HAVE_LIBNUMA)
LINK_LIBRARIES(numa)
ENDIF()
+ IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch|AARCH|p(ower)?pc|x86_|amd)64")
+ OPTION(WITH_INNODB_PMEM "Support memory-mapped InnoDB redo log" ON)
+ ELSE() # Disable by default on ISA that are not covered by our CI
+ OPTION(WITH_INNODB_PMEM "Support memory-mapped InnoDB redo log" OFF)
+ ENDIF()
+ ENDIF()
ENDIF()
ENDIF()
@@ -71,7 +78,7 @@ ADD_FEATURE_INFO(INNODB_ROOT_GUESS WITH_INNODB_ROOT_GUESS
OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF)
IF(WITH_INNODB_EXTRA_DEBUG)
- ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG)
+ ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG -DLOG_LATCH_DEBUG)
ENDIF()
ADD_FEATURE_INFO(INNODB_EXTRA_DEBUG WITH_INNODB_EXTRA_DEBUG "Extra InnoDB debug checks")
@@ -133,7 +140,6 @@ SET(INNOBASE_SOURCES
btr/btr0pcur.cc
btr/btr0sea.cc
btr/btr0defragment.cc
- buf/buf0block_hint.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@@ -428,26 +434,16 @@ SET(INNOBASE_SOURCES
ut/ut0vec.cc
ut/ut0wqueue.cc)
-OPTION(WITH_PMEM "Support redo log in persistent memory" OFF)
-FIND_PACKAGE(PMEM)
-IF(PMEM_FOUND)
- INCLUDE_DIRECTORIES(${PMEM_INCLUDES})
- ADD_COMPILE_FLAGS(log/log0log.cc log/log0recv.cc
- buf/buf0flu.cc mtr/mtr0mtr.cc trx/trx0trx.cc srv/srv0start.cc
- COMPILE_FLAGS "-DHAVE_PMEM")
- SET(PMEM_LIBRARY ${PMEM_LIBRARIES})
-ELSE()
- IF(WITH_PMEM)
- MESSAGE(FATAL_ERROR "WITH_PMEM=ON cannot be satisfied")
- ENDIF()
+IF(WITH_INNODB_PMEM)
+ ADD_DEFINITIONS(-DHAVE_PMEM)
+ SET(INNOBASE_SOURCES ${INNOBASE_SOURCES} include/cache.h sync/cache.cc)
ENDIF()
MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
MODULE_OUTPUT_NAME ha_innodb
DEFAULT RECOMPILE_FOR_EMBEDDED
LINK_LIBRARIES
- ${ZLIB_LIBRARY}
- ${PMEM_LIBRARY}
+ ${ZLIB_LIBRARIES}
${NUMA_LIBRARY}
${LIBSYSTEMD}
${LINKER_SCRIPT})
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 705ff035..6b3a3733 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -264,6 +264,8 @@ btr_root_block_get(
mtr_t* mtr, /*!< in: mtr */
dberr_t* err) /*!< out: error code */
{
+ ut_ad(mode != RW_NO_LATCH);
+
if (!index->table || !index->table->space)
{
*err= DB_TABLESPACE_NOT_FOUND;
@@ -285,13 +287,12 @@ btr_root_block_get(
if (UNIV_LIKELY(block != nullptr))
{
- if (UNIV_UNLIKELY(mode == RW_NO_LATCH));
- else if (!!page_is_comp(block->page.frame) !=
- index->table->not_redundant() ||
- btr_page_get_index_id(block->page.frame) != index->id ||
- !fil_page_index_page_check(block->page.frame) ||
- index->is_spatial() !=
- (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
+ if (!!page_is_comp(block->page.frame) !=
+ index->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index->id ||
+ !fil_page_index_page_check(block->page.frame) ||
+ index->is_spatial() !=
+ (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
{
*err= DB_PAGE_CORRUPTED;
block= nullptr;
@@ -561,13 +562,39 @@ btr_page_alloc_for_ibuf(
{
buf_page_make_young_if_needed(&new_block->page);
*err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
- PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+ PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ fil_system.sys_space->free_limit, mtr);
}
ut_d(if (*err == DB_SUCCESS)
flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
return new_block;
}
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Acquire a latch on the index root page for allocating or freeing pages.
+@param index index tree
+@param mtr mini-transaction
+@param err error code
+@return root page
+@retval nullptr if an error occurred */
+buf_block_t *btr_root_block_sx(dict_index_t *index, mtr_t *mtr, dberr_t *err)
+{
+ buf_block_t *root=
+ mtr->get_already_latched(page_id_t{index->table->space_id, index->page},
+ MTR_MEMO_PAGE_SX_FIX);
+ if (!root)
+ {
+ root= btr_root_block_get(index, RW_SX_LATCH, mtr, err);
+ if (UNIV_UNLIKELY(!root))
+ return root;
+ }
+#ifdef BTR_CUR_HASH_ADAPT
+ else
+ ut_ad(!root->index || !root->index->freed());
+#endif
+ return root;
+}
+
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@@ -589,21 +616,9 @@ btr_page_alloc_low(
page should be initialized. */
dberr_t* err) /*!< out: error code */
{
- const auto savepoint= mtr->get_savepoint();
- buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err);
+ buf_block_t *root= btr_root_block_sx(index, mtr, err);
if (UNIV_UNLIKELY(!root))
return root;
-
- const bool have_latch= mtr->have_u_or_x_latch(*root);
-#ifdef BTR_CUR_HASH_ADAPT
- ut_ad(!have_latch || !root->index || !root->index->freed());
-#endif
- mtr->rollback_to_savepoint(savepoint);
-
- if (!have_latch &&
- UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err))))
- return root;
-
fseg_header_t *seg_header= root->page.frame +
(level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF);
return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction,
@@ -652,7 +667,8 @@ btr_page_free_for_ibuf(
buf_block_t *root= btr_get_latched_root(*index, mtr);
dberr_t err=
flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ fil_system.sys_space->free_limit, mtr);
ut_d(if (err == DB_SUCCESS)
flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
return err;
@@ -696,24 +712,16 @@ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
fil_space_t *space= index->table->space;
dberr_t err;
- const auto savepoint= mtr->get_savepoint();
- if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err))
+ if (buf_block_t *root= btr_root_block_sx(index, mtr, &err))
{
- const bool have_latch= mtr->have_u_or_x_latch(*root);
-#ifdef BTR_CUR_HASH_ADAPT
- ut_ad(!have_latch || !root->index || !root->index->freed());
-#endif
- mtr->rollback_to_savepoint(savepoint);
- if (have_latch ||
- (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err)))
- err= fseg_free_page(&root->page.frame[blob ||
- page_is_leaf(block->page.frame)
- ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
- : PAGE_HEADER + PAGE_BTR_SEG_TOP],
- space, page, mtr, space_latched);
+ err= fseg_free_page(&root->page.frame[blob ||
+ page_is_leaf(block->page.frame)
+ ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ : PAGE_HEADER + PAGE_BTR_SEG_TOP],
+ space, page, mtr, space_latched);
+ if (err == DB_SUCCESS)
+ buf_page_free(space, page, mtr);
}
- if (err == DB_SUCCESS)
- buf_page_free(space, page, mtr);
/* The page was marked free in the allocation bitmap, but it
should remain exclusively latched until mtr_t::commit() or until it
@@ -1291,54 +1299,71 @@ btr_read_autoinc(dict_index_t* index)
return autoinc;
}
-/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
-or fall back to MAX(auto_increment_column).
-@param[in] table table containing an AUTO_INCREMENT column
-@param[in] col_no index of the AUTO_INCREMENT column
-@return the AUTO_INCREMENT value
-@retval 0 on error or if no AUTO_INCREMENT value was used yet */
-ib_uint64_t
-btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+dict_index_t *dict_table_t::get_index(const dict_col_t &col) const
{
- ut_ad(table->persistent_autoinc);
- ut_ad(!table->is_temporary());
+ dict_index_t *index= dict_table_get_first_index(this);
- dict_index_t* index = dict_table_get_first_index(table);
+ while (index && (index->fields[0].col != &col || index->is_corrupted()))
+ index= dict_table_get_next_index(index);
- if (index == NULL) {
- return 0;
- }
+ return index;
+}
- mtr_t mtr;
- mtr.start();
- buf_block_t* block = buf_page_get(
- page_id_t(index->table->space_id, index->page),
- index->table->space->zip_size(),
- RW_S_LATCH, &mtr);
-
- ib_uint64_t autoinc = block
- ? page_get_autoinc(block->page.frame) : 0;
- const bool retry = block && autoinc == 0
- && !page_is_empty(block->page.frame);
- mtr.commit();
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param table table containing an AUTO_INCREMENT column
+@param col_no index of the AUTO_INCREMENT column
+@param mysql_version TABLE_SHARE::mysql_version
+@param max the maximum value of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+uint64_t btr_read_autoinc_with_fallback(const dict_table_t *table,
+ unsigned col_no, ulong mysql_version,
+ uint64_t max)
+{
+ ut_ad(table->persistent_autoinc);
+ ut_ad(!table->is_temporary());
- if (retry) {
- /* This should be an old data file where
- PAGE_ROOT_AUTO_INC was initialized to 0.
- Fall back to reading MAX(autoinc_col).
- There should be an index on it. */
- const dict_col_t* autoinc_col
- = dict_table_get_nth_col(table, col_no);
- while (index && index->fields[0].col != autoinc_col) {
- index = dict_table_get_next_index(index);
- }
+ uint64_t autoinc= 0;
+ mtr_t mtr;
+ mtr.start();
- if (index) {
- autoinc = row_search_max_autoinc(index);
- }
- }
+ if (buf_block_t *block=
+ buf_page_get(page_id_t(table->space_id,
+ dict_table_get_first_index(table)->page),
+ table->space->zip_size(), RW_SX_LATCH, &mtr))
+ {
+ autoinc= page_get_autoinc(block->page.frame);
- return autoinc;
+ if (autoinc > 0 && autoinc <= max && mysql_version >= 100210);
+ else if (dict_index_t *index=
+ table->get_index(*dict_table_get_nth_col(table, col_no)))
+ {
+ /* Read MAX(autoinc_col), in case this table had originally been
+ created before MariaDB 10.2.4 introduced persistent AUTO_INCREMENT
+ and MariaDB 10.2.10 fixed MDEV-12123, and there could be a garbage
+ value in the PAGE_ROOT_AUTO_INC field. */
+ const uint64_t max_autoinc= row_search_max_autoinc(index);
+ const bool need_adjust{autoinc > max || autoinc < max_autoinc};
+ ut_ad(max_autoinc <= max);
+
+ if (UNIV_UNLIKELY(need_adjust) && !high_level_read_only && !opt_readonly)
+ {
+ sql_print_information("InnoDB: Resetting PAGE_ROOT_AUTO_INC from "
+ UINT64PF " to " UINT64PF
+ " on table %`.*s.%`s (created with version %lu)",
+ autoinc, max_autoinc,
+ int(table->name.dblen()), table->name.m_name,
+ table->name.basename(), mysql_version);
+ autoinc= max_autoinc;
+ index->set_modified(mtr);
+ page_set_autoinc(block, max_autoinc, &mtr, true);
+ }
+ }
+ }
+
+ mtr.commit();
+ return autoinc;
}
/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index 5bf68c58..e2513ad6 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -837,7 +837,7 @@ PageBulk::release()
m_block->page.fix();
/* No other threads can modify this block. */
- m_modify_clock = buf_block_get_modify_clock(m_block);
+ m_modify_clock = m_block->modify_clock;
m_mtr.commit();
}
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 46afb73b..2fc05b06 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -817,7 +817,7 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index)
/* Determine the maximum length of the index field. */
field_max_size = dict_col_get_fixed_size(col, comp);
- if (field_max_size) {
+ if (field_max_size && field->fixed_len) {
/* dict_index_add_col() should guarantee this */
ut_ad(!field->prefix_len
|| field->fixed_len == field->prefix_len);
@@ -935,7 +935,7 @@ static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)
return PAGE_CUR_LE;
}
-static MY_ATTRIBUTE((nonnull))
+MY_ATTRIBUTE((nonnull,warn_unused_result))
/** Acquire a latch on the previous page without violating the latching order.
@param block index page
@param page_id page identifier with valid space identifier
@@ -946,8 +946,9 @@ static MY_ATTRIBUTE((nonnull))
@retval 0 if an error occurred
@retval 1 if the page could be latched in the wrong order
@retval -1 if the latch on block was temporarily released */
-int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size,
- rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err)
+static int btr_latch_prev(buf_block_t *block, page_id_t page_id,
+ ulint zip_size,
+ rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err)
{
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
ut_ad(page_id.space() == block->page.id().space());
@@ -955,47 +956,80 @@ int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size,
const auto prev_savepoint= mtr->get_savepoint();
ut_ad(block == mtr->at_savepoint(prev_savepoint - 1));
- page_id.set_page_no(btr_page_get_prev(block->page.frame));
+ const page_t *const page= block->page.frame;
+ page_id.set_page_no(btr_page_get_prev(page));
+ /* We are holding a latch on the current page.
+
+ We will start by buffer-fixing the left sibling. Waiting for a latch
+ on it while holding a latch on the current page could lead to a
+ deadlock, because another thread could hold that latch and wait for
+ a right sibling page latch (the current page).
+
+ If there is a conflict, we will temporarily release our latch on the
+ current block while waiting for a latch on the left sibling. The
+ buffer-fixes on both blocks will prevent eviction. */
+
+ retry:
buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr,
BUF_GET, mtr, err, false);
if (UNIV_UNLIKELY(!prev))
return 0;
int ret= 1;
- if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH))
+ static_assert(MTR_MEMO_PAGE_S_FIX == mtr_memo_type_t(BTR_SEARCH_LEAF), "");
+ static_assert(MTR_MEMO_PAGE_X_FIX == mtr_memo_type_t(BTR_MODIFY_LEAF), "");
+
+ if (rw_latch == RW_S_LATCH
+ ? prev->page.lock.s_lock_try() : prev->page.lock.x_lock_try())
{
- if (UNIV_LIKELY(prev->page.lock.s_lock_try()))
+ mtr->lock_register(prev_savepoint, mtr_memo_type_t(rw_latch));
+ if (UNIV_UNLIKELY(prev->page.id() != page_id))
{
- mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX);
- goto prev_latched;
+ fail:
+ /* the page was just read and found to be corrupted */
+ mtr->rollback_to_savepoint(prev_savepoint);
+ return 0;
}
- block->page.lock.s_unlock();
}
else
{
- if (UNIV_LIKELY(prev->page.lock.x_lock_try()))
+ ut_ad(mtr->at_savepoint(mtr->get_savepoint() - 1)->page.id() == page_id);
+ mtr->release_last_page();
+ if (rw_latch == RW_S_LATCH)
+ block->page.lock.s_unlock();
+ else
+ block->page.lock.x_unlock();
+
+ prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev,
+ BUF_GET, mtr, err);
+ if (rw_latch == RW_S_LATCH)
+ block->page.lock.s_lock();
+ else
+ block->page.lock.x_lock();
+
+ const page_id_t prev_page_id= page_id;
+ page_id.set_page_no(btr_page_get_prev(page));
+
+ if (UNIV_UNLIKELY(page_id != prev_page_id))
{
- mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX);
- goto prev_latched;
+ mtr->release_last_page();
+ if (page_id.page_no() == FIL_NULL)
+ return -1;
+ goto retry;
}
- block->page.lock.x_unlock();
+
+ if (UNIV_UNLIKELY(!prev))
+ goto fail;
+
+ ret= -1;
}
- ret= -1;
- mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX);
- mtr->rollback_to_savepoint(prev_savepoint);
- prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev,
- BUF_GET, mtr, err, false);
- if (UNIV_UNLIKELY(!prev))
- return 0;
- mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch);
-
- prev_latched:
- if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame,
- FIL_PAGE_TYPE + block->page.frame, 2) ||
- memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame,
- PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) ||
- page_is_comp(prev->page.frame) != page_is_comp(block->page.frame))
+ const page_t *const p= prev->page.frame;
+ if (memcmp_aligned<4>(FIL_PAGE_NEXT + p, FIL_PAGE_OFFSET + page, 4) ||
+ memcmp_aligned<2>(FIL_PAGE_TYPE + p, FIL_PAGE_TYPE + page, 2) ||
+ memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + p,
+ PAGE_HEADER + PAGE_INDEX_ID + page, 8) ||
+ page_is_comp(p) != page_is_comp(page))
{
ut_ad("corrupted" == 0); // FIXME: remove this
*err= DB_CORRUPTION;
@@ -6092,7 +6126,6 @@ btr_store_big_rec_extern_fields(
for (ulint blob_npages = 0;; ++blob_npages) {
buf_block_t* block;
const ulint commit_freq = 4;
- uint32_t r_extents;
ut_ad(page_align(field_ref) == page_align(rec));
@@ -6127,22 +6160,14 @@ btr_store_big_rec_extern_fields(
hint_prev = rec_block->page.id().page_no();
}
- error = fsp_reserve_free_extents(
- &r_extents, index->table->space, 1,
- FSP_BLOB, &mtr, 1);
- if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
-alloc_fail:
- mtr.commit();
- goto func_exit;
- }
-
block = btr_page_alloc(index, hint_prev + 1,
FSP_NO_DIR, 0, &mtr, &mtr,
&error);
- index->table->space->release_free_extents(r_extents);
if (!block) {
- goto alloc_fail;
+alloc_fail:
+ mtr.commit();
+ goto func_exit;
}
const uint32_t space_id = block->page.id().space();
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index 2131fb94..de0f9e93 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -179,10 +179,8 @@ before_first:
cursor->old_n_fields,
&cursor->old_rec_buf,
&cursor->buf_size);
- cursor->block_when_stored.store(block);
-
- /* Function try to check if block is S/X latch. */
- cursor->modify_clock = buf_block_get_modify_clock(block);
+ cursor->old_page_id = block->page.id();
+ cursor->modify_clock = block->modify_clock;
}
/**************************************************************//**
@@ -214,101 +212,80 @@ btr_pcur_copy_stored_position(
}
/** Optimistically latches the leaf page or pages requested.
-@param[in] block guessed buffer block
-@param[in,out] pcur cursor
-@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
-@param[in,out] mtr mini-transaction
-@return true if success */
+@param pcur persistent cursor
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param mtr mini-transaction
+@return true on success */
TRANSACTIONAL_TARGET
-static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
- btr_pcur_t *pcur,
+static bool btr_pcur_optimistic_latch_leaves(btr_pcur_t *pcur,
btr_latch_mode *latch_mode,
mtr_t *mtr)
{
- ut_ad(block->page.buf_fix_count());
- ut_ad(block->page.in_file());
- ut_ad(block->page.frame);
-
static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) ==
(RW_S_LATCH ^ RW_X_LATCH), "");
+ buf_block_t *const block=
+ buf_page_optimistic_fix(pcur->btr_cur.page_cur.block, pcur->old_page_id);
+
+ if (!block)
+ return false;
+
+ if (*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF)
+ return buf_page_optimistic_get(block, rw_lock_type_t(*latch_mode),
+ pcur->modify_clock, mtr);
+
+ ut_ad(*latch_mode == BTR_SEARCH_PREV || *latch_mode == BTR_MODIFY_PREV);
const rw_lock_type_t mode=
rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH));
- switch (*latch_mode) {
- default:
- ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF);
- return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr);
- case BTR_SEARCH_PREV:
- case BTR_MODIFY_PREV:
- page_id_t id{0};
- uint32_t left_page_no;
- ulint zip_size;
- buf_block_t *left_block= nullptr;
- {
- transactional_shared_lock_guard<block_lock> g{block->page.lock};
- if (block->modify_clock != pcur->modify_clock)
- return false;
- id= block->page.id();
- zip_size= block->zip_size();
- left_page_no= btr_page_get_prev(block->page.frame);
- }
-
- if (left_page_no != FIL_NULL)
- {
- left_block=
- buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
- mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
-
- if (!left_block);
- else if (btr_page_get_next(left_block->page.frame) != id.page_no())
- {
-release_left_block:
- mtr->release_last_page();
- return false;
- }
- else
- buf_page_make_young_if_needed(&left_block->page);
- }
-
- if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
- {
- if (btr_page_get_prev(block->page.frame) == left_page_no)
- {
- /* block was already buffer-fixed while entering the function and
- buf_page_optimistic_get() buffer-fixes it again. */
- ut_ad(2 <= block->page.buf_fix_count());
- *latch_mode= btr_latch_mode(mode);
- return true;
- }
-
- mtr->release_last_page();
- }
-
- ut_ad(block->page.buf_fix_count());
- if (left_block)
- goto release_left_block;
- return false;
+ uint64_t modify_clock;
+ uint32_t left_page_no;
+ const page_t *const page= block->page.frame;
+ {
+ transactional_shared_lock_guard<block_lock> g{block->page.lock};
+ modify_clock= block->modify_clock;
+ left_page_no= btr_page_get_prev(page);
}
-}
-/** Structure acts as functor to do the latching of leaf pages.
-It returns true if latching of leaf pages succeeded and false
-otherwise. */
-struct optimistic_latch_leaves
-{
- btr_pcur_t *const cursor;
- btr_latch_mode *const latch_mode;
- mtr_t *const mtr;
+ const auto savepoint= mtr->get_savepoint();
+ mtr->memo_push(block, MTR_MEMO_BUF_FIX);
- bool operator()(buf_block_t *hint) const
+ if (UNIV_UNLIKELY(modify_clock != pcur->modify_clock))
{
- return hint &&
- btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr);
+ fail:
+ mtr->rollback_to_savepoint(savepoint);
+ return false;
+ }
+
+ buf_block_t *prev;
+ if (left_page_no != FIL_NULL)
+ {
+ prev= buf_page_get_gen(page_id_t(pcur->old_page_id.space(),
+ left_page_no), block->zip_size(),
+ mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+ if (!prev ||
+ page_is_comp(prev->page.frame) != page_is_comp(block->page.frame) ||
+ memcmp_aligned<2>(block->page.frame, prev->page.frame, 2) ||
+ memcmp_aligned<2>(block->page.frame + PAGE_HEADER + PAGE_INDEX_ID,
+ prev->page.frame + PAGE_HEADER + PAGE_INDEX_ID, 8))
+ goto fail;
}
-};
+ else
+ prev= nullptr;
+
+ mtr->upgrade_buffer_fix(savepoint, mode);
+
+ if (UNIV_UNLIKELY(block->modify_clock != modify_clock) ||
+ UNIV_UNLIKELY(block->page.is_freed()) ||
+ (prev &&
+ memcmp_aligned<4>(FIL_PAGE_NEXT + prev->page.frame,
+ FIL_PAGE_OFFSET + page, 4)))
+ goto fail;
+
+ return true;
+}
/** Restores the stored position of a persistent cursor bufferfixing
the page and obtaining the specified latches. If the cursor position
@@ -331,6 +308,7 @@ btr_pcur_t::SAME_UNIQ cursor position is on user rec and points on the
record with the same unique field values as in the stored record,
btr_pcur_t::NOT_SAME cursor position is not on user rec or points on
the record with not the samebuniq field values as in the stored */
+TRANSACTIONAL_TARGET
btr_pcur_t::restore_status
btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
{
@@ -361,7 +339,6 @@ btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
latch_mode =
BTR_LATCH_MODE_WITHOUT_INTENTION(restore_latch_mode);
pos_state = BTR_PCUR_IS_POSITIONED;
- block_when_stored.clear();
return restore_status::NOT_SAME;
}
@@ -378,9 +355,8 @@ btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
case BTR_SEARCH_PREV:
case BTR_MODIFY_PREV:
/* Try optimistic restoration. */
- if (block_when_stored.run_with_hint(
- optimistic_latch_leaves{this, &restore_latch_mode,
- mtr})) {
+ if (btr_pcur_optimistic_latch_leaves(this, &restore_latch_mode,
+ mtr)) {
pos_state = BTR_PCUR_IS_POSITIONED;
latch_mode = restore_latch_mode;
@@ -485,16 +461,22 @@ btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
since the cursor can now be on a different page!
But we can retain the value of old_rec */
- block_when_stored.store(btr_pcur_get_block(this));
- modify_clock= buf_block_get_modify_clock(
- block_when_stored.block());
+ old_page_id = btr_cur.page_cur.block->page.id();
+ modify_clock = btr_cur.page_cur.block->modify_clock;
mem_heap_free(heap);
return restore_status::SAME_ALL;
}
- if (n_matched_fields >= index->n_uniq)
- ret_val= restore_status::SAME_UNIQ;
+ if (n_matched_fields >= index->n_uniq
+ /* Unique indexes can contain "NULL" keys, and if all
+ unique fields are NULL and not all tuple
+ fields match to record fields, then treat it as if
+ restored cursor position points to the record with
+ not the same unique key. */
+ && !(index->n_nullable
+ && dtuple_contains_null(tuple, index->n_uniq)))
+ ret_val= restore_status::SAME_UNIQ;
}
mem_heap_free(heap);
@@ -612,40 +594,33 @@ btr_pcur_move_backward_from_page(
return true;
}
- buf_block_t* block = btr_pcur_get_block(cursor);
-
- if (page_has_prev(block->page.frame)) {
- buf_block_t* left_block
- = mtr->at_savepoint(mtr->get_savepoint() - 1);
- const page_t* const left = left_block->page.frame;
- if (memcmp_aligned<4>(left + FIL_PAGE_NEXT,
- block->page.frame
- + FIL_PAGE_OFFSET, 4)) {
- /* This should be the right sibling page, or
- if there is none, the current block. */
- ut_ad(left_block == block
- || !memcmp_aligned<4>(left + FIL_PAGE_PREV,
- block->page.frame
- + FIL_PAGE_OFFSET, 4));
- /* The previous one must be the left sibling. */
- left_block
- = mtr->at_savepoint(mtr->get_savepoint() - 2);
- ut_ad(!memcmp_aligned<4>(left_block->page.frame
- + FIL_PAGE_NEXT,
- block->page.frame
- + FIL_PAGE_OFFSET, 4));
- }
+ buf_block_t* block = mtr->at_savepoint(0);
+ ut_ad(block == btr_pcur_get_block(cursor));
+ const page_t* const page = block->page.frame;
+ /* btr_pcur_optimistic_latch_leaves() will acquire a latch on
+ the preceding page if one exists;
+ if that fails, btr_cur_t::search_leaf() invoked by
+ btr_pcur_open_with_no_init() will also acquire a latch on the
+ succeeding page. Our caller only needs one page latch. */
+ ut_ad(mtr->get_savepoint() <= 3);
+
+ if (page_has_prev(page)) {
+ buf_block_t* const left_block = mtr->at_savepoint(1);
+ ut_ad(!memcmp_aligned<4>(page + FIL_PAGE_OFFSET,
+ left_block->page.frame
+ + FIL_PAGE_NEXT, 4));
if (btr_pcur_is_before_first_on_page(cursor)) {
+ /* Reposition on the previous page. */
page_cur_set_after_last(left_block,
&cursor->btr_cur.page_cur);
/* Release the right sibling. */
- } else {
- /* Release the left sibling. */
+ mtr->rollback_to_savepoint(0, 1);
block = left_block;
}
- mtr->release(*block);
}
+ mtr->rollback_to_savepoint(1);
+ ut_ad(block == mtr->at_savepoint(0));
cursor->latch_mode = latch_mode;
cursor->old_rec = nullptr;
return false;
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
deleted file mode 100644
index 6bd01faa..00000000
--- a/storage/innobase/buf/buf0block_hint.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, 2021, MariaDB Corporation.
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License, version 2.0, as published by the
-Free Software Foundation.
-
-This program is also distributed with certain software (including but not
-limited to OpenSSL) that is licensed under separate terms, as designated in a
-particular file or component or in included license documentation. The authors
-of MySQL hereby grant you an additional permission to link the program and
-your derivative works with the separately licensed software that they have
-included with MySQL.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
-for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-
-#include "buf0block_hint.h"
-namespace buf {
-
-TRANSACTIONAL_TARGET
-void Block_hint::buffer_fix_block_if_still_valid()
-{
- /* To check if m_block belongs to the current buf_pool, we must
- prevent freeing memory while we check, and until we buffer-fix the
- block. For this purpose it is enough to latch any of the many
- latches taken by buf_pool_t::resize().
-
- Similar to buf_page_optimistic_get(), we must validate
- m_block->page.id() after acquiring the hash_lock, because the object
- may have been freed and not actually attached to buf_pool.page_hash
- at the moment. (The block could have been reused to store a
- different page, and that slice of buf_pool.page_hash could be protected
- by another hash_lock that we are not holding.)
-
- Finally, we must ensure that the block is not being freed. */
- if (m_block)
- {
- auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
- transactional_shared_lock_guard<page_hash_latch> g
- {buf_pool.page_hash.lock_get(cell)};
- if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
- m_block->page.frame && m_block->page.in_file())
- m_block->page.fix();
- else
- clear();
- }
-}
-} // namespace buf
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 23b5b776..49f73105 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -77,6 +77,8 @@ struct set_numa_interleave_t
if (srv_numa_interleave) {
struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
+ MEM_MAKE_DEFINED(numa_mems_allowed,
+ sizeof *numa_mems_allowed);
ib::info() << "Setting NUMA memory policy to"
" MPOL_INTERLEAVE";
if (set_mempolicy(MPOL_INTERLEAVE,
@@ -1062,6 +1064,7 @@ inline bool buf_pool_t::chunk_t::create(size_t bytes)
if (srv_numa_interleave)
{
struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+ MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed);
if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
numa_mems_allowed->maskp, numa_mems_allowed->size,
MPOL_MF_MOVE))
@@ -1591,17 +1594,14 @@ inline bool buf_pool_t::withdraw_blocks()
/* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
- buf_flush_LRU(
- std::max<ulint>(withdraw_target
- - UT_LIST_GET_LEN(withdraw),
- srv_LRU_scan_depth),
- true);
- mysql_mutex_unlock(&buf_pool.mutex);
- buf_dblwr.flush_buffered_writes();
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_flush_wait_LRU_batch_end();
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- mysql_mutex_lock(&buf_pool.mutex);
+ try_LRU_scan = false;
+ mysql_mutex_unlock(&mutex);
+ mysql_mutex_lock(&flush_list_mutex);
+ page_cleaner_wakeup(true);
+ my_cond_wait(&done_flush_list,
+ &flush_list_mutex.m_mutex);
+ mysql_mutex_unlock(&flush_list_mutex);
+ mysql_mutex_lock(&mutex);
}
/* relocate blocks/buddies in withdrawn area */
@@ -2298,7 +2298,10 @@ buf_page_t *buf_pool_t::watch_set(const page_id_t id,
got_block:
bpage->fix();
if (watch_is_sentinel(*bpage))
+ {
+ ut_ad(!bpage->oldest_modification());
bpage= nullptr;
+ }
page_hash.lock_get(chain).unlock();
return bpage;
}
@@ -2370,6 +2373,7 @@ void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
}
else
{
+ ut_ad(!w->oldest_modification());
const auto state= w->state();
ut_ad(~buf_page_t::LRU_MASK & state);
ut_ad(state >= buf_page_t::UNFIXED + 1);
@@ -2856,9 +2860,10 @@ got_block_fixed:
if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) {
if (mode == BUF_PEEK_IF_IN_POOL) {
ignore_block:
+ block->unfix();
+ignore_unfixed:
ut_ad(mode == BUF_GET_POSSIBLY_FREED
|| mode == BUF_PEEK_IF_IN_POOL);
- block->unfix();
if (err) {
*err = DB_CORRUPTION;
}
@@ -2872,16 +2877,32 @@ ignore_block:
in buf_page_t::read_complete() or
buf_pool_t::corrupted_evict(), or
after buf_zip_decompress() in this function. */
- block->page.lock.s_lock();
+ if (rw_latch != RW_NO_LATCH) {
+ block->page.lock.s_lock();
+ } else if (!block->page.lock.s_lock_try()) {
+ /* For RW_NO_LATCH, we must not wait for the S or X
+ latch, because a blocking wait could violate the latching
+ order and result in a deadlock. Instead, we try to acquire
+ the S latch without waiting, and retry in case of a failure. */
+ goto wait_for_read;
+ }
state = block->page.state();
ut_ad(state < buf_page_t::READ_FIX
|| state >= buf_page_t::WRITE_FIX);
const page_id_t id{block->page.id()};
block->page.lock.s_unlock();
- if (UNIV_UNLIKELY(id != page_id)) {
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ if (UNIV_UNLIKELY(id == page_id)) {
+ /* The page read was completed, and
+ another thread marked the page as free
+ while we were waiting. */
+ goto ignore_block;
+ }
+
ut_ad(id == page_id_t{~0ULL});
block->page.unfix();
+
if (++retries < BUF_PAGE_READ_MAX_RETRIES) {
goto loop;
}
@@ -2892,6 +2913,7 @@ ignore_block:
return nullptr;
}
+ ut_ad(id == page_id);
} else if (mode != BUF_PEEK_IF_IN_POOL) {
} else if (!mtr) {
ut_ad(!block->page.oldest_modification());
@@ -2918,6 +2940,7 @@ free_unfixed_block:
if (UNIV_UNLIKELY(!block->page.frame)) {
if (!block->page.lock.x_lock_try()) {
wait_for_unzip:
+wait_for_read:
/* The page is being read or written, or
another thread is executing buf_zip_decompress()
in buf_page_get_low() on it. */
@@ -3098,83 +3121,72 @@ re_evict_fail:
#endif /* UNIV_DEBUG */
ut_ad(block->page.frame);
+ /* The state = block->page.state() may be stale at this point,
+ and in fact, at any point of time if we consider its
+ buffer-fix component. If the block is being read into the
+ buffer pool, it is possible that buf_page_t::read_complete()
+ will invoke buf_pool_t::corrupted_evict() and therefore
+ invalidate it (invoke buf_page_t::set_corrupt_id() and set the
+ state to FREED). Therefore, after acquiring the page latch we
+ must recheck the state. */
+
if (state >= buf_page_t::UNFIXED
&& allow_ibuf_merge
&& fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX
&& page_is_leaf(block->page.frame)) {
block->page.lock.x_lock();
- ut_ad(block->page.id() == page_id
- || (state >= buf_page_t::READ_FIX
- && state < buf_page_t::WRITE_FIX));
-
-#ifdef BTR_CUR_HASH_ADAPT
- btr_search_drop_page_hash_index(block, true);
-#endif /* BTR_CUR_HASH_ADAPT */
-
- dberr_t e;
-
- if (UNIV_UNLIKELY(block->page.id() != page_id)) {
-page_id_mismatch:
- state = block->page.state();
- e = DB_CORRUPTION;
-ibuf_merge_corrupted:
- if (err) {
- *err = e;
- }
-
- if (block->page.id().is_corrupted()) {
- buf_pool.corrupted_evict(&block->page, state);
- }
- return nullptr;
- }
-
state = block->page.state();
ut_ad(state < buf_page_t::READ_FIX);
if (state >= buf_page_t::IBUF_EXIST
&& state < buf_page_t::REINIT) {
block->page.clear_ibuf_exist();
- e = ibuf_merge_or_delete_for_page(block, page_id,
- block->zip_size());
- if (UNIV_UNLIKELY(e != DB_SUCCESS)) {
- goto ibuf_merge_corrupted;
+ if (dberr_t local_err =
+ ibuf_merge_or_delete_for_page(block, page_id,
+ block->zip_size())) {
+ if (err) {
+ *err = local_err;
+ }
+ goto release_and_ignore_block;
}
+ } else if (state < buf_page_t::UNFIXED) {
+release_and_ignore_block:
+ block->page.lock.x_unlock();
+ goto ignore_block;
}
- if (rw_latch == RW_X_LATCH) {
- goto get_latch_valid;
- } else {
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ block->page.lock.x_unlock();
+ break;
+ case RW_S_LATCH:
block->page.lock.x_unlock();
- goto get_latch;
+ block->page.lock.s_lock();
+ break;
+ case RW_SX_LATCH:
+ block->page.lock.x_u_downgrade();
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
}
+
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
} else {
-get_latch:
switch (rw_latch) {
case RW_NO_LATCH:
mtr->memo_push(block, MTR_MEMO_BUF_FIX);
return block;
case RW_S_LATCH:
block->page.lock.s_lock();
- ut_ad(!block->page.is_read_fixed());
- if (UNIV_UNLIKELY(block->page.id() != page_id)) {
- block->page.lock.s_unlock();
- block->page.lock.x_lock();
- goto page_id_mismatch;
- }
-get_latch_valid:
- mtr->memo_push(block, mtr_memo_type_t(rw_latch));
-#ifdef BTR_CUR_HASH_ADAPT
- btr_search_drop_page_hash_index(block, true);
-#endif /* BTR_CUR_HASH_ADAPT */
break;
case RW_SX_LATCH:
block->page.lock.u_lock();
ut_ad(!block->page.is_io_fixed());
- if (UNIV_UNLIKELY(block->page.id() != page_id)) {
- block->page.lock.u_x_upgrade();
- goto page_id_mismatch;
- }
- goto get_latch_valid;
+ break;
default:
ut_ad(rw_latch == RW_X_LATCH);
if (block->page.lock.x_lock_upgraded()) {
@@ -3183,17 +3195,26 @@ get_latch_valid:
mtr->page_lock_upgrade(*block);
return block;
}
- if (UNIV_UNLIKELY(block->page.id() != page_id)) {
- goto page_id_mismatch;
- }
- goto get_latch_valid;
}
- ut_ad(page_id_t(page_get_space_id(block->page.frame),
- page_get_page_no(block->page.frame))
- == page_id);
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+ state = block->page.state();
+
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ mtr->release_last_page();
+ goto ignore_unfixed;
+ }
+
+ ut_ad(state < buf_page_t::READ_FIX
+ || state > buf_page_t::WRITE_FIX);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
}
+ ut_ad(page_id_t(page_get_space_id(block->page.frame),
+ page_get_page_no(block->page.frame)) == page_id);
return block;
}
@@ -3289,83 +3310,76 @@ buf_page_get_gen(
return block;
}
-/********************************************************************//**
-This is the general function used to get optimistic access to a database
-page.
-@return TRUE if success */
TRANSACTIONAL_TARGET
-bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
- uint64_t modify_clock, mtr_t *mtr)
+buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id)
+{ /* Try to buffer-fix a block, provided that it still holds page id.
+  @param block  candidate block descriptor; it is checked with
+                buf_pool.is_uncompressed() before block->page is read
+  @param id     identifier of the page that block is expected to hold
+  @return block, buffer-fixed
+  @retval nullptr if any of the checks below fails (nothing is fixed) */
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)}; /* all checks and the fix() below run under this guard; NOTE(review): presumably the latch is released when g goes out of scope -- confirm */
+ if (UNIV_UNLIKELY(!buf_pool.is_uncompressed(block) || /* evaluated first: short-circuits before any block->page access */
+ id != block->page.id() || !block->page.frame)) /* the block must still carry this page id and have a frame */
+ return nullptr;
+ const auto state= block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || /* only states in [UNFIXED, READ_FIX) are accepted */
+ state >= buf_page_t::READ_FIX))
+ return nullptr;
+ block->page.fix(); /* buffer-fix only; no latch on the page itself is acquired in this function */
+ return block;
+}
+
+buf_block_t *buf_page_optimistic_get(buf_block_t *block,
+ rw_lock_type_t rw_latch,
+ uint64_t modify_clock, mtr_t *mtr)
{
- ut_ad(block);
- ut_ad(mtr);
ut_ad(mtr->is_active());
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+ ut_ad(block->page.buf_fix_count());
- if (have_transactional_memory);
- else if (UNIV_UNLIKELY(!block->page.frame))
- return false;
- else
+ if (rw_latch == RW_S_LATCH)
{
- const auto state= block->page.state();
- if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
- state >= buf_page_t::READ_FIX))
- return false;
- }
+ if (!block->page.lock.s_lock_try())
+ {
+ fail:
+ block->page.unfix();
+ return nullptr;
+ }
- bool success;
- const page_id_t id{block->page.id()};
- buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
- bool have_u_not_x= false;
+ ut_ad(!ibuf_inside(mtr) ||
+ ibuf_page(block->page.id(), block->zip_size(), nullptr));
- {
- transactional_shared_lock_guard<page_hash_latch> g
- {buf_pool.page_hash.lock_get(chain)};
- if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame))
- return false;
- const auto state= block->page.state();
- if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
- state >= buf_page_t::READ_FIX))
- return false;
-
- if (rw_latch == RW_S_LATCH)
- success= block->page.lock.s_lock_try();
- else
+ if (modify_clock != block->modify_clock || block->page.is_freed())
{
- have_u_not_x= block->page.lock.have_u_not_x();
- success= have_u_not_x || block->page.lock.x_lock_try();
+ block->page.lock.s_unlock();
+ goto fail;
}
- }
- if (!success)
- return false;
-
- if (have_u_not_x)
+ ut_ad(!block->page.is_read_fixed());
+ buf_page_make_young_if_needed(&block->page);
+ mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX);
+ }
+ else if (block->page.lock.have_u_not_x())
{
block->page.lock.u_x_upgrade();
+ block->page.unfix();
mtr->page_lock_upgrade(*block);
- ut_ad(id == block->page.id());
ut_ad(modify_clock == block->modify_clock);
}
+ else if (!block->page.lock.x_lock_try())
+ goto fail;
else
{
- ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed());
- ut_ad(id == block->page.id());
- ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr));
+ ut_ad(!block->page.is_io_fixed());
+ ut_ad(!ibuf_inside(mtr) ||
+ ibuf_page(block->page.id(), block->zip_size(), nullptr));
if (modify_clock != block->modify_clock || block->page.is_freed())
{
- if (rw_latch == RW_S_LATCH)
- block->page.lock.s_unlock();
- else
- block->page.lock.x_unlock();
- return false;
+ block->page.lock.x_unlock();
+ goto fail;
}
- block->page.fix();
- ut_ad(!block->page.is_read_fixed());
buf_page_make_young_if_needed(&block->page);
- mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
@@ -3375,7 +3389,7 @@ bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
ut_ad(~buf_page_t::LRU_MASK & state);
ut_ad(block->page.frame);
- return true;
+ return block;
}
/** Try to S-latch a page.
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index d4628985..d364be31 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -274,30 +274,22 @@ buf_flush_relocate_on_flush_list(
ut_d(buf_flush_validate_low());
}
-/** Note that a block is no longer dirty, while not removing
-it from buf_pool.flush_list
-@param temporary whether the page belongs to the temporary tablespace
-@param error whether an error may have occurred while writing */
-inline void buf_page_t::write_complete(bool temporary, bool error)
+void buf_page_t::write_complete(bool persistent, bool error, uint32_t state)
{
- ut_ad(temporary == fsp_is_system_temporary(id().space()));
- if (UNIV_UNLIKELY(error));
- else if (temporary)
- {
- ut_ad(oldest_modification() == 2);
- oldest_modification_= 0;
- }
- else
+ ut_ad(!persistent == fsp_is_system_temporary(id().space()));
+ ut_ad(state >= WRITE_FIX);
+
+ if (UNIV_LIKELY(!error))
{
+ ut_d(lsn_t om= oldest_modification());
+ ut_ad(om >= 2);
+ ut_ad(persistent == (om > 2));
/* We use release memory order to guarantee that callers of
oldest_modification_acquire() will observe the block as
being detached from buf_pool.flush_list, after reading the value 0. */
- ut_ad(oldest_modification() > 2);
- oldest_modification_.store(1, std::memory_order_release);
+ oldest_modification_.store(persistent, std::memory_order_release);
}
- const auto s= state();
- ut_ad(s >= WRITE_FIX);
- zip.fix.fetch_sub((s >= WRITE_FIX_REINIT)
+ zip.fix.fetch_sub((state >= WRITE_FIX_REINIT)
? (WRITE_FIX_REINIT - UNFIXED)
: (WRITE_FIX - UNFIXED));
lock.u_unlock(true);
@@ -311,18 +303,10 @@ inline void buf_pool_t::n_flush_inc()
inline void buf_pool_t::n_flush_dec()
{
- mysql_mutex_lock(&flush_list_mutex);
+ mysql_mutex_assert_owner(&flush_list_mutex);
ut_ad(page_cleaner_status >= LRU_FLUSH);
if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH)
pthread_cond_broadcast(&done_flush_LRU);
- mysql_mutex_unlock(&flush_list_mutex);
-}
-
-inline void buf_pool_t::n_flush_dec_holding_mutex()
-{
- mysql_mutex_assert_owner(&flush_list_mutex);
- ut_ad(page_cleaner_status >= LRU_FLUSH);
- page_cleaner_status-= LRU_FLUSH;
}
/** Complete write of a file page from buf_pool.
@@ -352,28 +336,26 @@ void buf_page_write_complete(const IORequest &request, bool error)
mysql_mutex_assert_not_owner(&buf_pool.mutex);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
- if (request.is_LRU())
+ const bool persistent= bpage->oldest_modification() != 2;
+
+ if (UNIV_UNLIKELY(!persistent) && UNIV_LIKELY(!error))
{
- const bool temp= bpage->oldest_modification() == 2;
- if (!temp && state < buf_page_t::WRITE_FIX_REINIT &&
- request.node->space->use_doublewrite())
- buf_dblwr.write_completed();
/* We must hold buf_pool.mutex while releasing the block, so that
no other thread can access it before we have freed it. */
mysql_mutex_lock(&buf_pool.mutex);
- bpage->write_complete(temp, error);
- if (!error)
- buf_LRU_free_page(bpage, true);
+ bpage->write_complete(persistent, error, state);
+ buf_LRU_free_page(bpage, true);
mysql_mutex_unlock(&buf_pool.mutex);
-
- buf_pool.n_flush_dec();
}
else
{
+ bpage->write_complete(persistent, error, state);
if (state < buf_page_t::WRITE_FIX_REINIT &&
request.node->space->use_doublewrite())
+ {
+ ut_ad(persistent);
buf_dblwr.write_completed();
- bpage->write_complete(false, error);
+ }
}
}
@@ -740,17 +722,15 @@ ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept
}
/** Write a flushable page to a file or free a freeable block.
-@param evict whether to evict the page on write completion
@param space tablespace
@return whether a page write was initiated and buf_pool.mutex released */
-bool buf_page_t::flush(bool evict, fil_space_t *space)
+bool buf_page_t::flush(fil_space_t *space)
{
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
ut_ad(in_file());
ut_ad(in_LRU_list);
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
- ut_ad(evict || space != fil_system.temp_space);
ut_ad(space->referenced());
const auto s= state();
@@ -797,22 +777,11 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
mysql_mutex_unlock(&buf_pool.mutex);
IORequest::Type type= IORequest::WRITE_ASYNC;
- if (UNIV_UNLIKELY(evict))
- {
- type= IORequest::WRITE_LRU;
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_pool.n_flush_inc();
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- }
/* Apart from the U-lock, this block will also be protected by
is_write_fixed() and oldest_modification()>1.
Thus, it cannot be relocated or removed. */
- DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
- evict ? "LRU" : "flush_list",
- id().space(), id().page_no()));
-
buf_block_t *block= reinterpret_cast<buf_block_t*>(this);
page_t *write_frame= zip.data;
@@ -864,10 +833,7 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
{
switch (space->chain.start->punch_hole) {
case 1:
- static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH ==
- IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, "");
- type=
- IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC));
+ type= IORequest::PUNCH;
break;
case 2:
size= orig_size;
@@ -894,10 +860,8 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
/** Check whether a page can be flushed from the buf_pool.
@param id page identifier
@param fold id.fold()
-@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return whether the page can be flushed */
-static bool buf_flush_check_neighbor(const page_id_t id, ulint fold,
- bool evict)
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(fold == id.fold());
@@ -906,26 +870,16 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold,
const buf_page_t *bpage=
buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold));
- if (!bpage || buf_pool.watch_is_sentinel(*bpage))
- return false;
-
- /* We avoid flushing 'non-old' blocks in an eviction flush, because the
- flushed blocks are soon freed */
- if (evict && !bpage->is_old())
- return false;
-
- return bpage->oldest_modification() > 1 && !bpage->is_io_fixed();
+ return bpage && bpage->oldest_modification() > 1 && !bpage->is_io_fixed();
}
/** Check which neighbors of a page can be flushed from the buf_pool.
@param space tablespace
@param id page identifier of a dirty page
@param contiguous whether to consider contiguous areas of pages
-@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return last page number that can be flushed */
static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
- page_id_t &id, bool contiguous,
- bool evict)
+ page_id_t &id, bool contiguous)
{
ut_ad(id.page_no() < space.size +
(space.physical_size() == 2048 ? 1
@@ -958,7 +912,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
for (page_id_t i= id - 1;; --i)
{
fold--;
- if (!buf_flush_check_neighbor(i, fold, evict))
+ if (!buf_flush_check_neighbor(i, fold))
{
low= i + 1;
break;
@@ -974,7 +928,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
while (++i < high)
{
++fold;
- if (!buf_flush_check_neighbor(i, fold, evict))
+ if (!buf_flush_check_neighbor(i, fold))
break;
}
@@ -1051,14 +1005,13 @@ and also write zeroes or punch the hole for the freed ranges of pages.
@param page_id page identifier
@param bpage buffer page
@param contiguous whether to consider contiguous areas of pages
-@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@param n_flushed number of pages flushed so far in this batch
@param n_to_flush maximum number of pages we are allowed to flush
@return number of pages flushed */
static ulint buf_flush_try_neighbors(fil_space_t *space,
const page_id_t page_id,
buf_page_t *bpage,
- bool contiguous, bool evict,
+ bool contiguous,
ulint n_flushed, ulint n_to_flush)
{
ut_ad(space->id == page_id.space());
@@ -1072,7 +1025,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
ut_ad(lsn >= bpage->oldest_modification());
if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
{
- ut_a(!bpage->flush(evict, space));
+ ut_a(!bpage->flush(space));
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
@@ -1082,7 +1035,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
ulint count= 0;
page_id_t id= page_id;
- page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
+ page_id_t high= buf_flush_check_neighbors(*space, id, contiguous);
ut_ad(page_id >= id);
ut_ad(page_id < high);
@@ -1119,7 +1072,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
ut_ad(!buf_pool.watch_is_sentinel(*b));
ut_ad(b->oldest_modification() > 1);
flush:
- if (b->flush(evict, space))
+ if (b->flush(space))
{
++count;
continue;
@@ -1127,9 +1080,10 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
}
/* We avoid flushing 'non-old' blocks in an eviction flush,
because the flushed blocks are soon freed */
- else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) &&
- b->oldest_modification() > 1 && b->lock.u_lock_try(true))
+ else if (b->oldest_modification() > 1 && b->lock.u_lock_try(true))
{
+ /* For the buf_pool.watch[] sentinels, oldest_modification() == 0 */
+ ut_ad(!buf_pool.watch_is_sentinel(*b));
if (b->oldest_modification() < 2)
b->lock.u_unlock(true);
else
@@ -1251,10 +1205,8 @@ static void buf_flush_discard_page(buf_page_t *bpage)
/** Flush dirty blocks from the end of buf_pool.LRU,
and move clean blocks to buf_pool.free.
@param max maximum number of blocks to flush
-@param evict whether dirty pages are to be evicted after flushing them
@param n counts of flushed and evicted pages */
-static void buf_flush_LRU_list_batch(ulint max, bool evict,
- flush_counters_t *n)
+static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
{
ulint scanned= 0;
ulint free_limit= srv_LRU_scan_depth;
@@ -1302,8 +1254,12 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true))
{
ut_ad(!bpage->is_io_fixed());
- bool do_evict= evict;
switch (bpage->oldest_modification()) {
+ case 2:
+ /* LRU flushing will always evict pages of the temporary tablespace,
+ in buf_page_write_complete(). */
+ ++n->evicted;
+ break;
case 1:
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (ut_d(lsn_t lsn=) bpage->oldest_modification())
@@ -1316,12 +1272,8 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
case 0:
bpage->lock.u_unlock(true);
goto evict;
- case 2:
- /* LRU flushing will always evict pages of the temporary tablespace. */
- do_evict= true;
}
- /* Block is ready for flush. Dispatch an IO request.
- If do_evict, the page may be evicted by buf_page_write_complete(). */
+ /* Block is ready for flush. Dispatch an IO request. */
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
@@ -1356,6 +1308,7 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
no_space:
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_discard_page(bpage);
+ ++n->evicted;
continue;
}
@@ -1368,8 +1321,8 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
if (neighbors && space->is_rotational())
n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
neighbors == 1,
- do_evict, n->flushed, max);
- else if (bpage->flush(do_evict, space))
+ n->flushed, max);
+ else if (bpage->flush(space))
++n->flushed;
else
continue;
@@ -1387,24 +1340,25 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
space->release();
if (scanned)
+ {
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
MONITOR_LRU_BATCH_SCANNED_PER_CALL,
scanned);
+ }
}
/** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system.
@param max maximum number of blocks to flush
-@param evict whether dirty pages are to be evicted after flushing them
@param n counts of flushed and evicted pages */
-static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n)
+static void buf_do_LRU_batch(ulint max, flush_counters_t *n)
{
if (buf_LRU_evict_from_unzip_LRU())
buf_free_from_unzip_LRU_list_batch();
n->evicted= 0;
n->flushed= 0;
- buf_flush_LRU_list_batch(max, evict, n);
+ buf_flush_LRU_list_batch(max, n);
mysql_mutex_assert_owner(&buf_pool.mutex);
buf_lru_freed_page_count+= n->evicted;
@@ -1516,8 +1470,8 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
{
if (neighbors && space->is_rotational())
count+= buf_flush_try_neighbors(space, page_id, bpage,
- neighbors == 1, false, count, max_n);
- else if (bpage->flush(false, space))
+ neighbors == 1, count, max_n);
+ else if (bpage->flush(space))
++count;
else
continue;
@@ -1536,10 +1490,13 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
space->release();
if (scanned)
+ {
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
scanned);
+ }
+
return count;
}
@@ -1683,7 +1640,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
goto was_freed;
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (bpage->flush(false, space))
+ if (bpage->flush(space))
{
++n_flush;
if (!--max_n_flush)
@@ -1741,27 +1698,22 @@ and move clean blocks to buf_pool.free.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum number of blocks flushed
-@param evict whether to evict pages after flushing
-@return evict ? number of processed pages : number of pages written */
-ulint buf_flush_LRU(ulint max_n, bool evict)
+@return number of pages written */
+static ulint buf_flush_LRU(ulint max_n)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
flush_counters_t n;
- buf_do_LRU_batch(max_n, evict, &n);
+ buf_do_LRU_batch(max_n, &n);
ulint pages= n.flushed;
if (n.evicted)
{
- if (evict)
- pages+= n.evicted;
buf_pool.try_LRU_scan= true;
pthread_cond_broadcast(&buf_pool.done_free);
}
- else if (!pages && !buf_pool.try_LRU_scan &&
- !buf_pool.LRU_warned.test_and_set(std::memory_order_acquire))
- {
+ else if (!pages && !buf_pool.try_LRU_scan)
/* For example, with the minimum innodb_buffer_pool_size=5M and
the default innodb_page_size=16k there are only a little over 316
pages in the buffer pool. The buffer pool can easily be exhausted
@@ -1775,18 +1727,13 @@ ulint buf_flush_LRU(ulint max_n, bool evict)
(3) This thread is the only one that could make progress, but
we fail to do so because all the pages that we scanned are
buffer-fixed or latched by some thread. */
- sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
- " %zu blocks are in use and %zu free."
- " Consider increasing innodb_buffer_pool_size.",
- UT_LIST_GET_LEN(buf_pool.LRU),
- UT_LIST_GET_LEN(buf_pool.free));
- }
+ buf_pool.LRU_warn();
return pages;
}
#ifdef HAVE_PMEM
-# include <libpmem.h>
+# include "cache.h"
#endif
/** Write checkpoint information to the log header and release mutex.
@@ -1900,8 +1847,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
ut_ad(!log.is_opened());
bool success;
log.m_file=
- os_file_create_func(get_log_file_path().c_str(),
- OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ os_file_create_func(get_log_file_path().c_str(), OS_FILE_OPEN,
OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
ut_a(success);
ut_a(log.is_opened());
@@ -1916,7 +1862,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
{
my_munmap(buf, file_size);
buf= resize_buf;
- buf_free= START_OFFSET + (get_lsn() - resizing);
+ set_buf_free(START_OFFSET + (get_lsn() - resizing));
}
else
#endif
@@ -1958,9 +1904,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
{
ut_ad(!srv_read_only_mode);
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
ut_ad(oldest_lsn <= end_lsn);
ut_ad(end_lsn == log_sys.get_lsn());
@@ -2327,7 +2271,7 @@ func_exit:
sum_pages += last_pages_in;
- const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
+ const ulint time_elapsed = std::max<ulint>(ulint(curr_time - prev_time), 1);
/* We update our variables every innodb_flushing_avg_loops
iterations to smooth out transition in workload. */
@@ -2541,26 +2485,16 @@ static void buf_flush_page_cleaner()
{
buf_pool.page_cleaner_set_idle(false);
buf_pool.n_flush_inc();
- /* Remove clean blocks from buf_pool.flush_list before the LRU scan. */
- for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; )
- {
- const lsn_t lsn{p->oldest_modification()};
- ut_ad(lsn > 2 || lsn == 1);
- buf_page_t *n= UT_LIST_GET_NEXT(list, p);
- if (lsn <= 1)
- buf_pool.delete_from_flush_list(p);
- p= n;
- }
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
n= srv_max_io_capacity;
mysql_mutex_lock(&buf_pool.mutex);
LRU_flush:
- n= buf_flush_LRU(n, false);
+ n= buf_flush_LRU(n);
mysql_mutex_unlock(&buf_pool.mutex);
last_pages+= n;
check_oldest_and_set_idle:
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_pool.n_flush_dec_holding_mutex();
+ buf_pool.n_flush_dec();
oldest_lsn= buf_pool.get_oldest_modification(0);
if (!oldest_lsn)
goto fully_unemployed;
@@ -2693,6 +2627,16 @@ static void buf_flush_page_cleaner()
#endif
}
+ATTRIBUTE_COLD void buf_pool_t::LRU_warn() /* Warn that no buffer pool
+block could be freed, reporting the lengths of the LRU and free lists.
+The message is printed only by a call that finds LRU_warned clear. */
+{
+ mysql_mutex_assert_owner(&mutex); /* the caller must hold mutex */
+ if (!LRU_warned.test_and_set(std::memory_order_acquire)) /* was clear */
+ sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
+ " %zu blocks are in use and %zu free."
+ " Consider increasing innodb_buffer_pool_size.",
+ UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free));
+}
+
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
{
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 2a8d6ff2..33d01b6b 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -385,142 +385,76 @@ we put it to free list to be used.
@return the free control block, in state BUF_BLOCK_MEMORY */
buf_block_t *buf_LRU_get_free_block(bool have_mutex)
{
- ulint n_iterations = 0;
- ulint flush_failures = 0;
- MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
- if (have_mutex) {
- mysql_mutex_assert_owner(&buf_pool.mutex);
- goto got_mutex;
- }
- DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
- if (recv_recovery_is_on()
- && recv_sys.apply_log_recs) {
- mysql_mutex_lock(&buf_pool.mutex);
- goto flush_lru;
- });
-get_mutex:
- mysql_mutex_lock(&buf_pool.mutex);
-got_mutex:
- buf_LRU_check_size_of_non_data_objects();
- buf_block_t* block;
+ bool waited= false;
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+ if (!have_mutex)
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ buf_LRU_check_size_of_non_data_objects();
- IF_DBUG(static bool buf_lru_free_blocks_error_printed,);
- DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
- if (!buf_lru_free_blocks_error_printed) {
- n_iterations = 21;
- goto not_found;});
+ buf_block_t *block;
retry:
- /* If there is a block in the free list, take it */
- if ((block = buf_LRU_get_free_only()) != nullptr) {
+ /* If there is a block in the free list, take it */
+ block= buf_LRU_get_free_only();
+ if (block)
+ {
got_block:
- const ulint LRU_size = UT_LIST_GET_LEN(buf_pool.LRU);
- const ulint available = UT_LIST_GET_LEN(buf_pool.free);
- const ulint scan_depth = srv_LRU_scan_depth / 2;
- ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth
- || buf_pool.need_LRU_eviction());
-
- if (!have_mutex) {
- mysql_mutex_unlock(&buf_pool.mutex);
- }
-
- if (UNIV_UNLIKELY(available < scan_depth)
- && LRU_size > BUF_LRU_MIN_LEN) {
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- if (!buf_pool.page_cleaner_active()) {
- buf_pool.page_cleaner_wakeup(true);
- }
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- }
-
- block->page.zip.clear();
- return block;
- }
+ const ulint LRU_size= UT_LIST_GET_LEN(buf_pool.LRU);
+ const ulint available= UT_LIST_GET_LEN(buf_pool.free);
+ const ulint scan_depth= srv_LRU_scan_depth / 2;
+ ut_ad(LRU_size <= BUF_LRU_MIN_LEN ||
+ available >= scan_depth || buf_pool.need_LRU_eviction());
- MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
- if (n_iterations || buf_pool.try_LRU_scan) {
- /* If no block was in the free list, search from the
- end of the LRU list and try to free a block there.
- If we are doing for the first time we'll scan only
- tail of the LRU list otherwise we scan the whole LRU
- list. */
- if (buf_LRU_scan_and_free_block(n_iterations
- ? ULINT_UNDEFINED : 100)) {
- goto retry;
- }
+ if (UNIV_UNLIKELY(available < scan_depth) && LRU_size > BUF_LRU_MIN_LEN)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (!buf_pool.page_cleaner_active())
+ buf_pool.page_cleaner_wakeup(true);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
- /* Tell other threads that there is no point
- in scanning the LRU list. */
- buf_pool.try_LRU_scan = false;
- }
+ if (!have_mutex)
+ mysql_mutex_unlock(&buf_pool.mutex);
- for (;;) {
- if ((block = buf_LRU_get_free_only()) != nullptr) {
- goto got_block;
- }
- const bool wake = buf_pool.need_LRU_eviction();
- mysql_mutex_unlock(&buf_pool.mutex);
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- const auto n_flush = buf_pool.n_flush();
- if (wake && !buf_pool.page_cleaner_active()) {
- buf_pool.page_cleaner_wakeup(true);
- }
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- mysql_mutex_lock(&buf_pool.mutex);
- if (!n_flush) {
- goto not_found;
- }
- if (!buf_pool.try_LRU_scan) {
- my_cond_wait(&buf_pool.done_free,
- &buf_pool.mutex.m_mutex);
- }
- }
-
-not_found:
- if (n_iterations > 1) {
- MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
- }
+ block->page.zip.clear();
+ return block;
+ }
- if (n_iterations == 21
- && srv_buf_pool_old_size == srv_buf_pool_size
- && buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) {
- IF_DBUG(buf_lru_free_blocks_error_printed = true,);
- mysql_mutex_unlock(&buf_pool.mutex);
- ib::warn() << "Difficult to find free blocks in the buffer pool"
- " (" << n_iterations << " search iterations)! "
- << flush_failures << " failed attempts to"
- " flush a page!"
- " Consider increasing innodb_buffer_pool_size."
- " Pending flushes (fsync): "
- << fil_n_pending_tablespace_flushes
- << ". " << os_n_file_reads << " OS file reads, "
- << os_n_file_writes << " OS file writes, "
- << os_n_fsyncs
- << " OS fsyncs.";
- mysql_mutex_lock(&buf_pool.mutex);
- }
+ MONITOR_INC(MONITOR_LRU_GET_FREE_LOOPS);
+ if (waited || buf_pool.try_LRU_scan)
+ {
+ /* If no block was in the free list, search from the end of the
+ LRU list and try to free a block there. If we are doing for the
+ first time we'll scan only tail of the LRU list otherwise we scan
+ the whole LRU list. */
+ if (buf_LRU_scan_and_free_block(waited ? ULINT_UNDEFINED : 100))
+ goto retry;
+
+ /* Tell other threads that there is no point in scanning the LRU
+ list. */
+ buf_pool.try_LRU_scan= false;
+ }
- /* No free block was found: try to flush the LRU list.
- The freed blocks will be up for grabs for all threads.
+ waited= true;
- TODO: A more elegant way would have been to return one freed
- up block to the caller here but the code that deals with
- removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
- involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
- can do that in a separate patch sometime in future. */
-#ifndef DBUG_OFF
-flush_lru:
-#endif
- if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
- MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
- ++flush_failures;
- }
+ while (!(block= buf_LRU_get_free_only()))
+ {
+ buf_pool.stat.LRU_waits++;
+
+ timespec abstime;
+ set_timespec(abstime, 1);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (!buf_pool.page_cleaner_active())
+ buf_pool.page_cleaner_wakeup(true);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (my_cond_timedwait(&buf_pool.done_free, &buf_pool.mutex.m_mutex,
+ &abstime))
+ buf_pool.LRU_warn();
+ }
- n_iterations++;
- buf_pool.stat.LRU_waits++;
- mysql_mutex_unlock(&buf_pool.mutex);
- buf_dblwr.flush_buffered_writes();
- goto get_mutex;
+ goto got_block;
}
/** Move the LRU_old pointer so that the length of the old blocks list
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 9041c6a2..76a5e710 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -575,7 +575,7 @@ fail:
hash_lock.lock_shared();
const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
- if (!bpage)
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
{
hash_lock.unlock_shared();
if (i == page_id)
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 5d3cab17..a1295c33 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -657,47 +657,22 @@ dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
/** Acquire MDL shared for the table name.
@tparam trylock whether to use non-blocking operation
@param[in,out] table table object
-@param[in,out] thd background thread
-@param[out] mdl mdl ticket
+@param[in,out] mdl_context MDL context
+@param[out] mdl MDL ticket
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval nullptr if the table is not readable, or if trylock && MDL blocked */
template<bool trylock>
+__attribute__((nonnull, warn_unused_result))
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
- THD *thd,
- MDL_ticket **mdl,
+ MDL_context *mdl_context, MDL_ticket **mdl,
dict_table_op_t table_op)
{
- if (!table || !mdl)
- return table;
-
- MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
- size_t db_len;
- dict_table_t *not_found= nullptr;
-
- if (trylock)
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- db_len= dict_get_db_name_len(table->name.m_name);
- dict_sys.unfreeze();
- }
- else
- {
- ut_ad(dict_sys.frozen_not_locked());
- db_len= dict_get_db_name_len(table->name.m_name);
- }
-
- if (db_len == 0)
- return table; /* InnoDB system tables are not covered by MDL */
-
- if (!mdl_context)
- return nullptr;
-
table_id_t table_id= table->id;
char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
- size_t tbl_len;
+ size_t db_len, tbl_len;
bool unaccessible= false;
if (!table->parse_name<!trylock>(db_buf, tbl_buf, &db_len, &tbl_len))
@@ -768,7 +743,6 @@ retry:
if (!table || !table->is_accessible())
{
- table= nullptr;
return_without_mdl:
if (trylock)
dict_sys.unfreeze();
@@ -777,7 +751,7 @@ return_without_mdl:
mdl_context->release_lock(*mdl);
*mdl= nullptr;
}
- return not_found;
+ return nullptr;
}
size_t db1_len, tbl1_len;
@@ -815,6 +789,50 @@ return_without_mdl:
}
template dict_table_t* dict_acquire_mdl_shared<false>
+(dict_table_t*,MDL_context*,MDL_ticket**,dict_table_op_t);
+
+/** Acquire MDL shared for the table name.
+This overload looks up the MDL context of thd and, unless one of the
+early returns below applies, forwards to the overload that takes an
+MDL_context.
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval table unchanged, without acquiring MDL, if table or mdl is nullptr
+or if dict_get_db_name_len() reports an empty database name
+@retval nullptr if the table is not readable, or if trylock && MDL blocked,
+or if thd_mdl_context(thd) returned nullptr */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op)
+{
+ if (!table || !mdl)
+ return table;
+
+ MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+ size_t db_len;
+
+ if (trylock)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL); /* held only while reading the name */
+ db_len= dict_get_db_name_len(table->name.m_name);
+ dict_sys.unfreeze();
+ }
+ else
+ {
+ ut_ad(dict_sys.frozen_not_locked()); /* debug check of the latch state;
+ NOTE(review): presumably the caller has frozen dict_sys - confirm */
+ db_len= dict_get_db_name_len(table->name.m_name);
+ }
+
+ if (db_len == 0)
+ return table; /* InnoDB system tables are not covered by MDL */
+
+ return mdl_context
+ ? dict_acquire_mdl_shared<trylock>(table, mdl_context, mdl, table_op)
+ : nullptr;
+}
+
+template dict_table_t* dict_acquire_mdl_shared<false>
(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
template dict_table_t* dict_acquire_mdl_shared<true>
(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
@@ -960,9 +978,6 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
{
latch.wr_lock(SRW_LOCK_ARGS(file, line));
latch_ex_wait_start.store(0, std::memory_order_relaxed);
- ut_ad(!latch_readers);
- ut_ad(!latch_ex);
- ut_d(latch_ex= pthread_self());
return;
}
@@ -978,35 +993,36 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
ib::warn() << "A long wait (" << waited
<< " seconds) was observed for dict_sys.latch";
latch.wr_lock(SRW_LOCK_ARGS(file, line));
- ut_ad(!latch_readers);
- ut_ad(!latch_ex);
- ut_d(latch_ex= pthread_self());
}
#ifdef UNIV_PFS_RWLOCK
ATTRIBUTE_NOINLINE void dict_sys_t::unlock()
{
- ut_ad(latch_ex == pthread_self());
- ut_ad(!latch_readers);
- ut_d(latch_ex= 0);
latch.wr_unlock();
}
ATTRIBUTE_NOINLINE void dict_sys_t::freeze(const char *file, unsigned line)
{
latch.rd_lock(file, line);
- ut_ad(!latch_ex);
- ut_d(latch_readers++);
}
ATTRIBUTE_NOINLINE void dict_sys_t::unfreeze()
{
- ut_ad(!latch_ex);
- ut_ad(latch_readers--);
latch.rd_unlock();
}
#endif /* UNIV_PFS_RWLOCK */
+/** Report an error about failing to open a table.
+Issues ER_TABLE_CORRUPT through my_printf_error() with the ME_ERROR_LOG
+flag. The table is named by the first name.dblen() bytes of name.m_name
+(the database part) followed by name.basename().
+@param name table name */
+static void dict_table_open_failed(const table_name_t &name)
+{
+ my_printf_error(ER_TABLE_CORRUPT,
+ "Table %`.*s.%`s is corrupted."
+ " Please drop the table and recreate.",
+ MYF(ME_ERROR_LOG),
+ int(name.dblen()), name.m_name, name.basename());
+}
+
/**********************************************************************//**
Returns a table object and increments its open handle count.
NOTE! This is a high-level function to be used mainly from outside the
@@ -1039,18 +1055,20 @@ dict_table_open_on_name(
if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
!table->is_readable() && table->corrupted)
{
- ulint algo = table->space->get_compression_algo();
- if (algo <= PAGE_ALGORITHM_LAST && !fil_comp_algo_loaded(algo)) {
- my_printf_error(ER_PROVIDER_NOT_LOADED,
- "Table %s is compressed with %s, which is not currently loaded. "
- "Please load the %s provider plugin to open the table",
- MYF(ME_ERROR_LOG), table->name,
- page_compression_algorithms[algo], page_compression_algorithms[algo]);
- } else {
- my_printf_error(ER_TABLE_CORRUPT,
- "Table %s is corrupted. Please drop the table and recreate.",
- MYF(ME_ERROR_LOG), table->name);
- }
+ ulint algo= table->space->get_compression_algo();
+ if (algo <= PAGE_ALGORITHM_LAST && !fil_comp_algo_loaded(algo))
+ my_printf_error(ER_PROVIDER_NOT_LOADED,
+ "Table %`.*s.%`s is compressed with %s,"
+ " which is not currently loaded. "
+ "Please load the %s provider plugin"
+ " to open the table",
+ MYF(ME_ERROR_LOG),
+ int(table->name.dblen()), table->name.m_name,
+ table->name.basename(),
+ page_compression_algorithms[algo],
+ page_compression_algorithms[algo]);
+ else
+ dict_table_open_failed(table->name);
dict_sys.unfreeze();
DBUG_RETURN(nullptr);
}
@@ -1070,8 +1088,7 @@ dict_table_open_on_name(
if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
!table->is_readable() && table->corrupted)
{
- ib::error() << "Table " << table->name
- << " is corrupted. Please drop the table and recreate.";
+ dict_table_open_failed(table->name);
if (!dict_locked)
dict_sys.unlock();
DBUG_RETURN(nullptr);
@@ -1992,7 +2009,6 @@ dict_index_add_to_cache(
new_index->n_fields = new_index->n_def;
new_index->trx_id = index->trx_id;
new_index->set_committed(index->is_committed());
- new_index->nulls_equal = index->nulls_equal;
n_ord = new_index->n_uniq;
/* Flag the ordering columns and also set column max_prefix */
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index f11187b9..f647278d 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -3867,6 +3867,10 @@ release_and_exit:
goto release_and_exit;
}
+#ifdef ENABLED_DEBUG_SYNC
+ DEBUG_SYNC(thd, "dict_stats_mdl_acquired");
+#endif /* ENABLED_DEBUG_SYNC */
+
trx = trx_create();
trx_start_internal_read_only(trx);
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index 97cb3994..d4e6c6f3 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -2290,7 +2290,7 @@ void fil_space_crypt_close_tablespace(const fil_space_t *space)
<< space->chain.start->name << " ("
<< space->id << ") active threads "
<< crypt_data->rotate_state.active_threads
- << "flushing="
+ << " flushing="
<< crypt_data->rotate_state.flushing << ".";
last = now;
}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index bd0ace7c..e8d23657 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -342,7 +342,7 @@ static bool fil_node_open_file_low(fil_node_t *node)
ut_ad(node->space->is_closing());
mysql_mutex_assert_owner(&fil_system.mutex);
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
-#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+#if defined _WIN32 || defined O_DIRECT
ulint type;
switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
case 1:
@@ -361,8 +361,7 @@ static bool fil_node_open_file_low(fil_node_t *node)
bool success;
node->handle= os_file_create(innodb_data_file_key, node->name,
node->is_raw_disk
- ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
- : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ ? OS_FILE_OPEN_RAW : OS_FILE_OPEN,
OS_FILE_AIO, type,
srv_read_only_mode, &success);
@@ -928,9 +927,7 @@ bool fil_space_free(uint32_t id, bool x_latched)
log_sys.latch.wr_unlock();
} else {
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
if (space->max_lsn) {
ut_d(space->max_lsn = 0);
fil_system.named_spaces.remove(*space);
@@ -1691,30 +1688,27 @@ pfs_os_file_t fil_delete_tablespace(uint32_t id)
/*******************************************************************//**
Allocates and builds a file name from a path, a table or tablespace name
and a suffix. The string must be freed by caller with ut_free().
-@param[in] path NULL or the directory path or the full path and filename.
+@param[in] path nullptr or the directory path or the full path and filename
@param[in] name {} if path is full, or Table/Tablespace name
-@param[in] ext the file extension to use
-@param[in] trim_name true if the last name on the path should be trimmed.
+@param[in] extension the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed
@return own: file name */
-char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
- ib_extention ext, bool trim_name)
+char* fil_make_filepath_low(const char *path,
+ const fil_space_t::name_type &name,
+ ib_extention extension, bool trim_name)
{
/* The path may contain the basename of the file, if so we do not
need the name. If the path is NULL, we can use the default path,
but there needs to be a name. */
ut_ad(path || name.data());
- /* If we are going to strip a name off the path, there better be a
- path and a new name to put back on. */
- ut_ad(!trim_name || (path && name.data()));
-
if (path == NULL) {
path = fil_path_to_mysql_datadir;
}
ulint len = 0; /* current length */
ulint path_len = strlen(path);
- const char* suffix = dot_ext[ext];
+ const char* suffix = dot_ext[extension];
ulint suffix_len = strlen(suffix);
ulint full_len = path_len + 1 + name.size() + suffix_len + 1;
@@ -1797,8 +1791,16 @@ char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
char *fil_make_filepath(const char* path, const table_name_t name,
ib_extention suffix, bool strip_name)
{
- return fil_make_filepath(path, {name.m_name, strlen(name.m_name)},
- suffix, strip_name);
+ return fil_make_filepath_low(path, {name.m_name, strlen(name.m_name)},
+ suffix, strip_name);
+}
+
+/** Wrapper function over fil_make_filepath_low() to build directory name.
+Passes an empty name, NO_EXT and trim_name=true, so that the last name on
+the path is trimmed.
+@param path the directory path or the full path and filename
+@return own: directory name; NOTE(review): presumably released with
+ut_free() like any fil_make_filepath_low() result, and may be nullptr
+(the caller in fil_space_t::rename() checks for that) - confirm */
+static inline char *fil_make_dirpath(const char *path)
+{
+ return fil_make_filepath_low(path, fil_space_t::name_type{}, NO_EXT, true);
}
dberr_t fil_space_t::rename(const char *path, bool log, bool replace)
@@ -1839,14 +1841,32 @@ dberr_t fil_space_t::rename(const char *path, bool log, bool replace)
return DB_TABLESPACE_NOT_FOUND;
}
- exists= false;
- if (replace);
- else if (!os_file_status(path, &exists, &ftype) || exists)
+ if (!replace)
{
- sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
- " because the target file exists.",
- old_path, path);
- return DB_TABLESPACE_EXISTS;
+ char *schema_path= fil_make_dirpath(path);
+ if (!schema_path)
+ return DB_ERROR;
+
+ exists= false;
+ bool schema_fail= os_file_status(schema_path, &exists, &ftype) && !exists;
+ ut_free(schema_path);
+
+ if (schema_fail)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the target schema directory doesn't exist.",
+ old_path, path);
+ return DB_ERROR;
+ }
+
+ exists= false;
+ if (!os_file_status(path, &exists, &ftype) || exists)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the target file exists.",
+ old_path, path);
+ return DB_TABLESPACE_EXISTS;
+ }
}
mtr_t mtr;
@@ -1906,7 +1926,7 @@ fil_ibd_create(
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
"compatibility");
-#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+#if defined _WIN32 || defined O_DIRECT
ulint type;
switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
case 1:
@@ -1922,7 +1942,7 @@ fil_ibd_create(
file = os_file_create(
innodb_data_file_key, path,
- OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_CREATE,
OS_FILE_AIO, type, srv_read_only_mode, &success);
if (!success) {
@@ -3037,9 +3057,7 @@ void
fil_names_dirty(
fil_space_t* space)
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
ut_ad(recv_recovery_is_on());
ut_ad(log_sys.get_lsn() != 0);
ut_ad(space->max_lsn == 0);
@@ -3053,9 +3071,7 @@ fil_names_dirty(
tablespace was modified for the first time since fil_names_clear(). */
ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write()
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
ut_d(fil_space_validate_for_mtr_commit(m_user_space));
ut_ad(!m_user_space->max_lsn);
m_user_space->max_lsn= log_sys.get_lsn();
@@ -3079,9 +3095,7 @@ ATTRIBUTE_COLD lsn_t fil_names_clear(lsn_t lsn)
{
mtr_t mtr;
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
ut_ad(lsn);
ut_ad(log_sys.is_latest());
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
index 1c20efcd..62f90f53 100644
--- a/storage/innobase/fsp/fsp0file.cc
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -502,9 +502,10 @@ err_exit:
return DB_SUCCESS;
}
- sql_print_error("InnoDB: %s in datafile: %s, Space ID: "
- UINT32PF ", " "Flags: " UINT32PF,
- error_txt, m_filepath, m_space_id, m_flags);
+ sql_print_information(
+ "InnoDB: %s in datafile: %s, Space ID: " UINT32PF
+ ", " "Flags: " UINT32PF,
+ error_txt, m_filepath, m_space_id, m_flags);
m_is_valid = false;
return DB_CORRUPTION;
}
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 87672a82..5f34fe93 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -259,6 +259,7 @@ inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
}
/** Mark a page used in an extent descriptor.
+@param[in] space tablespace
@param[in,out] seg_inode segment inode
@param[in,out] iblock segment inode page
@param[in] page page number
@@ -268,8 +269,9 @@ inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
@return error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
-fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
- ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+fseg_mark_page_used(const fil_space_t *space,
+ fseg_inode_t *seg_inode, buf_block_t *iblock,
+ uint32_t page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
{
ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
@@ -278,15 +280,16 @@ fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+ const uint32_t limit= space->free_limit;
if (!xdes_get_n_used(descr))
{
/* We move the extent from the free list to the NOT_FULL list */
if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset),
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
}
@@ -303,10 +306,10 @@ fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
{
/* We move the extent from the NOT_FULL list to the FULL list */
if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset),
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
not_full_n_used - FSP_EXTENT_SIZE);
@@ -884,7 +887,7 @@ fsp_fill_free_list(
xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr))
+ xdes, xoffset, space->free_limit, mtr))
return err;
byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
@@ -893,7 +896,7 @@ fsp_fill_free_list(
{
if (dberr_t err=
flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
- xdes, xoffset, mtr))
+ xdes, xoffset, space->free_limit, mtr))
return err;
count++;
}
@@ -944,7 +947,11 @@ corrupted:
first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+ header->page.frame);
- if (first.page == FIL_NULL) {
+ if (first.page >= space->free_limit) {
+ if (first.page != FIL_NULL) {
+ goto flst_corrupted;
+ }
+
*err = fsp_fill_free_list(false, space, header, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
goto corrupted;
@@ -955,6 +962,17 @@ corrupted:
if (first.page == FIL_NULL) {
return nullptr; /* No free extents left */
}
+ if (first.page >= space->free_limit) {
+ goto flst_corrupted;
+ }
+ }
+
+ if (first.boffset < FSP_HEADER_OFFSET + FSP_HEADER_SIZE
+ || first.boffset >= space->physical_size()
+ - (XDES_SIZE + FIL_PAGE_DATA_END)) {
+ flst_corrupted:
+ *err = DB_CORRUPTION;
+ goto corrupted;
}
descr = xdes_lst_get_descriptor(*space, first, mtr,
@@ -967,7 +985,7 @@ corrupted:
*err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
static_cast<uint16_t>(descr - desc_block->page.frame
+ XDES_FLST_NODE),
- mtr);
+ space->free_limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return nullptr;
}
@@ -984,11 +1002,12 @@ MY_ATTRIBUTE((nonnull, warn_unused_result))
@param[in,out] xdes extent descriptor page
@param[in,out] descr extent descriptor
@param[in] bit slot to allocate in the extent
+@param[in] space tablespace
@param[in,out] mtr mini-transaction
@return error code */
static dberr_t
fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
- ulint bit, mtr_t *mtr)
+ uint32_t bit, fil_space_t *space, mtr_t *mtr)
{
if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG ||
!xdes_is_free(descr, bit)))
@@ -1001,14 +1020,15 @@ fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
if (xdes_is_full(descr))
{
+ const uint32_t limit= space->free_limit;
/* The fragment is full: move it to another list */
const uint16_t xoffset=
static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
- xdes, xoffset, mtr))
+ xdes, xoffset, limit, mtr))
return err;
xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
n_used-= FSP_EXTENT_SIZE;
@@ -1070,8 +1090,11 @@ buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
/* Else take the first extent in free_frag list */
fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
block->page.frame);
- if (first.page == FIL_NULL)
+ if (first.page >= space->free_limit)
{
+ if (first.page != FIL_NULL)
+ goto flst_corrupted;
+
/* There are no partially full fragments: allocate a free extent
and add it to the FREE_FRAG list. NOTE that the allocation may
have as a side-effect that an extent containing a descriptor
@@ -1082,13 +1105,23 @@ buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
return nullptr;
*err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
static_cast<uint16_t>(descr - xdes->page.frame +
- XDES_FLST_NODE), mtr);
+ XDES_FLST_NODE),
+ space->free_limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
return nullptr;
xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
}
else
{
+ if (first.boffset < FSP_HEADER_OFFSET + FSP_HEADER_SIZE ||
+ first.boffset >= space->physical_size() -
+ (XDES_SIZE + FIL_PAGE_DATA_END))
+ {
+ flst_corrupted:
+ *err= DB_CORRUPTION;
+ goto err_exit;
+ }
+
descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
if (!descr)
return nullptr;
@@ -1135,7 +1168,7 @@ buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
}
}
- *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+ *err= fsp_alloc_from_free_frag(block, xdes, descr, free, space, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
goto corrupted;
return fsp_page_create(space, page_no, init_mtr);
@@ -1174,7 +1207,8 @@ static dberr_t fsp_free_extent(fil_space_t* space, uint32_t offset,
space->free_len++;
return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
xdes, static_cast<uint16_t>(descr - xdes->page.frame +
- XDES_FLST_NODE), mtr);
+ XDES_FLST_NODE),
+ space->free_limit, mtr);
}
MY_ATTRIBUTE((nonnull))
@@ -1228,16 +1262,17 @@ static dberr_t fsp_free_page(fil_space_t *space, uint32_t offset, mtr_t *mtr)
const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame
+ XDES_FLST_NODE);
+ const uint32_t limit = space->free_limit;
if (state == XDES_FULL_FRAG) {
/* The fragment was full: move it to another list */
err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
@@ -1259,7 +1294,7 @@ static dberr_t fsp_free_page(fil_space_t *space, uint32_t offset, mtr_t *mtr)
if (!xdes_get_n_used(descr)) {
/* The extent has become free: move it to another list */
err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (err == DB_SUCCESS) {
err = fsp_free_extent(space, offset, mtr);
}
@@ -1353,7 +1388,7 @@ static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space,
#endif
return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- block, FSEG_INODE_PAGE_NODE, mtr);
+ block, FSEG_INODE_PAGE_NODE, space->free_limit, mtr);
}
MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -1409,12 +1444,13 @@ fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
{
/* There are no other unused headers left on the page: move it
to another list */
+ const uint32_t limit= space->free_limit;
*err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- block, FSEG_INODE_PAGE_NODE, mtr);
+ block, FSEG_INODE_PAGE_NODE, limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
return nullptr;
*err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
- block, FSEG_INODE_PAGE_NODE, mtr);
+ block, FSEG_INODE_PAGE_NODE, limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
return nullptr;
}
@@ -1447,16 +1483,17 @@ static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode,
}
const ulint physical_size= space->physical_size();
+ const uint32_t limit= space->free_limit;
if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0,
physical_size))
{
/* Move the page to another list */
if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
- iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ iblock, FSEG_INODE_PAGE_NODE, limit, mtr) != DB_SUCCESS)
return;
if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ iblock, FSEG_INODE_PAGE_NODE, limit, mtr) != DB_SUCCESS)
return;
}
@@ -1468,7 +1505,7 @@ static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode,
/* There are no other used headers left on the page: free it */
if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS)
+ iblock, FSEG_INODE_PAGE_NODE, limit, mtr) == DB_SUCCESS)
fsp_free_page(space, iblock->page.id().page_no(), mtr);
}
@@ -1841,7 +1878,8 @@ static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
static_cast<uint16_t>(inode - iblock->page.frame +
FSEG_FREE), xdes,
static_cast<uint16_t>(descr - xdes->page.frame +
- XDES_FLST_NODE), mtr))
+ XDES_FLST_NODE),
+ space->free_limit, mtr))
return err;
xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
@@ -1876,11 +1914,25 @@ fseg_alloc_free_extent(
ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
ut_d(space->modify_check(*mtr));
+ if (UNIV_UNLIKELY(page_offset(inode) < FSEG_ARR_OFFSET))
+ {
+ corrupted:
+ *err= DB_CORRUPTION;
+ space->set_corrupted();
+ return nullptr;
+ }
+
if (flst_get_len(inode + FSEG_FREE))
{
+ const fil_addr_t first= flst_get_first(inode + FSEG_FREE);
+ if (first.page >= space->free_limit ||
+ first.boffset < FSP_HEADER_OFFSET + FSP_HEADER_SIZE ||
+ first.boffset >= space->physical_size() -
+ (XDES_SIZE + FIL_PAGE_DATA_END))
+ goto corrupted;
+
/* Segment free list is not empty, allocate from it */
- return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
- mtr, xdes, err);
+ return xdes_lst_get_descriptor(*space, first, mtr, xdes, err);
}
xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
@@ -1892,7 +1944,8 @@ fseg_alloc_free_extent(
static_cast<uint16_t>(inode - iblock->page.frame +
FSEG_FREE), *xdes,
static_cast<uint16_t>(descr - (*xdes)->page.frame +
- XDES_FLST_NODE), mtr);
+ XDES_FLST_NODE),
+ space->free_limit, mtr);
if (UNIV_LIKELY(*err != DB_SUCCESS))
return nullptr;
/* Try to fill the segment free list */
@@ -1978,29 +2031,42 @@ fseg_alloc_free_page_low(
}
}
- /* In the big if-else below we look for ret_page and ret_descr */
- /*-------------------------------------------------------------*/
- if ((xdes_get_state(descr) == XDES_FSEG)
- && mach_read_from_8(descr + XDES_ID) == seg_id
- && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) {
+ const uint32_t extent_size = FSP_EXTENT_SIZE;
+ ret_descr = descr;
+ /* Try to get the page from extent which belongs to segment */
+ if (xdes_get_state(descr) == XDES_FSEG
+ && mach_read_from_8(descr + XDES_ID) == seg_id) {
+ /* Get the page from the segment extent */
+ if (xdes_is_free(descr, hint % extent_size)) {
take_hinted_page:
- /* 1. We can take the hinted page
- =================================*/
- ret_descr = descr;
- ret_page = hint;
- /* Skip the check for extending the tablespace. If the
- page hint were not within the size of the tablespace,
- we would have got (descr == NULL) above and reset the hint. */
- goto got_hinted_page;
- /*-----------------------------------------------------------*/
- } else if (xdes_get_state(descr) == XDES_FREE
- && reserved - used < reserved / FSEG_FILLFACTOR
- && used >= FSEG_FRAG_LIMIT) {
-
- /* 2. We allocate the free extent from space and can take
- =========================================================
- the hinted page
- ===============*/
+ ret_page = hint;
+ goto got_hinted_page;
+ } else if (!xdes_is_full(descr)) {
+ /* Take the page from the same extent as the
+ hinted page (and the extent already belongs to
+ the segment) */
+ ret_page = xdes_find_free(descr, hint % extent_size);
+ if (ret_page == FIL_NULL) {
+ ut_ad(!has_done_reservation);
+ return nullptr;
+ }
+ ret_page += xdes_get_offset(ret_descr);
+ goto alloc_done;
+ }
+ }
+
+ /** If the number of unused but reserved pages in a segment is
+ lesser than minimum value of 1/8 of reserved pages or
+ 4 * FSP_EXTENT_SIZE and there are at least half of extent size
+ used pages, then we allow a new empty extent to be added to
+ the segment in fseg_alloc_free_page_general(). Otherwise, we use
+ unused pages of the segment. */
+ if (used < extent_size / 2 ||
+ reserved - used >= reserved / 8 ||
+ reserved - used >= extent_size * 4) {
+ } else if (xdes_get_state(descr) == XDES_FREE) {
+ /* Allocate the free extent from space and can
+ take the hinted page */
ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
mtr, err);
@@ -2020,61 +2086,42 @@ take_hinted_page:
+ FSEG_FREE), xdes,
static_cast<uint16_t>(ret_descr
- xdes->page.frame
- + XDES_FLST_NODE), mtr);
+ + XDES_FLST_NODE),
+ space->free_limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return nullptr;
}
/* Try to fill the segment free list */
*err = fseg_fill_free_list(seg_inode, iblock, space,
- hint + FSP_EXTENT_SIZE, mtr);
+ hint + extent_size, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return nullptr;
}
goto take_hinted_page;
- /*-----------------------------------------------------------*/
- } else if ((direction != FSP_NO_DIR)
- && ((reserved - used) < reserved / FSEG_FILLFACTOR)
- && (used >= FSEG_FRAG_LIMIT)
- && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
- &xdes, space,
- mtr, err))) {
- /* 3. We take any free extent (which was already assigned above
- ===============================================================
- in the if-condition to ret_descr) and take the lowest or
- ========================================================
- highest page in it, depending on the direction
- ==============================================*/
+ } else if (direction != FSP_NO_DIR) {
+
+ ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+ &xdes, space, mtr, err);
+
+ if (!ret_descr) {
+ ut_ad(*err != DB_SUCCESS);
+ return nullptr;
+ }
+ /* Take the free extent that was assigned to ret_descr
+ above, and take the lowest or highest page in it,
+ depending on the direction */
ret_page = xdes_get_offset(ret_descr);
if (direction == FSP_DOWN) {
- ret_page += FSP_EXTENT_SIZE - 1;
- }
- ut_ad(!has_done_reservation || ret_page != FIL_NULL);
- /*-----------------------------------------------------------*/
- } else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
- return nullptr;
- } else if ((xdes_get_state(descr) == XDES_FSEG)
- && mach_read_from_8(descr + XDES_ID) == seg_id
- && (!xdes_is_full(descr))) {
-
- /* 4. We can take the page from the same extent as the
- ======================================================
- hinted page (and the extent already belongs to the
- ==================================================
- segment)
- ========*/
- ret_descr = descr;
- ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
- if (ret_page == FIL_NULL) {
- ut_ad(!has_done_reservation);
- } else {
- ret_page += xdes_get_offset(ret_descr);
+ ret_page += extent_size - 1;
}
- /*-----------------------------------------------------------*/
- } else if (reserved - used > 0) {
- /* 5. We take any unused page from the segment
- ==============================================*/
+ goto alloc_done;
+ }
+
+ /* Try to take individual page from the segment or tablespace */
+ if (reserved - used > 0) {
+ /* Take any unused page from the segment */
fil_addr_t first;
if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
@@ -2083,7 +2130,15 @@ take_hinted_page:
first = flst_get_first(seg_inode + FSEG_FREE);
} else {
ut_ad(!has_done_reservation);
- return(NULL);
+ return nullptr;
+ }
+
+ if (first.page >= space->free_limit
+ || first.boffset < FSP_HEADER_OFFSET + FSP_HEADER_SIZE
+ || first.boffset >= space->physical_size()
+ - (XDES_SIZE + FIL_PAGE_DATA_END)) {
+ *err= DB_CORRUPTION;
+ return nullptr;
}
ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
@@ -2097,10 +2152,9 @@ take_hinted_page:
} else {
ret_page += xdes_get_offset(ret_descr);
}
- /*-----------------------------------------------------------*/
- } else if (used < FSEG_FRAG_LIMIT) {
- /* 6. We allocate an individual page from the space
- ===================================================*/
+
+ } else if (used < extent_size / 2) {
+ /* Allocate an individual page from the space */
buf_block_t* block = fsp_alloc_free_page(
space, hint, mtr, init_mtr, err);
@@ -2123,13 +2177,11 @@ take_hinted_page:
/* fsp_alloc_free_page() invoked fsp_init_file_page()
already. */
return(block);
- /*-----------------------------------------------------------*/
} else {
- /* 7. We allocate a new extent and take its first page
- ======================================================*/
+ /* In worst case, try to allocate a new extent
+ and take its first page */
ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
space, mtr, err);
-
if (!ret_descr) {
ut_ad(!has_done_reservation || *err);
return nullptr;
@@ -2142,14 +2194,13 @@ take_hinted_page:
/* Page could not be allocated */
ut_ad(!has_done_reservation);
- return(NULL);
+ return nullptr;
}
-
+alloc_done:
if (space->size <= ret_page && !is_predefined_tablespace(space->id)) {
/* It must be that we are extending a single-table
tablespace whose size is still < 64 pages */
-
- if (ret_page >= FSP_EXTENT_SIZE) {
+ if (ret_page >= extent_size) {
sql_print_error("InnoDB: Trying to extend '%s'"
" by single page(s) though the"
" space size " UINT32PF "."
@@ -2157,33 +2208,34 @@ take_hinted_page:
space->chain.start->name, space->size,
ret_page);
ut_ad(!has_done_reservation);
- return(NULL);
+ return nullptr;
}
if (!fsp_try_extend_data_file_with_pages(
space, ret_page, header, mtr)) {
/* No disk space left */
ut_ad(!has_done_reservation);
- return(NULL);
+ return nullptr;
}
}
-got_hinted_page:
- /* ret_descr == NULL if the block was allocated from free_frag
- (XDES_FREE_FRAG) */
+ /* The got_hinted_page label skips the above check for extending the
+ tablespace: if the page hint were not within the size of the tablespace,
+ descr would have been nullptr above and the hint would have been reset.
+ ret_descr == nullptr if the block was allocated from free_frag (XDES_FREE_FRAG) */
if (ret_descr != NULL) {
+got_hinted_page:
/* At this point we know the extent and the page offset.
The extent is still in the appropriate list (FSEG_NOT_FULL
or FSEG_FREE), and the page is not yet marked as used. */
-
ut_d(buf_block_t* xxdes);
ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes)
== ret_descr);
ut_ad(xdes == xxdes);
- ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
+ ut_ad(xdes_is_free(ret_descr, ret_page % extent_size));
- *err = fseg_mark_page_used(seg_inode, iblock, ret_page,
- ret_descr, xdes, mtr);
+ *err = fseg_mark_page_used(space, seg_inode, iblock, ret_page,
+ ret_descr, xdes, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return nullptr;
}
@@ -2524,18 +2576,19 @@ corrupted:
const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+ XDES_FLST_NODE);
const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+ const uint32_t limit = space->free_limit;
if (xdes_is_full(descr)) {
/* The fragment is full: move it to another list */
err = flst_remove(iblock,
static_cast<uint16_t>(FSEG_FULL + ioffset),
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
err = flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ ioffset),
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
@@ -2553,7 +2606,7 @@ corrupted:
if (!xdes_get_n_used(descr)) {
err = flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ ioffset),
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
@@ -2698,11 +2751,12 @@ fseg_free_extent(
#endif /* BTR_CUR_HASH_ADAPT */
uint16_t lst;
+ uint32_t limit = space->free_limit;
if (xdes_is_full(descr)) {
lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
remove:
- err = flst_remove(iblock, lst, xdes, xoffset, mtr);
+ err = flst_remove(iblock, lst, xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
@@ -2712,7 +2766,7 @@ remove:
} else {
err = flst_remove(
iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
- xdes, xoffset, mtr);
+ xdes, xoffset, limit, mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
return err;
}
@@ -2962,7 +3016,10 @@ fseg_get_first_extent(
return nullptr;
}
- if (first.page == FIL_NULL)
+ if (first.page >= space->free_limit ||
+ first.boffset < FSP_HEADER_OFFSET + FSP_HEADER_SIZE ||
+ first.boffset >= space->physical_size() -
+ (XDES_SIZE + FIL_PAGE_DATA_END))
goto corrupted;
return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err);
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index 0775d939..4d9a1d3a 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -2187,6 +2187,22 @@ fts_trx_row_get_new_state(
return(result);
}
+/** Compare two doubly indirected pointers.
+Each argument is dereferenced twice and the two resulting pointer
+values are ordered by address. In this patch it is the comparator passed
+to rbt_create() for fts_savepoint_t::tables, whose elements are
+fts_trx_table_t*; a static_assert there requires the table member to be
+at offset 0.
+@param p1 pointer to a pointer to the first pointer value
+@param p2 pointer to a pointer to the second pointer value
+@return -1 if the first value is smaller, 1 if it is greater, 0 if equal */
+static int fts_ptr2_cmp(const void *p1, const void *p2)
+{
+ const void *a= **static_cast<const void*const*const*>(p1);
+ const void *b= **static_cast<const void*const*const*>(p2);
+ return b > a ? -1 : a > b;
+}
+
+/** Compare a singly indirected pointer to a doubly indirected one.
+The first argument is dereferenced once and the second twice; the two
+resulting pointer values are ordered by address. In this patch it is
+passed to rbt_search_cmp() with the address of a table pointer as the
+first argument.
+@param p1 pointer to the first pointer value (the search key)
+@param p2 pointer to a pointer to the second pointer value
+@return -1 if the first value is smaller, 1 if it is greater, 0 if equal */
+static int fts_ptr1_ptr2_cmp(const void *p1, const void *p2)
+{
+ const void *a= *static_cast<const void*const*>(p1);
+ const void *b= **static_cast<const void*const*const*>(p2);
+ return b > a ? -1 : a > b;
+}
+
/******************************************************************//**
Create a savepoint instance.
@return savepoint instance */
@@ -2209,8 +2225,8 @@ fts_savepoint_create(
savepoint->name = mem_heap_strdup(heap, name);
}
- savepoint->tables = rbt_create(
- sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+ static_assert(!offsetof(fts_trx_table_t, table), "ABI");
+ savepoint->tables = rbt_create(sizeof(fts_trx_table_t*), fts_ptr2_cmp);
return(savepoint);
}
@@ -2258,6 +2274,19 @@ fts_trx_create(
return(ftt);
}
+/** Compare two doc_id
+@param a first document identifier
+@param b second document identifier
+@return -1 if a < b, 1 if a > b, 0 if a == b */
+static inline int doc_id_cmp(doc_id_t a, doc_id_t b)
+{
+ return b > a ? -1 : a > b;
+}
+
+/** Compare two DOC_ID.
+Both arguments are read as doc_id_t. The rbt_create() callers in this
+patch pass pointers to structures whose first member is doc_id, each
+guarded by a static_assert on the member offset.
+@param p1 pointer to the first doc_id_t
+@param p2 pointer to the second doc_id_t
+@return -1 if *p1 < *p2, 1 if *p1 > *p2, 0 if they are equal */
+int fts_doc_id_cmp(const void *p1, const void *p2)
+{
+ return doc_id_cmp(*static_cast<const doc_id_t*>(p1),
+ *static_cast<const doc_id_t*>(p2));
+}
+
/******************************************************************//**
Create an FTS trx table.
@return FTS trx table */
@@ -2276,7 +2305,8 @@ fts_trx_table_create(
ftt->table = table;
ftt->fts_trx = fts_trx;
- ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+ static_assert(!offsetof(fts_trx_row_t, doc_id), "ABI");
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_doc_id_cmp);
return(ftt);
}
@@ -2300,7 +2330,8 @@ fts_trx_table_clone(
ftt->table = ftt_src->table;
ftt->fts_trx = ftt_src->fts_trx;
- ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+ static_assert(!offsetof(fts_trx_row_t, doc_id), "ABI");
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_doc_id_cmp);
/* Copy the rb tree values to the new savepoint. */
rbt_merge_uniq(ftt->rows, ftt_src->rows);
@@ -2325,13 +2356,9 @@ fts_trx_init(
{
fts_trx_table_t* ftt;
ib_rbt_bound_t parent;
- ib_rbt_t* tables;
- fts_savepoint_t* savepoint;
-
- savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
-
- tables = savepoint->tables;
- rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+ ib_rbt_t* tables = static_cast<fts_savepoint_t*>(
+ ib_vector_last(savepoints))->tables;
+ rbt_search_cmp(tables, &parent, &table, fts_ptr1_ptr2_cmp, nullptr);
if (parent.result == 0) {
fts_trx_table_t** fttp;
@@ -3860,6 +3887,13 @@ fts_write_node(
return(error);
}
+/** Sort an array of doc_id
+The vector storage is reinterpreted as doc_id_t elements, and the first
+doc_ids->used of them are sorted in place into ascending order.
+@param doc_ids vector of doc_id_t to be sorted */
+void fts_doc_ids_sort(ib_vector_t *doc_ids)
+{
+ doc_id_t *const data= reinterpret_cast<doc_id_t*>(doc_ids->data);
+ std::sort(data, data + doc_ids->used);
+}
+
/*********************************************************************//**
Add rows to the DELETED_CACHE table.
@return DB_SUCCESS if all went well else error code*/
@@ -3881,7 +3915,7 @@ fts_sync_add_deleted_cache(
ut_a(ib_vector_size(doc_ids) > 0);
- ib_vector_sort(doc_ids, fts_doc_id_cmp);
+ fts_doc_ids_sort(doc_ids);
info = pars_info_create();
@@ -5575,8 +5609,8 @@ fts_savepoint_rollback_last_stmt(
l_ftt = rbt_value(fts_trx_table_t*, node);
rbt_search_cmp(
- s_tables, &parent, &(*l_ftt)->table->id,
- fts_trx_table_id_cmp, NULL);
+ s_tables, &parent, &(*l_ftt)->table,
+ fts_ptr1_ptr2_cmp, nullptr);
if (parent.result == 0) {
fts_trx_table_t** s_ftt;
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
index fe31767d..30889e59 100644
--- a/storage/innobase/fts/fts0opt.cc
+++ b/storage/innobase/fts/fts0opt.cc
@@ -1016,7 +1016,7 @@ fts_table_fetch_doc_ids(
que_graph_free(graph);
if (error == DB_SUCCESS) {
- ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp);
+ fts_doc_ids_sort(doc_ids->doc_ids);
}
if (alloc_bk_trx) {
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 9c92a117..b8f22076 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -385,22 +385,6 @@ fts_query_terms_in_document(
ulint* total); /*!< out: total words in document */
#endif
-/********************************************************************
-Compare two fts_doc_freq_t doc_ids.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_freq_doc_id_cmp(
-/*================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
- const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
-
- return((int) (fq1->doc_id - fq2->doc_id));
-}
-
#if 0
/*******************************************************************//**
Print the table used for calculating LCS. */
@@ -506,14 +490,11 @@ fts_query_compare_rank(
if (r2->rank < r1->rank) {
return(-1);
} else if (r2->rank == r1->rank) {
-
if (r1->doc_id < r2->doc_id) {
- return(1);
- } else if (r1->doc_id > r2->doc_id) {
- return(1);
+ return -1;
}
- return(0);
+ return r1->doc_id > r2->doc_id;
}
return(1);
@@ -674,8 +655,9 @@ fts_query_add_word_freq(
word_freq.doc_count = 0;
+ static_assert(!offsetof(fts_doc_freq_t, doc_id), "ABI");
word_freq.doc_freqs = rbt_create(
- sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+ sizeof(fts_doc_freq_t), fts_doc_id_cmp);
parent.last = rbt_add_node(
query->word_freqs, &parent, &word_freq);
@@ -1253,8 +1235,9 @@ fts_query_intersect(
/* Create the rb tree that will hold the doc ids of
the intersection. */
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
query->intersection = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ sizeof(fts_ranking_t), fts_doc_id_cmp);
query->total_size += SIZEOF_RBT_CREATE;
@@ -1540,8 +1523,9 @@ fts_merge_doc_ids(
to create a new result set for fts_query_intersect(). */
if (query->oper == FTS_EXIST) {
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
query->intersection = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ sizeof(fts_ranking_t), fts_doc_id_cmp);
query->total_size += SIZEOF_RBT_CREATE;
}
@@ -3012,8 +2996,9 @@ fts_query_visitor(
if (query->oper == FTS_EXIST) {
ut_ad(query->intersection == NULL);
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
query->intersection = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ sizeof(fts_ranking_t), fts_doc_id_cmp);
query->total_size += SIZEOF_RBT_CREATE;
}
@@ -3123,8 +3108,8 @@ fts_ast_visit_sub_exp(
/* Create new result set to store the sub-expression result. We
will merge this result set with the parent after processing. */
- query->doc_ids = rbt_create(sizeof(fts_ranking_t),
- fts_ranking_doc_id_cmp);
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
+ query->doc_ids = rbt_create(sizeof(fts_ranking_t), fts_doc_id_cmp);
query->total_size += SIZEOF_RBT_CREATE;
@@ -3661,8 +3646,9 @@ fts_query_prepare_result(
result = static_cast<fts_result_t*>(
ut_zalloc_nokey(sizeof(*result)));
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
result->rankings_by_id = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ sizeof(fts_ranking_t), fts_doc_id_cmp);
query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
result_is_null = true;
@@ -4038,7 +4024,7 @@ fts_query(
DEBUG_SYNC_C("fts_deleted_doc_ids_append");
/* Sort the vector so that we can do a binary search over the ids. */
- ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
+ fts_doc_ids_sort(query.deleted->doc_ids);
/* Convert the query string to lower case before parsing. We own
the ut_malloc'ed result and so remember to free it before return. */
@@ -4065,8 +4051,9 @@ fts_query(
query.heap = mem_heap_create(128);
/* Create the rb tree for the doc id (current) set. */
+ static_assert(!offsetof(fts_ranking_t, doc_id), "ABI");
query.doc_ids = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ sizeof(fts_ranking_t), fts_doc_id_cmp);
query.parser = index->parser;
query.total_size += SIZEOF_RBT_CREATE;
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
index a52027f2..48e2fbe3 100644
--- a/storage/innobase/fut/fut0lst.cc
+++ b/storage/innobase/fut/fut0lst.cc
@@ -113,17 +113,18 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
}
/** Insert a node after another one.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] cur insert position block
-@param[in] coffset byte offset of the insert position
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the block to be added
-@param[in,out] mtr mini-transaction */
+@param base base node block
+@param boffset byte offset of the base node
+@param cur insert position block
+@param coffset byte offset of the insert position
+@param add block to be added
+@param aoffset byte offset of the block to be added
+@param limit fil_space_t::free_limit
+@param mtr mini-transaction */
static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset,
buf_block_t *cur, uint16_t coffset,
buf_block_t *add, uint16_t aoffset,
- mtr_t *mtr)
+ uint32_t limit, mtr_t *mtr)
{
ut_ad(base != cur || boffset != coffset);
ut_ad(base != add || boffset != aoffset);
@@ -139,6 +140,15 @@ static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset,
MTR_MEMO_PAGE_SX_FIX));
fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset);
+ if (next_addr.page >= limit)
+ {
+ if (UNIV_UNLIKELY(next_addr.page != FIL_NULL))
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(next_addr.boffset < FIL_PAGE_DATA ||
+ next_addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV,
cur->page.id().page_no(), coffset, mtr);
@@ -167,18 +177,19 @@ static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset,
}
/** Insert a node before another one.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] cur insert position block
-@param[in] coffset byte offset of the insert position
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the block to be added
-@param[in,out] mtr mini-transaction
+@param base base node block
+@param boffset byte offset of the base node
+@param cur insert position block
+@param coffset byte offset of the insert position
+@param add block to be added
+@param aoffset byte offset of the block to be added
+@param limit fil_space_t::free_limit
+@param mtr mini-transaction
@return error code */
static dberr_t flst_insert_before(buf_block_t *base, uint16_t boffset,
buf_block_t *cur, uint16_t coffset,
buf_block_t *add, uint16_t aoffset,
- mtr_t *mtr)
+ uint32_t limit, mtr_t *mtr)
{
ut_ad(base != cur || boffset != coffset);
ut_ad(base != add || boffset != aoffset);
@@ -194,6 +205,15 @@ static dberr_t flst_insert_before(buf_block_t *base, uint16_t boffset,
MTR_MEMO_PAGE_SX_FIX));
fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset);
+ if (prev_addr.page >= limit)
+ {
+ if (UNIV_UNLIKELY(prev_addr.page != FIL_NULL))
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(prev_addr.boffset < FIL_PAGE_DATA ||
+ prev_addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV,
prev_addr.page, prev_addr.boffset, mtr);
@@ -234,14 +254,9 @@ void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
flst_zero_both(block, base + FLST_FIRST, mtr);
}
-/** Append a file list node to a list.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ buf_block_t *add, uint16_t aoffset,
+ uint32_t limit, mtr_t *mtr)
{
ut_ad(base != add || boffset != aoffset);
ut_ad(boffset < base->physical_size());
@@ -258,6 +273,13 @@ dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
else
{
fil_addr_t addr= flst_get_last(base->page.frame + boffset);
+ if (UNIV_UNLIKELY(addr.page >= limit))
+ return DB_CORRUPTION;
+ else if (UNIV_UNLIKELY(addr.boffset < FIL_PAGE_DATA ||
+ addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
+
buf_block_t *cur= add;
dberr_t err;
if (addr.page != add->page.id().page_no() &&
@@ -266,19 +288,13 @@ dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
BUF_GET_POSSIBLY_FREED, mtr, &err)))
return err;
return flst_insert_after(base, boffset, cur, addr.boffset,
- add, aoffset, mtr);
+ add, aoffset, limit, mtr);
}
}
-/** Prepend a file list node to a list.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the node to be added
-@param[in,out] mtr mini-transaction
-@return error code */
dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ buf_block_t *add, uint16_t aoffset,
+ uint32_t limit, mtr_t *mtr)
{
ut_ad(base != add || boffset != aoffset);
ut_ad(boffset < base->physical_size());
@@ -296,6 +312,12 @@ dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
else
{
fil_addr_t addr= flst_get_first(base->page.frame + boffset);
+ if (UNIV_UNLIKELY(addr.page >= limit))
+ return DB_CORRUPTION;
+ else if (UNIV_UNLIKELY(addr.boffset < FIL_PAGE_DATA ||
+ addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
buf_block_t *cur= add;
dberr_t err;
if (addr.page != add->page.id().page_no() &&
@@ -304,19 +326,13 @@ dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
BUF_GET_POSSIBLY_FREED, mtr, &err)))
return err;
return flst_insert_before(base, boffset, cur, addr.boffset,
- add, aoffset, mtr);
+ add, aoffset, limit, mtr);
}
}
-/** Remove a file list node.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] cur block to be removed
-@param[in] coffset byte offset of the current record to be removed
-@param[in,out] mtr mini-transaction
-@return error code */
dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ buf_block_t *cur, uint16_t coffset,
+ uint32_t limit, mtr_t *mtr)
{
ut_ad(boffset < base->physical_size());
ut_ad(coffset < cur->physical_size());
@@ -329,9 +345,27 @@ dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
const fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset);
dberr_t err= DB_SUCCESS;
- if (prev_addr.page == FIL_NULL)
+ if (next_addr.page >= limit)
+ {
+ if (next_addr.page != FIL_NULL)
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(next_addr.boffset < FIL_PAGE_DATA ||
+ next_addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
+
+ if (prev_addr.page >= limit)
+ {
+ if (prev_addr.page != FIL_NULL)
+ return DB_CORRUPTION;
flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST,
next_addr.page, next_addr.boffset, mtr);
+ }
+ else if (UNIV_UNLIKELY(prev_addr.boffset < FIL_PAGE_DATA ||
+ prev_addr.boffset >= base->physical_size() -
+ FIL_PAGE_DATA_END))
+ return DB_CORRUPTION;
else
{
buf_block_t *b= cur;
@@ -375,25 +409,19 @@ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr)
ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- /* We use two mini-transaction handles: the first is used to lock
- the base node, and prevent other threads from modifying the list.
- The second is used to traverse the list. We cannot run the second
- mtr without committing it at times, because if the list is long,
- the x-locked pages could fill the buffer, resulting in a deadlock. */
- mtr_t mtr2;
-
const uint32_t len= flst_get_len(base->page.frame + boffset);
fil_addr_t addr= flst_get_first(base->page.frame + boffset);
for (uint32_t i= len; i--; )
{
- mtr2.start();
+ ut_ad(addr.boffset >= FIL_PAGE_DATA);
+ ut_ad(addr.boffset < base->physical_size() - FIL_PAGE_DATA_END);
const buf_block_t *b=
buf_page_get_gen(page_id_t(base->page.id().space(), addr.page),
base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr);
ut_ad(b);
addr= flst_get_next_addr(b->page.frame + addr.boffset);
- mtr2.commit();
+ mtr->release_last_page();
}
ut_ad(addr.page == FIL_NULL);
@@ -402,13 +430,14 @@ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr)
for (uint32_t i= len; i--; )
{
- mtr2.start();
+ ut_ad(addr.boffset >= FIL_PAGE_DATA);
+ ut_ad(addr.boffset < base->physical_size() - FIL_PAGE_DATA_END);
const buf_block_t *b=
buf_page_get_gen(page_id_t(base->page.id().space(), addr.page),
base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr);
ut_ad(b);
addr= flst_get_prev_addr(b->page.frame + addr.boffset);
- mtr2.commit();
+ mtr->release_last_page();
}
ut_ad(addr.page == FIL_NULL);
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index 0df9a7de..4aab68e9 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -289,10 +289,6 @@ rtr_pcur_getnext_from_path(
mtr->rollback_to_savepoint(1);
}
- ut_ad((my_latch_mode | 4) == BTR_CONT_MODIFY_TREE
- || !page_is_leaf(btr_cur_get_page(btr_cur))
- || !btr_cur->page_cur.block->page.lock.have_any());
-
const auto block_savepoint = mtr->get_savepoint();
block = buf_page_get_gen(
page_id_t(index->table->space_id,
@@ -511,7 +507,7 @@ rtr_pcur_move_to_next(
mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
cursor->btr_cur.page_cur.rec = rec.r_rec;
- cursor->btr_cur.page_cur.block = &rtr_info->matches->block;
+ cursor->btr_cur.page_cur.block = rtr_info->matches->block;
DEBUG_SYNC_C("rtr_pcur_move_to_next_return");
return(true);
@@ -672,8 +668,13 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
buf_mode, mtr, &err, false);
if (!block)
{
- if (err == DB_DECRYPTION_FAILED)
- btr_decryption_failed(*index);
+ if (err)
+ {
+ err_exit:
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ mtr->rollback_to_savepoint(savepoint);
+ }
func_exit:
if (UNIV_LIKELY_NULL(heap))
mem_heap_free(heap);
@@ -737,7 +738,8 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
#endif
}
- if (height == 0) {
+ if (height == 0)
+ {
if (rw_latch == RW_NO_LATCH)
{
ut_ad(block == mtr->at_savepoint(block_savepoint));
@@ -821,7 +823,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
if (page_cur_search_with_match(tuple, page_mode, &up_match,
&low_match, &cur->page_cur, nullptr)) {
err= DB_CORRUPTION;
- goto func_exit;
+ goto err_exit;
}
}
@@ -1316,21 +1318,15 @@ rtr_create_rtr_info(
rtr_info->index = index;
if (init_matches) {
- rtr_info->heap = mem_heap_create(sizeof(*(rtr_info->matches)));
rtr_info->matches = static_cast<matched_rec_t*>(
- mem_heap_zalloc(
- rtr_info->heap,
- sizeof(*rtr_info->matches)));
+ ut_zalloc_nokey(sizeof *rtr_info->matches));
rtr_info->matches->matched_recs
= UT_NEW_NOKEY(rtr_rec_vector());
- rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf
- + UNIV_PAGE_SIZE_MAX + 1);
mysql_mutex_init(rtr_match_mutex_key,
&rtr_info->matches->rtr_match_mutex,
nullptr);
- rtr_info->matches->block.page.lock.init();
}
rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
@@ -1449,18 +1445,16 @@ rtr_clean_rtr_info(
if (free_all) {
if (rtr_info->matches) {
- if (rtr_info->matches->matched_recs != NULL) {
- UT_DELETE(rtr_info->matches->matched_recs);
+ if (rtr_info->matches->block) {
+ buf_block_free(rtr_info->matches->block);
+ rtr_info->matches->block = nullptr;
}
- rtr_info->matches->block.page.lock.free();
+ UT_DELETE(rtr_info->matches->matched_recs);
mysql_mutex_destroy(
&rtr_info->matches->rtr_match_mutex);
- }
-
- if (rtr_info->heap) {
- mem_heap_free(rtr_info->heap);
+ ut_free(rtr_info->matches);
}
if (initialized) {
@@ -1570,7 +1564,7 @@ rtr_check_discard_page(
if (auto matches = rtr_info->matches) {
mysql_mutex_lock(&matches->rtr_match_mutex);
- if (matches->block.page.id() == id) {
+ if (matches->block->page.id() == id) {
matches->matched_recs->clear();
matches->valid = false;
}
@@ -1584,23 +1578,6 @@ rtr_check_discard_page(
lock_sys.prdt_page_free_from_discard(id, true);
}
-/** Structure acts as functor to get the optimistic access of the page.
-It returns true if it successfully gets the page. */
-struct optimistic_get
-{
- btr_pcur_t *const r_cursor;
- mtr_t *const mtr;
-
- optimistic_get(btr_pcur_t *r_cursor,mtr_t *mtr)
- :r_cursor(r_cursor), mtr(mtr) {}
-
- bool operator()(buf_block_t *hint) const
- {
- return hint && buf_page_optimistic_get(
- RW_X_LATCH, hint, r_cursor->modify_clock, mtr);
- }
-};
-
/** Restore the stored position of a persistent cursor bufferfixing the page */
static
bool
@@ -1632,8 +1609,11 @@ rtr_cur_restore_position(
r_cursor->modify_clock = 100;
);
- if (r_cursor->block_when_stored.run_with_hint(
- optimistic_get(r_cursor, mtr))) {
+ if (buf_page_optimistic_fix(r_cursor->btr_cur.page_cur.block,
+ r_cursor->old_page_id)
+ && buf_page_optimistic_get(r_cursor->btr_cur.page_cur.block,
+ RW_X_LATCH, r_cursor->modify_clock,
+ mtr)) {
ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(r_cursor->rel_pos == BTR_PCUR_ON);
@@ -1778,7 +1758,7 @@ rtr_leaf_push_match_rec(
ulint data_len;
rtr_rec_t rtr_rec;
- buf = match_rec->block.page.frame + match_rec->used;
+ buf = match_rec->block->page.frame + match_rec->used;
ut_ad(page_rec_is_leaf(rec));
copy = rec_copy(buf, rec, offsets);
@@ -1875,43 +1855,6 @@ rtr_non_leaf_insert_stack_push(
new_seq, level, child_no, my_cursor, mbr_inc);
}
-/** Copy a buf_block_t, except "block->page.lock".
-@param[in,out] matches copy to match->block
-@param[in] block block to copy */
-static
-void
-rtr_copy_buf(
- matched_rec_t* matches,
- const buf_block_t* block)
-{
- /* Copy all members of "block" to "matches->block" except "lock".
- We skip "lock" because it is not used
- from the dummy buf_block_t we create here and because memcpy()ing
- it generates (valid) compiler warnings that the vtable pointer
- will be copied. */
- matches->block.page.lock.free();
- new (&matches->block.page) buf_page_t(block->page);
- matches->block.page.frame = block->page.frame;
- matches->block.unzip_LRU = block->unzip_LRU;
-
- ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list);
- ut_d(matches->block.in_withdraw_list = block->in_withdraw_list);
-
- /* Skip buf_block_t::lock */
- matches->block.modify_clock = block->modify_clock;
-#ifdef BTR_CUR_HASH_ADAPT
- matches->block.n_hash_helps = block->n_hash_helps;
- matches->block.n_fields = block->n_fields;
- matches->block.left_side = block->left_side;
-#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- matches->block.n_pointers = 0;
-#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
- matches->block.curr_n_fields = block->curr_n_fields;
- matches->block.curr_left_side = block->curr_left_side;
- matches->block.index = block->index;
-#endif /* BTR_CUR_HASH_ADAPT */
-}
-
/****************************************************************//**
Generate a shadow copy of the page block header to save the
matched records */
@@ -1925,16 +1868,18 @@ rtr_init_match(
{
ut_ad(matches->matched_recs->empty());
matches->locked = false;
- rtr_copy_buf(matches, block);
- matches->block.page.frame = matches->bufp;
matches->valid = false;
+ if (!matches->block) {
+ matches->block = buf_block_alloc();
+ }
+
+ matches->block->page.init(buf_page_t::MEMORY, block->page.id());
/* We have to copy PAGE_*_SUPREMUM_END bytes so that we can
use infimum/supremum of this page as normal btr page for search. */
- memcpy(matches->block.page.frame, page, page_is_comp(page)
- ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END);
matches->used = page_is_comp(page)
? PAGE_NEW_SUPREMUM_END
: PAGE_OLD_SUPREMUM_END;
+ memcpy(matches->block->page.frame, page, matches->used);
#ifdef RTR_SEARCH_DIAGNOSTIC
ulint pageno = page_get_page_no(page);
fprintf(stderr, "INNODB_RTR: Searching leaf page %d\n",
@@ -2361,7 +2306,7 @@ rtr_cur_search_with_match(
#endif /* UNIV_DEBUG */
/* Pop the last match record and position on it */
match_rec->matched_recs->pop_back();
- page_cur_position(test_rec.r_rec, &match_rec->block,
+ page_cur_position(test_rec.r_rec, match_rec->block,
cursor);
}
} else {
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 407834f2..dfe034ec 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -47,10 +47,13 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include <my_bitmap.h>
#include <mysql/service_thd_alloc.h>
#include <mysql/service_thd_wait.h>
+#include <mysql/service_print_check_msg.h>
#include "sql_type_geom.h"
#include "scope.h"
#include "srv0srv.h"
+extern my_bool opt_readonly;
+
// MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system;
// MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[];
@@ -116,6 +119,7 @@ thread_local ha_handler_stats *mariadb_stats= &mariadb_dummy_stats;
#include "snappy-c.h"
#include <limits>
+#include <myisamchk.h> // TT_FOR_UPGRADE
#define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
@@ -874,6 +878,10 @@ static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
/* check_func */ NULL, /* update_func */ NULL,
/* default */ TRUE);
+static MYSQL_THDVAR_BOOL(snapshot_isolation, PLUGIN_VAR_OPCMDARG,
+ "Use snapshot isolation (write-write conflict detection).",
+ NULL, NULL, FALSE);
+
static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
"Use strict mode when evaluating create options.",
NULL, NULL, TRUE);
@@ -2166,6 +2174,9 @@ convert_error_code_to_mysql(
return(HA_ERR_LOCK_DEADLOCK);
+ case DB_RECORD_CHANGED:
+ return HA_ERR_RECORD_CHANGED;
+
case DB_LOCK_WAIT_TIMEOUT:
/* Starting from 5.0.13, we let MySQL just roll back the
latest SQL statement in a lock wait timeout. Previously, we
@@ -2809,6 +2820,8 @@ innobase_trx_init(
trx->check_unique_secondary = !thd_test_options(
thd, OPTION_RELAXED_UNIQUE_CHECKS);
+ trx->snapshot_isolation = THDVAR(thd, snapshot_isolation) & 1;
+
#ifdef WITH_WSREP
trx->wsrep = wsrep_on(thd);
#endif
@@ -4003,7 +4016,7 @@ static int innodb_init_params()
data_mysql_default_charset_coll = (ulint) default_charset_info->number;
-#ifdef HAVE_FCNTL_DIRECT
+#if !defined _WIN32 && defined O_DIRECT
if (srv_use_atomic_writes && my_may_have_atomic_write) {
/*
Force O_DIRECT on Unixes (on Windows writes are always
@@ -4342,7 +4355,7 @@ innobase_start_trx_and_assign_read_view(
Do this only if transaction is using REPEATABLE READ isolation
level. */
trx->isolation_level = innobase_map_isolation_level(
- thd_get_trx_isolation(thd));
+ thd_get_trx_isolation(thd)) & 3;
if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
trx->read_view.open(trx);
@@ -5345,67 +5358,6 @@ test_normalize_table_name_low()
}
}
}
-
-/*********************************************************************
-Test ut_format_name(). */
-static
-void
-test_ut_format_name()
-/*=================*/
-{
- char buf[NAME_LEN * 3];
-
- struct {
- const char* name;
- ulint buf_size;
- const char* expected;
- } test_data[] = {
- {"test/t1", sizeof(buf), "`test`.`t1`"},
- {"test/t1", 12, "`test`.`t1`"},
- {"test/t1", 11, "`test`.`t1"},
- {"test/t1", 10, "`test`.`t"},
- {"test/t1", 9, "`test`.`"},
- {"test/t1", 8, "`test`."},
- {"test/t1", 7, "`test`"},
- {"test/t1", 6, "`test"},
- {"test/t1", 5, "`tes"},
- {"test/t1", 4, "`te"},
- {"test/t1", 3, "`t"},
- {"test/t1", 2, "`"},
- {"test/t1", 1, ""},
- {"test/t1", 0, "BUF_NOT_CHANGED"},
- {"table", sizeof(buf), "`table`"},
- {"ta'le", sizeof(buf), "`ta'le`"},
- {"ta\"le", sizeof(buf), "`ta\"le`"},
- {"ta`le", sizeof(buf), "`ta``le`"},
- };
-
- for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
-
- memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
-
- char* ret;
-
- ret = ut_format_name(test_data[i].name,
- buf,
- test_data[i].buf_size);
-
- ut_a(ret == buf);
-
- if (strcmp(buf, test_data[i].expected) == 0) {
- ib::info() << "ut_format_name(" << test_data[i].name
- << ", buf, " << test_data[i].buf_size << "),"
- " expected " << test_data[i].expected
- << ", OK";
- } else {
- ib::error() << "ut_format_name(" << test_data[i].name
- << ", buf, " << test_data[i].buf_size << "),"
- " expected " << test_data[i].expected
- << ", ERROR: got " << buf;
- ut_error;
- }
- }
-}
#endif /* !DBUG_OFF */
/** Match index columns between MySQL and InnoDB.
@@ -5763,9 +5715,9 @@ func_exit:
return ret;
}
-/********************************************************************//**
-Get the upper limit of the MySQL integral and floating-point type.
-@return maximum allowed value for the field */
+/** Get the maximum integer value of a numeric column.
+@param field column definition
+@return maximum allowed integer value */
ulonglong innobase_get_int_col_max_value(const Field *field)
{
ulonglong max_value = 0;
@@ -5830,46 +5782,45 @@ ha_innobase::open().
@param[in,out] table persistent table
@param[in] field the AUTO_INCREMENT column */
-static
-void
-initialize_auto_increment(dict_table_t* table, const Field* field)
-{
- ut_ad(!table->is_temporary());
-
- const unsigned col_no = innodb_col_no(field);
-
- table->autoinc_mutex.wr_lock();
-
- table->persistent_autoinc = static_cast<uint16_t>(
- dict_table_get_nth_col_pos(table, col_no, NULL) + 1)
- & dict_index_t::MAX_N_FIELDS;
+static void initialize_auto_increment(dict_table_t *table, const Field& field,
+ const TABLE_SHARE &s)
+{
+ ut_ad(!table->is_temporary());
+ const unsigned col_no= innodb_col_no(&field);
+ table->autoinc_mutex.wr_lock();
+ table->persistent_autoinc=
+ uint16_t(dict_table_get_nth_col_pos(table, col_no, nullptr) + 1) &
+ dict_index_t::MAX_N_FIELDS;
+ if (table->autoinc)
+ /* Already initialized. Our caller checked
+ table->persistent_autoinc without
+ autoinc_mutex protection, and there might be multiple
+ ha_innobase::open() executing concurrently. */;
+ else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN)
+ /* If innodb_force_recovery is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */;
+ else if (table->persistent_autoinc)
+ {
+ uint64_t max_value= innobase_get_int_col_max_value(&field);
+ table->autoinc=
+ innobase_next_autoinc(btr_read_autoinc_with_fallback(table, col_no,
+ s.mysql_version,
+ max_value),
+ 1 /* need */,
+ 1 /* auto_increment_increment */,
+ 0 /* auto_increment_offset */,
+ max_value);
+ }
- if (table->autoinc) {
- /* Already initialized. Our caller checked
- table->persistent_autoinc without
- autoinc_mutex protection, and there might be multiple
- ha_innobase::open() executing concurrently. */
- } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
- /* If the recovery level is set so high that writes
- are disabled we force the AUTOINC counter to 0
- value effectively disabling writes to the table.
- Secondly, we avoid reading the table in case the read
- results in failure due to a corrupted table/index.
-
- We will not return an error to the client, so that the
- tables can be dumped with minimal hassle. If an error
- were returned in this case, the first attempt to read
- the table would fail and subsequent SELECTs would succeed. */
- } else if (table->persistent_autoinc) {
- table->autoinc = innobase_next_autoinc(
- btr_read_autoinc_with_fallback(table, col_no),
- 1 /* need */,
- 1 /* auto_increment_increment */,
- 0 /* auto_increment_offset */,
- innobase_get_int_col_max_value(field));
- }
-
- table->autoinc_mutex.wr_unlock();
+ table->autoinc_mutex.wr_unlock();
}
/** Open an InnoDB table
@@ -6105,7 +6056,7 @@ ha_innobase::open(const char* name, int, uint)
|| m_prebuilt->table->persistent_autoinc
|| !m_prebuilt->table->is_readable()) {
} else if (const Field* ai = table->found_next_number_field) {
- initialize_auto_increment(m_prebuilt->table, ai);
+ initialize_auto_increment(m_prebuilt->table, *ai, *table->s);
}
/* Set plugin parser for fulltext index */
@@ -7394,26 +7345,55 @@ ha_innobase::build_template(
ulint num_v = 0;
- if (active_index != MAX_KEY
- && active_index == pushed_idx_cond_keyno) {
- m_prebuilt->idx_cond = this;
- goto icp;
- } else if (pushed_rowid_filter && rowid_filter_is_active) {
-icp:
- /* Push down an index condition or an end_range check. */
+ /* MDEV-31154: For a pushed down index condition we don't support virtual
+ columns, and idx_cond_push() does check for that. For row ID filtering we
+ don't need such restrictions, but we get into trouble trying to use the
+ ICP path.
+
+ 1. It should be fine to follow no_icp path if primary key is generated.
+ However, with user specified primary key(PK), the row is identified by
+ the PK and those columns need to be converted to mysql format in
+ row_search_idx_cond_check before doing the comparison. Since secondary
+ indexes always have PK appended in innodb, it works with current ICP
+ handling code when fetch_primary_key_cols is set to TRUE.
+
+ 2. Although ICP comparison and row ID comparison work on different
+ columns, the current ICP code can be shared by both.
+
+ 3. In most cases, it works today by jumping to no_icp when we
+ encounter a virtual column. This is hackish and already has some
+ issues, as it cannot handle a PK and not all state is reset properly;
+ for example, idx_cond_n_cols is not reset.
+
+ 4. We already encountered MDEV-28747, where m_prebuilt->idx_cond was being set.
+
+ Neither ICP nor row ID comparison needs virtual columns and the code is
+ simplified to handle both. It should handle the issues. */
+
+ const bool pushed_down = active_index != MAX_KEY
+ && active_index == pushed_idx_cond_keyno;
+
+ m_prebuilt->idx_cond = pushed_down ? this : nullptr;
+
+ if (m_prebuilt->idx_cond || m_prebuilt->pk_filter) {
+ /* Push down an index condition, end_range check or row ID
+ filter */
for (ulint i = 0; i < n_fields; i++) {
const Field* field = table->field[i];
const bool is_v = !field->stored_in_db();
- if (is_v && skip_virtual) {
- num_v++;
- continue;
- }
+
bool index_contains = index->contains_col_or_prefix(
is_v ? num_v : i - num_v, is_v);
- if (is_v && index_contains) {
- m_prebuilt->n_template = 0;
- num_v = 0;
- goto no_icp;
+
+ if (is_v) {
+ if (index_contains) {
+ /* We want to ensure that ICP is not
+ used with virtual columns. */
+ ut_ad(!pushed_down);
+ m_prebuilt->idx_cond = nullptr;
+ }
+ num_v++;
+ continue;
}
/* Test if an end_range or an index condition
@@ -7433,7 +7413,7 @@ icp:
which would be acceptable if end_range==NULL. */
if (build_template_needs_field_in_icp(
index, m_prebuilt, index_contains,
- is_v ? num_v : i - num_v, is_v)) {
+ i - num_v, false)) {
if (!whole_row) {
field = build_template_needs_field(
index_contains,
@@ -7442,15 +7422,10 @@ icp:
fetch_primary_key_cols,
index, table, i, num_v);
if (!field) {
- if (is_v) {
- num_v++;
- }
continue;
}
}
- ut_ad(!is_v);
-
mysql_row_templ_t* templ= build_template_field(
m_prebuilt, clust_index, index,
table, field, i - num_v, 0);
@@ -7527,15 +7502,16 @@ icp:
*/
}
- if (is_v) {
- num_v++;
- }
}
- ut_ad(m_prebuilt->idx_cond_n_cols > 0);
- ut_ad(m_prebuilt->idx_cond_n_cols == m_prebuilt->n_template);
-
num_v = 0;
+ ut_ad(m_prebuilt->idx_cond_n_cols == m_prebuilt->n_template);
+ if (m_prebuilt->idx_cond_n_cols == 0) {
+ /* No columns to push down. It is safe to jump to the no ICP
+ path. */
+ m_prebuilt->idx_cond = nullptr;
+ goto no_icp;
+ }
/* Include the fields that are not needed in index condition
pushdown. */
@@ -7550,7 +7526,7 @@ icp:
bool index_contains = index->contains_col_or_prefix(
is_v ? num_v : i - num_v, is_v);
- if (!build_template_needs_field_in_icp(
+ if (is_v || !build_template_needs_field_in_icp(
index, m_prebuilt, index_contains,
is_v ? num_v : i - num_v, is_v)) {
/* Not needed in ICP */
@@ -7583,7 +7559,7 @@ icp:
} else {
no_icp:
/* No index condition pushdown */
- m_prebuilt->idx_cond = NULL;
+ ut_ad(!m_prebuilt->idx_cond);
ut_ad(num_v == 0);
for (ulint i = 0; i < n_fields; i++) {
@@ -8737,6 +8713,7 @@ ha_innobase::delete_row(
: PLAIN_DELETE;
trx->fts_next_doc_id = 0;
+ ut_ad(!trx->is_bulk_insert());
error = row_update_for_mysql(m_prebuilt);
#ifdef WITH_WSREP
@@ -8844,47 +8821,63 @@ ha_innobase::index_end(void)
DBUG_RETURN(0);
}
-/*********************************************************************//**
-Converts a search mode flag understood by MySQL to a flag understood
-by InnoDB. */
-page_cur_mode_t
-convert_search_mode_to_innobase(
-/*============================*/
- ha_rkey_function find_flag)
-{
- switch (find_flag) {
- case HA_READ_KEY_EXACT:
- /* this does not require the index to be UNIQUE */
- case HA_READ_KEY_OR_NEXT:
- return(PAGE_CUR_GE);
- case HA_READ_AFTER_KEY:
- return(PAGE_CUR_G);
- case HA_READ_BEFORE_KEY:
- return(PAGE_CUR_L);
- case HA_READ_KEY_OR_PREV:
- case HA_READ_PREFIX_LAST:
- case HA_READ_PREFIX_LAST_OR_PREV:
- return(PAGE_CUR_LE);
- case HA_READ_MBR_CONTAIN:
- return(PAGE_CUR_CONTAIN);
- case HA_READ_MBR_INTERSECT:
- return(PAGE_CUR_INTERSECT);
- case HA_READ_MBR_WITHIN:
- return(PAGE_CUR_WITHIN);
- case HA_READ_MBR_DISJOINT:
- return(PAGE_CUR_DISJOINT);
- case HA_READ_MBR_EQUAL:
- return(PAGE_CUR_MBR_EQUAL);
- case HA_READ_PREFIX:
- return(PAGE_CUR_UNSUPP);
- /* do not use "default:" in order to produce a gcc warning:
- enumeration value '...' not handled in switch
- (if -Wswitch or -Wall is used) */
- }
-
- my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
-
- return(PAGE_CUR_UNSUPP);
+/** Convert a MariaDB search mode to an InnoDB search mode.
+@tparam last_match whether last_match_mode is to be set (it is dereferenced only in that case)
+@param find_flag MariaDB search mode
+@param[out] mode InnoDB search mode; PAGE_CUR_LE unless find_flag selects another mode
+@param[out] last_match_mode pointer to ha_innobase::m_last_match_mode; receives 0, ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
+@return whether the search mode is unsupported (of the listed flags, only HA_READ_PREFIX) */
+template<bool last_match= false>
+static bool convert_search_mode_to_innobase(ha_rkey_function find_flag,
+ page_cur_mode_t &mode,
+ uint *last_match_mode= nullptr)
+{
+ mode= PAGE_CUR_LE; /* default; the cases that return without assigning mode rely on it */
+ if (last_match)
+ *last_match_mode= 0;
+
+ switch (find_flag) {
+ case HA_READ_KEY_EXACT:
+ /* this does not require the index to be UNIQUE */
+ if (last_match)
+ *last_match_mode= ROW_SEL_EXACT;
+ /* fall through */
+ case HA_READ_KEY_OR_NEXT:
+ mode= PAGE_CUR_GE;
+ return false;
+ case HA_READ_AFTER_KEY:
+ mode= PAGE_CUR_G;
+ return false;
+ case HA_READ_BEFORE_KEY:
+ mode= PAGE_CUR_L;
+ return false;
+ case HA_READ_PREFIX_LAST:
+ if (last_match)
+ *last_match_mode= ROW_SEL_EXACT_PREFIX;
+ /* fall through */
+ case HA_READ_KEY_OR_PREV:
+ case HA_READ_PREFIX_LAST_OR_PREV:
+ return false; /* mode remains PAGE_CUR_LE */
+ case HA_READ_MBR_CONTAIN:
+ mode= PAGE_CUR_CONTAIN;
+ return false;
+ case HA_READ_MBR_INTERSECT:
+ mode= PAGE_CUR_INTERSECT;
+ return false;
+ case HA_READ_MBR_WITHIN:
+ mode= PAGE_CUR_WITHIN;
+ return false;
+ case HA_READ_MBR_DISJOINT:
+ mode= PAGE_CUR_DISJOINT;
+ return false;
+ case HA_READ_PREFIX:
+ break; /* unsupported: reported by the return below */
+ }
+
+ return true;
+}
/*
@@ -8962,8 +8955,7 @@ ha_innobase::index_read(
mariadb_set_stats set_stats_temporary(handler_stats);
DEBUG_SYNC_C("ha_innobase_index_read_begin");
- ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd));
- ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
+ ut_ad(m_prebuilt->trx == thd_to_trx(m_user_thd));
dict_index_t* index = m_prebuilt->index;
@@ -8999,7 +8991,8 @@ ha_innobase::index_read(
build_template(false);
}
- if (key_ptr != NULL) {
+ if (key_len) {
+ ut_ad(key_ptr);
/* Convert the search key value to InnoDB format into
m_prebuilt->search_tuple */
@@ -9009,84 +9002,58 @@ ha_innobase::index_read(
m_prebuilt->srch_key_val_len,
index,
(byte*) key_ptr,
- (ulint) key_len);
+ key_len);
DBUG_ASSERT(m_prebuilt->search_tuple->n_fields > 0);
} else {
+ ut_ad(find_flag != HA_READ_KEY_EXACT);
/* We position the cursor to the last or the first entry
in the index */
dtuple_set_n_fields(m_prebuilt->search_tuple, 0);
}
- page_cur_mode_t mode = convert_search_mode_to_innobase(find_flag);
-
- ulint match_mode = 0;
+ page_cur_mode_t mode;
- if (find_flag == HA_READ_KEY_EXACT) {
-
- match_mode = ROW_SEL_EXACT;
-
- } else if (find_flag == HA_READ_PREFIX_LAST) {
-
- match_mode = ROW_SEL_EXACT_PREFIX;
+ if (convert_search_mode_to_innobase<true>(find_flag, mode,
+ &m_last_match_mode)) {
+ table->status = STATUS_NOT_FOUND;
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
}
- m_last_match_mode = (uint) match_mode;
-
- dberr_t ret = mode == PAGE_CUR_UNSUPP ? DB_UNSUPPORTED
- : row_search_mvcc(buf, mode, m_prebuilt, match_mode, 0);
+ dberr_t ret =
+ row_search_mvcc(buf, mode, m_prebuilt, m_last_match_mode, 0);
DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;);
- int error;
-
- switch (ret) {
- case DB_SUCCESS:
- error = 0;
+ if (UNIV_LIKELY(ret == DB_SUCCESS)) {
table->status = 0;
- break;
-
- case DB_RECORD_NOT_FOUND:
- error = HA_ERR_KEY_NOT_FOUND;
- table->status = STATUS_NOT_FOUND;
- break;
+ DBUG_RETURN(0);
+ }
- case DB_END_OF_INDEX:
- error = HA_ERR_KEY_NOT_FOUND;
- table->status = STATUS_NOT_FOUND;
- break;
+ table->status = STATUS_NOT_FOUND;
+ switch (ret) {
case DB_TABLESPACE_DELETED:
ib_senderrf(
m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
ER_TABLESPACE_DISCARDED,
table->s->table_name.str);
-
- table->status = STATUS_NOT_FOUND;
- error = HA_ERR_TABLESPACE_MISSING;
- break;
-
+ DBUG_RETURN(HA_ERR_TABLESPACE_MISSING);
+ case DB_RECORD_NOT_FOUND:
+ case DB_END_OF_INDEX:
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
case DB_TABLESPACE_NOT_FOUND:
-
ib_senderrf(
m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
ER_TABLESPACE_MISSING,
table->s->table_name.str);
-
- table->status = STATUS_NOT_FOUND;
- error = HA_ERR_TABLESPACE_MISSING;
- break;
-
+ DBUG_RETURN(HA_ERR_TABLESPACE_MISSING);
default:
- error = convert_error_code_to_mysql(
- ret, m_prebuilt->table->flags, m_user_thd);
-
- table->status = STATUS_NOT_FOUND;
- break;
+ DBUG_RETURN(convert_error_code_to_mysql(
+ ret, m_prebuilt->table->flags,
+ m_user_thd));
}
-
- DBUG_RETURN(error);
}
/*******************************************************************//**
@@ -9513,8 +9480,6 @@ ha_innobase::rnd_pos(
DBUG_ENTER("rnd_pos");
DBUG_DUMP("key", pos, ref_length);
- ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
-
/* Note that we assume the length of the row reference is fixed
for the table, and it is == ref_length */
@@ -9938,7 +9903,8 @@ wsrep_append_foreign_key(
}
ulint rcode = DB_SUCCESS;
- char cache_key[513] = {'\0'};
+ char cache_key[MAX_FULL_NAME_LEN] = {'\0'};
+ char db_name[MAX_DATABASE_NAME_LEN+1] = {'\0'};
size_t cache_key_len = 0;
if ( !((referenced) ?
@@ -10028,14 +9994,38 @@ wsrep_append_foreign_key(
return DB_ERROR;
}
- strncpy(cache_key,
+ char * fk_table =
(wsrep_protocol_version > 1) ?
((referenced) ?
foreign->referenced_table->name.m_name :
foreign->foreign_table->name.m_name) :
- foreign->foreign_table->name.m_name, sizeof(cache_key) - 1);
- cache_key_len = strlen(cache_key);
+ foreign->foreign_table->name.m_name;
+
+ /* convert db and table name parts separately to system charset */
+ ulint db_name_len = dict_get_db_name_len(fk_table);
+ strmake(db_name, fk_table, db_name_len);
+ uint errors;
+ cache_key_len= innobase_convert_to_system_charset(cache_key,
+ db_name, sizeof(cache_key), &errors);
+ if (errors) {
+ WSREP_WARN("unexpected foreign key table %s %s",
+ foreign->referenced_table->name.m_name,
+ foreign->foreign_table->name.m_name);
+ return DB_ERROR;
+ }
+ /* after db name adding 0 and then converted table name */
+ cache_key[db_name_len]= '\0';
+ cache_key_len++;
+
+ cache_key_len+= innobase_convert_to_system_charset(cache_key+cache_key_len,
+ fk_table+db_name_len+1, sizeof(cache_key), &errors);
+ if (errors) {
+ WSREP_WARN("unexpected foreign key table %s %s",
+ foreign->referenced_table->name.m_name,
+ foreign->foreign_table->name.m_name);
+ return DB_ERROR;
+ }
#ifdef WSREP_DEBUG_PRINT
ulint j;
fprintf(stderr, "FK parent key, table: %s %s len: %lu ",
@@ -10045,16 +10035,6 @@ wsrep_append_foreign_key(
}
fprintf(stderr, "\n");
#endif
- char *p = strchr(cache_key, '/');
-
- if (p) {
- *p = '\0';
- } else {
- WSREP_WARN("unexpected foreign key table %s %s",
- foreign->referenced_table->name.m_name,
- foreign->foreign_table->name.m_name);
- }
-
wsrep_buf_t wkey_part[3];
wsrep_key_t wkey = {wkey_part, 3};
@@ -12269,7 +12249,7 @@ create_table_info_t::create_foreign_keys()
dict_index_t* index = NULL;
fkerr_t index_error = FK_SUCCESS;
dict_index_t* err_index = NULL;
- ulint err_col;
+ ulint err_col = 0;
const bool tmp_table = m_flags2 & DICT_TF2_TEMPORARY;
const CHARSET_INFO* cs = thd_charset(m_thd);
const char* operation = "Create ";
@@ -13383,6 +13363,49 @@ ha_innobase::discard_or_import_tablespace(
DBUG_RETURN(0);
}
+/** Report a DROP TABLE failure due to a FOREIGN KEY constraint in dict_foreign_err_file.
+@param name name of the table that cannot be dropped
+@param foreign constraint whose foreign_table_name is reported as the referencing table */
+ATTRIBUTE_COLD
+static void delete_table_cannot_drop_foreign(const table_name_t &name,
+ const dict_foreign_t &foreign)
+{
+ mysql_mutex_lock(&dict_foreign_err_mutex); /* held until the report is complete */
+ rewind(dict_foreign_err_file); /* start writing at the beginning of the file */
+ ut_print_timestamp(dict_foreign_err_file);
+ fputs(" Cannot drop table ", dict_foreign_err_file);
+ ut_print_name(dict_foreign_err_file, nullptr, name.m_name);
+ fputs("\nbecause it is referenced by ", dict_foreign_err_file);
+ ut_print_name(dict_foreign_err_file, nullptr, foreign.foreign_table_name);
+ putc('\n', dict_foreign_err_file);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+}
+
+/** Check if DROP TABLE would fail due to a FOREIGN KEY constraint, reporting the first blocking one.
+@param table table to be dropped
+@param sqlcom thd_sql_command(current_thd); SQLCOM_DROP_DB relaxes the check
+@return whether child tables that refer to this table exist */
+static bool delete_table_check_foreigns(const dict_table_t &table,
+ enum_sql_command sqlcom)
+{
+ const bool drop_db{sqlcom == SQLCOM_DROP_DB};
+ for (const auto foreign : table.referenced_set)
+ {
+ /* We should allow dropping a referenced table if creating
+ that referenced table has failed for some reason. For example,
+ if the referenced table is created but its column types that are
+ referenced do not match. */
+ if (foreign->foreign_table == &table || /* the table references itself */
+ (drop_db &&
+ dict_tables_have_same_db(table.name.m_name,
+ foreign->foreign_table_name_lookup)))
+ continue;
+ delete_table_cannot_drop_foreign(table.name, *foreign);
+ return true;
+ }
+
+ return false;
+}
/** DROP TABLE (possibly as part of DROP DATABASE, CREATE/ALTER TABLE)
@param name table name
@@ -13397,8 +13420,8 @@ int ha_innobase::delete_table(const char *name)
DBUG_EXECUTE_IF("test_normalize_table_name_low",
test_normalize_table_name_low(););
- DBUG_EXECUTE_IF("test_ut_format_name", test_ut_format_name(););
+ const enum_sql_command sqlcom= enum_sql_command(thd_sql_command(thd));
trx_t *parent_trx= check_trx_exists(thd);
dict_table_t *table;
@@ -13435,6 +13458,13 @@ int ha_innobase::delete_table(const char *name)
DBUG_RETURN(0);
}
+ if (parent_trx->check_foreigns &&
+ delete_table_check_foreigns(*table, sqlcom))
+ {
+ dict_sys.unlock();
+ DBUG_RETURN(HA_ERR_ROW_IS_REFERENCED);
+ }
+
table->acquire();
dict_sys.unlock();
@@ -13467,14 +13497,7 @@ int ha_innobase::delete_table(const char *name)
/* FOREIGN KEY constraints cannot exist on partitioned tables. */;
#endif
else
- {
- dict_sys.freeze(SRW_LOCK_CALL);
- for (const dict_foreign_t* f : table->referenced_set)
- if (dict_table_t* child= f->foreign_table)
- if ((err= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS)
- break;
- dict_sys.unfreeze();
- }
+ err= lock_table_children(table, trx);
}
dict_table_t *table_stats= nullptr, *index_stats= nullptr;
@@ -13484,7 +13507,6 @@ int ha_innobase::delete_table(const char *name)
const bool fts= err == DB_SUCCESS &&
(table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS));
- const enum_sql_command sqlcom= enum_sql_command(thd_sql_command(thd));
if (fts)
{
@@ -13642,36 +13664,16 @@ err_exit:
DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
}
- if (!table->no_rollback() && trx->check_foreigns)
+ if (!table->no_rollback())
{
- const bool drop_db= sqlcom == SQLCOM_DROP_DB;
- for (auto foreign : table->referenced_set)
+ if (trx->check_foreigns && delete_table_check_foreigns(*table, sqlcom))
{
- /* We should allow dropping a referenced table if creating
- that referenced table has failed for some reason. For example
- if referenced table is created but it column types that are
- referenced do not match. */
- if (foreign->foreign_table == table ||
- (drop_db &&
- dict_tables_have_same_db(table->name.m_name,
- foreign->foreign_table_name_lookup)))
- continue;
- mysql_mutex_lock(&dict_foreign_err_mutex);
- rewind(dict_foreign_err_file);
- ut_print_timestamp(dict_foreign_err_file);
- fputs(" Cannot drop table ", dict_foreign_err_file);
- ut_print_name(dict_foreign_err_file, trx, table->name.m_name);
- fputs("\nbecause it is referenced by ", dict_foreign_err_file);
- ut_print_name(dict_foreign_err_file, trx, foreign->foreign_table_name);
- putc('\n', dict_foreign_err_file);
- mysql_mutex_unlock(&dict_foreign_err_mutex);
err= DB_CANNOT_DROP_CONSTRAINT;
goto err_exit;
}
- }
- if (!table->no_rollback())
err= trx->drop_table_foreign(table->name);
+ }
if (err == DB_SUCCESS && table_stats && index_stats)
err= trx->drop_table_statistics(table->name);
@@ -13790,6 +13792,19 @@ int ha_innobase::truncate()
update_thd();
+#ifdef UNIV_DEBUG
+ if (!thd_test_options(m_user_thd, OPTION_NO_FOREIGN_KEY_CHECKS))
+ {
+ /* fk_truncate_illegal_if_parent() should have failed in
+ Sql_cmd_truncate_table::handler_truncate() if foreign_key_checks=ON
+ and child tables exist. */
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (const auto foreign : m_prebuilt->table->referenced_set)
+ ut_ad(foreign->foreign_table == m_prebuilt->table);
+ dict_sys.unfreeze();
+ }
+#endif
+
if (is_read_only())
DBUG_RETURN(HA_ERR_TABLE_READONLY);
@@ -13872,14 +13887,7 @@ int ha_innobase::truncate()
dict_table_t *table_stats = nullptr, *index_stats = nullptr;
MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
- dberr_t error= DB_SUCCESS;
-
- dict_sys.freeze(SRW_LOCK_CALL);
- for (const dict_foreign_t *f : ib_table->referenced_set)
- if (dict_table_t *child= f->foreign_table)
- if ((error= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS)
- break;
- dict_sys.unfreeze();
+ dberr_t error= lock_table_children(ib_table, trx);
if (error == DB_SUCCESS)
error= lock_table_for_trx(ib_table, trx, LOCK_X);
@@ -14070,16 +14078,7 @@ ha_innobase::rename_table(
/* There is no need to lock any FOREIGN KEY child tables. */
} else if (dict_table_t *table = dict_table_open_on_name(
norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) {
- dict_sys.freeze(SRW_LOCK_CALL);
- for (const dict_foreign_t* f : table->referenced_set) {
- if (dict_table_t* child = f->foreign_table) {
- error = lock_table_for_trx(child, trx, LOCK_X);
- if (error != DB_SUCCESS) {
- break;
- }
- }
- }
- dict_sys.unfreeze();
+ error = lock_table_children(table, trx);
if (error == DB_SUCCESS) {
error = lock_table_for_trx(table, trx, LOCK_X);
}
@@ -14218,14 +14217,14 @@ ha_innobase::records_in_range(
dict_index_t* index;
dtuple_t* range_start;
dtuple_t* range_end;
- ha_rows n_rows;
+ ha_rows n_rows = HA_POS_ERROR;
page_cur_mode_t mode1;
page_cur_mode_t mode2;
mem_heap_t* heap;
DBUG_ENTER("records_in_range");
- ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+ ut_ad(m_prebuilt->trx == thd_to_trx(ha_thd()));
m_prebuilt->trx->op_info = "estimating records in index range";
@@ -14238,12 +14237,7 @@ ha_innobase::records_in_range(
/* There exists possibility of not being able to find requested
index due to inconsistency between MySQL and InoDB dictionary info.
Necessary message should have been printed in innobase_get_index() */
- if (!m_prebuilt->table->space) {
- n_rows = HA_POS_ERROR;
- goto func_exit;
- }
- if (!index) {
- n_rows = HA_POS_ERROR;
+ if (!index || !m_prebuilt->table->space) {
goto func_exit;
}
if (index->is_corrupted()) {
@@ -14259,61 +14253,50 @@ ha_innobase::records_in_range(
+ sizeof(dtuple_t)));
range_start = dtuple_create(heap, key->ext_key_parts);
- dict_index_copy_types(range_start, index, key->ext_key_parts);
range_end = dtuple_create(heap, key->ext_key_parts);
- dict_index_copy_types(range_end, index, key->ext_key_parts);
-
- row_sel_convert_mysql_key_to_innobase(
- range_start,
- m_prebuilt->srch_key_val1,
- m_prebuilt->srch_key_val_len,
- index,
- (byte*) (min_key ? min_key->key : (const uchar*) 0),
- (ulint) (min_key ? min_key->length : 0));
-
- DBUG_ASSERT(min_key
- ? range_start->n_fields > 0
- : range_start->n_fields == 0);
-
- row_sel_convert_mysql_key_to_innobase(
- range_end,
- m_prebuilt->srch_key_val2,
- m_prebuilt->srch_key_val_len,
- index,
- (byte*) (max_key ? max_key->key : (const uchar*) 0),
- (ulint) (max_key ? max_key->length : 0));
-
- DBUG_ASSERT(max_key
- ? range_end->n_fields > 0
- : range_end->n_fields == 0);
-
- mode1 = convert_search_mode_to_innobase(
- min_key ? min_key->flag : HA_READ_KEY_EXACT);
-
- mode2 = convert_search_mode_to_innobase(
- max_key ? max_key->flag : HA_READ_KEY_EXACT);
-
- if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
-
- if (dict_index_is_spatial(index)) {
- /*Only min_key used in spatial index. */
- n_rows = rtr_estimate_n_rows_in_range(
- index, range_start, mode1);
- } else {
- btr_pos_t tuple1(range_start, mode1, pages->first_page);
- btr_pos_t tuple2(range_end, mode2, pages->last_page);
- n_rows = btr_estimate_n_rows_in_range(
- index, &tuple1, &tuple2);
- pages->first_page= tuple1.page_id.raw();
- pages->last_page= tuple2.page_id.raw();
- }
+
+ if (!min_key) {
+ mode1 = PAGE_CUR_GE;
+ dtuple_set_n_fields(range_start, 0);
+ } else if (convert_search_mode_to_innobase(min_key->flag, mode1)) {
+ goto unsupported;
} else {
+ dict_index_copy_types(range_start, index, key->ext_key_parts);
+ row_sel_convert_mysql_key_to_innobase(
+ range_start,
+ m_prebuilt->srch_key_val1,
+ m_prebuilt->srch_key_val_len,
+ index, min_key->key, min_key->length);
+ DBUG_ASSERT(range_start->n_fields > 0);
+ }
- n_rows = HA_POS_ERROR;
+ if (!max_key) {
+ mode2 = PAGE_CUR_GE;
+ dtuple_set_n_fields(range_end, 0);
+ } else if (convert_search_mode_to_innobase(max_key->flag, mode2)) {
+ goto unsupported;
+ } else {
+ dict_index_copy_types(range_end, index, key->ext_key_parts);
+ row_sel_convert_mysql_key_to_innobase(
+ range_end,
+ m_prebuilt->srch_key_val2,
+ m_prebuilt->srch_key_val_len,
+ index, max_key->key, max_key->length);
+ DBUG_ASSERT(range_end->n_fields > 0);
}
- mem_heap_free(heap);
+ if (dict_index_is_spatial(index)) {
+ /*Only min_key used in spatial index. */
+ n_rows = rtr_estimate_n_rows_in_range(
+ index, range_start, mode1);
+ } else {
+ btr_pos_t tuple1(range_start, mode1, pages->first_page);
+ btr_pos_t tuple2(range_end, mode2, pages->last_page);
+ n_rows = btr_estimate_n_rows_in_range(index, &tuple1, &tuple2);
+ pages->first_page= tuple1.page_id.raw();
+ pages->last_page= tuple2.page_id.raw();
+ }
DBUG_EXECUTE_IF(
"print_btr_estimate_n_rows_in_range_return_value",
@@ -14324,11 +14307,7 @@ ha_innobase::records_in_range(
(longlong) n_rows);
);
-func_exit:
-
- m_prebuilt->trx->op_info = (char*)"";
-
- /* The MySQL optimizer seems to believe an estimate of 0 rows is
+ /* The MariaDB optimizer seems to believe an estimate of 0 rows is
always accurate and may return the result 'Empty set' based on that.
The accuracy is not guaranteed, and even if it were, for a locking
read we should anyway perform the search to set the next-key lock.
@@ -14338,6 +14317,10 @@ func_exit:
n_rows = 1;
}
+unsupported:
+ mem_heap_free(heap);
+func_exit:
+ m_prebuilt->trx->op_info = "";
DBUG_RETURN((ha_rows) n_rows);
}
@@ -15140,6 +15123,7 @@ ha_innobase::check(
ulint n_rows_in_table = ULINT_UNDEFINED;
bool is_ok = true;
dberr_t ret;
+ uint handler_flags= check_opt->handler_flags;
DBUG_ENTER("ha_innobase::check");
DBUG_ASSERT(thd == ha_thd());
@@ -15148,6 +15132,27 @@ ha_innobase::check(
ut_a(m_prebuilt->trx == thd_to_trx(thd));
ut_ad(m_prebuilt->trx->mysql_thd == thd);
+ if (handler_flags || check_for_upgrade(check_opt)) {
+ /* The file was already checked and fixed as part of open */
+ print_check_msg(thd, table->s->db.str, table->s->table_name.str,
+ "check", "note",
+ (opt_readonly || high_level_read_only
+ || !(check_opt->sql_flags & TT_FOR_UPGRADE))
+ ? "Auto_increment will be"
+ " checked on each open until"
+ " CHECK TABLE FOR UPGRADE is executed"
+ : "Auto_increment checked and"
+ " .frm file version updated", 1);
+ if (handler_flags && (check_opt->sql_flags & TT_FOR_UPGRADE)) {
+ /*
+ No other issues found (as handler_flags was only
+ set if there are no problems with the table other
+ than auto_increment).
+ */
+ DBUG_RETURN(HA_ADMIN_OK);
+ }
+ }
+
if (m_prebuilt->mysql_template == NULL) {
/* Build the template; we will use a dummy template
in index scans done in checking */
@@ -15330,7 +15335,7 @@ ha_innobase::check(
}
/* Restore the original isolation level */
- m_prebuilt->trx->isolation_level = old_isolation_level;
+ m_prebuilt->trx->isolation_level = old_isolation_level & 3;
#ifdef BTR_CUR_HASH_ADAPT
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
/* We validate the whole adaptive hash index for all tables
@@ -15351,6 +15356,35 @@ func_exit:
DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
}
+/**
+Check if there is a problem with the InnoDB table.
+@param check_opt check options
+@retval HA_ADMIN_OK if Table is ok
+@retval HA_ADMIN_NEEDS_ALTER User should run ALTER TABLE FOR UPGRADE
+@retval HA_ADMIN_NEEDS_CHECK User should run CHECK TABLE FOR UPGRADE
+@retval HA_ADMIN_FAILED if InnoDB is read-only while the server is not */
+int ha_innobase::check_for_upgrade(HA_CHECK_OPT *check_opt)
+{
+ /*
+ Check if there is a possibility that the auto increment value
+ stored in PAGE_ROOT_AUTO_INC could be corrupt.
+ */
+ if (table->s->mysql_version >= 100210);
+ else if (const Field *auto_increment= table->found_next_number_field)
+ {
+ uint col_no= innodb_col_no(auto_increment);
+ const dict_col_t *autoinc_col=
+ dict_table_get_nth_col(m_prebuilt->table, col_no);
+ if (m_prebuilt->table->get_index(*autoinc_col))
+ {
+ check_opt->handler_flags= 1;
+ return (high_level_read_only && !opt_readonly)
+ ? HA_ADMIN_FAILED : HA_ADMIN_NEEDS_CHECK;
+ }
+ }
+ return HA_ADMIN_OK;
+}
+
/*******************************************************************//**
Gets the foreign key create info for a table stored in InnoDB.
@return own: character string in the form which can be inserted to the
@@ -15408,7 +15442,6 @@ get_foreign_key_info(
char tmp_buff[NAME_LEN+1];
char name_buff[NAME_LEN+1];
const char* ptr;
- LEX_CSTRING* referenced_key_name;
LEX_CSTRING* name = NULL;
if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) {
@@ -15509,18 +15542,16 @@ get_foreign_key_info(
if (foreign->referenced_index
&& foreign->referenced_index->name != NULL) {
- referenced_key_name = thd_make_lex_string(
+ f_key_info.referenced_key_name = thd_make_lex_string(
thd,
- f_key_info.referenced_key_name,
+ nullptr,
foreign->referenced_index->name,
strlen(foreign->referenced_index->name),
1);
} else {
- referenced_key_name = NULL;
+ f_key_info.referenced_key_name = NULL;
}
- f_key_info.referenced_key_name = referenced_key_name;
-
pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info,
sizeof(FOREIGN_KEY_INFO));
@@ -15806,7 +15837,7 @@ ha_innobase::start_stmt(
}
/* fall through */
default:
- trx->end_bulk_insert(*m_prebuilt->table);
+ trx->bulk_insert_apply_for_table(m_prebuilt->table);
if (!trx->bulk_insert) {
break;
}
@@ -16000,7 +16031,7 @@ ha_innobase::external_lock(
}
/* fall through */
default:
- trx->end_bulk_insert(*m_prebuilt->table);
+ trx->bulk_insert_apply_for_table(m_prebuilt->table);
if (!trx->bulk_insert) {
break;
}
@@ -16370,7 +16401,7 @@ ha_innobase::store_lock(
if (lock_type != TL_IGNORE
&& trx->n_mysql_tables_in_use == 0) {
trx->isolation_level = innobase_map_isolation_level(
- (enum_tx_isolation) thd_tx_isolation(thd));
+ (enum_tx_isolation) thd_tx_isolation(thd)) & 3;
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
@@ -16600,6 +16631,13 @@ ha_innobase::get_auto_increment(
if (error != DB_SUCCESS) {
*first_value = (~(ulonglong) 0);
+ /* This is an error case. We do the error handling by calling
+ the error code conversion function. Specifically, we need to
+ call thd_mark_transaction_to_rollback() to inform sql that we
+ have rolled back innodb transaction after a deadlock error. We
+ ignore the returned mysql error code here. */
+ std::ignore = convert_error_code_to_mysql(
+ error, m_prebuilt->table->flags, m_user_thd);
return;
}
@@ -17535,6 +17573,7 @@ innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save)
{
mtr_t mtr;
uint space_id = *static_cast<const uint*>(save);
+ srv_fil_make_page_dirty_debug= space_id;
mysql_mutex_unlock(&LOCK_global_system_variables);
fil_space_t* space = fil_space_t::get(space_id);
@@ -18261,13 +18300,15 @@ buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
return;
const uint s= srv_fil_make_page_dirty_debug;
mysql_mutex_unlock(&LOCK_global_system_variables);
- if (s)
- buf_flush_sync();
- else
+ if (s == 0 || srv_is_undo_tablespace(s))
{
- while (buf_flush_list_space(fil_system.sys_space, nullptr));
+ fil_space_t *space= fil_system.sys_space;
+ if (s) { space= fil_space_get(s); }
+ while (buf_flush_list_space(space, nullptr));
os_aio_wait_until_no_pending_writes(true);
}
+ else
+ buf_flush_sync();
mysql_mutex_lock(&LOCK_global_system_variables);
}
@@ -18452,7 +18493,7 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
const bool in_progress(buf_pool.get_oldest_modification(LSN_MAX) <
log_sys.resize_in_progress());
if (in_progress)
- my_cond_timedwait(&buf_pool.do_flush_list,
+ my_cond_timedwait(&buf_pool.done_flush_list,
&buf_pool.flush_list_mutex.m_mutex, &abstime);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!log_sys.resize_in_progress())
@@ -18463,6 +18504,15 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
mysql_mutex_lock(&LOCK_global_system_variables);
}
+static void innodb_log_spin_wait_delay_update(THD *, st_mysql_sys_var*,
+ void *, const void *save)
+{
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ mtr_t::spin_wait_delay= *static_cast<const unsigned*>(save);
+ mtr_t::finisher_update();
+ log_sys.latch.wr_unlock();
+}
+
/** Update innodb_status_output or innodb_status_output_locks,
which control InnoDB "status monitor" output to the error log.
@param[out] var current value
@@ -19279,10 +19329,10 @@ static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
NULL, NULL, UNIV_PAGE_SIZE_DEF,
UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0);
-static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size,
+static MYSQL_SYSVAR_UINT(log_buffer_size, log_sys.buf_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log buffer size in bytes.",
- NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096);
+ NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096);
#if defined __linux__ || defined _WIN32
static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
@@ -19297,6 +19347,12 @@ static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
nullptr, innodb_log_file_size_update,
96 << 20, 4 << 20, std::numeric_limits<ulonglong>::max(), 4096);
+static MYSQL_SYSVAR_UINT(log_spin_wait_delay, mtr_t::spin_wait_delay,
+ PLUGIN_VAR_OPCMDARG,
+ "Delay between log buffer spin lock polls (0 to use a blocking latch)",
+ nullptr, innodb_log_spin_wait_delay_update,
+ 0, 0, 6000, 0);
+
static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
PLUGIN_VAR_RQCMDARG,
"Percentage of the buffer pool to reserve for 'old' blocks.",
@@ -19756,6 +19812,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_file_buffering),
#endif
MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_spin_wait_delay),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
@@ -19776,6 +19833,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_server_stopword_table),
MYSQL_SYSVAR(ft_user_stopword_table),
MYSQL_SYSVAR(disable_sort_file_cache),
+ MYSQL_SYSVAR(snapshot_isolation),
MYSQL_SYSVAR(stats_on_metadata),
MYSQL_SYSVAR(stats_transient_sample_pages),
MYSQL_SYSVAR(stats_persistent),
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 1f42bf18..50ac423f 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -209,6 +209,7 @@ public:
int rename_table(const char* from, const char* to) override;
inline int defragment_table();
int check(THD* thd, HA_CHECK_OPT* check_opt) override;
+ int check_for_upgrade(HA_CHECK_OPT* check_opt) override;
inline void reload_statistics();
@@ -909,6 +910,12 @@ unsigned
innodb_col_no(const Field* field)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Get the maximum integer value of a numeric column.
+@param field column definition
+@return maximum allowed integer value */
+ulonglong innobase_get_int_col_max_value(const Field *field)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/********************************************************************//**
Helper function to push frm mismatch error to error log and
if needed to sql-layer. */
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 1401136f..6689b9ef 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -864,6 +864,9 @@ my_error_innodb(
case DB_DEADLOCK:
my_error(ER_LOCK_DEADLOCK, MYF(0));
break;
+ case DB_RECORD_CHANGED:
+ my_error(ER_CHECKREAD, MYF(0), table);
+ break;
case DB_LOCK_WAIT_TIMEOUT:
my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
break;
@@ -1458,11 +1461,6 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
}
};
-/********************************************************************//**
-Get the upper limit of the MySQL integral and floating-point type.
-@return maximum allowed value for the field */
-ulonglong innobase_get_int_col_max_value(const Field *field);
-
/** Determine if fulltext indexes exist in a given table.
@param table MySQL table
@return number of fulltext indexes */
@@ -1730,11 +1728,9 @@ instant_alter_column_possible(
ut_ad(!is_null || nullable);
n_nullable += nullable;
n_add++;
- uint l;
+ uint l = (*af)->pack_length();
switch ((*af)->type()) {
case MYSQL_TYPE_VARCHAR:
- l = reinterpret_cast<const Field_varstring*>
- (*af)->get_length();
variable_length:
if (l >= min_local_len) {
max_size += blob_prefix
@@ -1748,7 +1744,6 @@ instant_alter_column_possible(
if (!is_null) {
min_size += l;
}
- l = (*af)->pack_length();
max_size += l;
lenlen += l > 255 ? 2 : 1;
}
@@ -1762,7 +1757,6 @@ instant_alter_column_possible(
((*af))->get_length();
goto variable_length;
default:
- l = (*af)->pack_length();
if (l > 255 && ib_table.not_redundant()) {
goto variable_length;
}
@@ -2748,6 +2742,9 @@ cannot_create_many_fulltext_index:
online = false;
}
+ static constexpr const char *not_implemented
+ = "Not implemented for system-versioned operations";
+
if (ha_alter_info->handler_flags
& ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) {
/* ADD FULLTEXT|SPATIAL INDEX requires a lock.
@@ -2775,6 +2772,12 @@ cannot_create_many_fulltext_index:
goto cannot_create_many_fulltext_index;
}
+ if (altered_table->versioned()) {
+ ha_alter_info->unsupported_reason
+ = not_implemented;
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
add_fulltext = true;
if (ha_alter_info->online
&& !ha_alter_info->unsupported_reason) {
@@ -2811,12 +2814,18 @@ cannot_create_many_fulltext_index:
}
}
+ if (m_prebuilt->table->is_stats_table()) {
+ if (ha_alter_info->online) {
+ ha_alter_info->unsupported_reason =
+ table_share->table_name.str;
+ }
+ online= false;
+ }
+
// FIXME: implement Online DDL for system-versioned operations
if (ha_alter_info->handler_flags & INNOBASE_ALTER_VERSIONED_REBUILD) {
-
if (ha_alter_info->online) {
- ha_alter_info->unsupported_reason =
- "Not implemented for system-versioned operations";
+ ha_alter_info->unsupported_reason = not_implemented;
}
online = false;
@@ -7444,6 +7453,7 @@ error_handled:
row_mysql_lock_data_dictionary(ctx->trx);
} else {
row_merge_drop_indexes(ctx->trx, user_table, true);
+ user_table->indexes.start->online_log = nullptr;
ctx->trx->commit();
}
@@ -9865,13 +9875,7 @@ commit_set_autoinc(
const dict_col_t* autoinc_col
= dict_table_get_nth_col(ctx->old_table,
innodb_col_no(ai));
- dict_index_t* index
- = dict_table_get_first_index(ctx->old_table);
- while (index != NULL
- && index->fields[0].col != autoinc_col) {
- index = dict_table_get_next_index(index);
- }
-
+ auto index = ctx->old_table->get_index(*autoinc_col);
ut_ad(index);
ib_uint64_t max_in_table = index
@@ -10246,6 +10250,7 @@ when rebuilding the table.
@param ctx In-place ALTER TABLE context
@param altered_table MySQL table that is being altered
@param old_table MySQL table as it is before the ALTER operation
+@param statistics_exist whether to update InnoDB persistent statistics
@param trx Data dictionary transaction
@param table_name Table name in MySQL
@retval true Failure
@@ -10519,6 +10524,7 @@ when not rebuilding the table.
@param ha_alter_info Data used during in-place alter
@param ctx In-place ALTER TABLE context
@param old_table MySQL table as it is before the ALTER operation
+@param statistics_exist whether to update InnoDB persistent statistics
@param trx Data dictionary transaction
@param table_name Table name in MySQL
@retval true Failure
@@ -10532,6 +10538,7 @@ commit_try_norebuild(
ha_innobase_inplace_ctx*ctx,
TABLE* altered_table,
const TABLE* old_table,
+ bool statistics_exist,
trx_t* trx,
const char* table_name)
{
@@ -10646,6 +10653,10 @@ commit_try_norebuild(
goto handle_error;
}
+ if (!statistics_exist) {
+ continue;
+ }
+
error = dict_stats_delete_from_index_stats(db, table,
index->name, trx);
switch (error) {
@@ -10657,7 +10668,8 @@ commit_try_norebuild(
}
}
- if (const size_t size = ha_alter_info->rename_keys.size()) {
+ if (!statistics_exist) {
+ } else if (const size_t size = ha_alter_info->rename_keys.size()) {
char tmp_name[5];
char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
@@ -11224,16 +11236,7 @@ ha_innobase::commit_inplace_alter_table(
fts_optimize_remove_table(ctx->old_table);
}
- dict_sys.freeze(SRW_LOCK_CALL);
- for (auto f : ctx->old_table->referenced_set) {
- if (dict_table_t* child = f->foreign_table) {
- error = lock_table_for_trx(child, trx, LOCK_X);
- if (error != DB_SUCCESS) {
- break;
- }
- }
- }
- dict_sys.unfreeze();
+ error = lock_table_children(ctx->old_table, trx);
if (ctx->new_table->fts) {
ut_ad(!ctx->new_table->fts->add_wq);
@@ -11413,6 +11416,8 @@ err_index:
}
}
+ DEBUG_SYNC(m_user_thd, "innodb_commit_inplace_before_lock");
+
DBUG_EXECUTE_IF("stats_lock_fail",
error = DB_LOCK_WAIT_TIMEOUT;
trx_rollback_for_mysql(trx););
@@ -11496,7 +11501,9 @@ fail:
goto fail;
}
} else if (commit_try_norebuild(ha_alter_info, ctx,
- altered_table, table, trx,
+ altered_table, table,
+ table_stats && index_stats,
+ trx,
table_share->table_name.str)) {
goto fail;
}
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index 4ec07b81..e77401ed 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -24,6 +24,7 @@ Insert buffer
Created 7/19/1997 Heikki Tuuri
*******************************************************/
+#include <tuple>
#include "ibuf0ibuf.h"
#include "btr0sea.h"
@@ -1833,7 +1834,7 @@ corrupted:
err = flst_add_last(ibuf_root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- &mtr);
+ fil_system.sys_space->free_limit, &mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
goto corrupted;
}
@@ -1864,7 +1865,6 @@ Removes a page from the free list and frees it to the fsp system. */
static void ibuf_remove_free_page()
{
mtr_t mtr;
- mtr_t mtr2;
page_t* header_page;
log_free_check();
@@ -1891,26 +1891,28 @@ early_exit:
return;
}
- ibuf_mtr_start(&mtr2);
-
- buf_block_t* root = ibuf_tree_root_get(&mtr2);
+ buf_block_t* root = ibuf_tree_root_get(&mtr);
if (UNIV_UNLIKELY(!root)) {
- ibuf_mtr_commit(&mtr2);
goto early_exit;
}
- mysql_mutex_unlock(&ibuf_mutex);
-
+ const auto root_savepoint = mtr.get_savepoint() - 1;
const uint32_t page_no = flst_get_last(PAGE_HEADER
+ PAGE_BTR_IBUF_FREE_LIST
+ root->page.frame).page;
+ if (page_no >= fil_system.sys_space->free_limit) {
+ goto early_exit;
+ }
+
+ mysql_mutex_unlock(&ibuf_mutex);
+
/* NOTE that we must release the latch on the ibuf tree root
because in fseg_free_page we access level 1 pages, and the root
is a level 2 page. */
-
- ibuf_mtr_commit(&mtr2);
+ root->page.lock.u_unlock();
+ mtr.lock_register(root_savepoint, MTR_MEMO_BUF_FIX);
ibuf_exit(&mtr);
/* Since pessimistic inserts were prevented, we know that the
@@ -1933,15 +1935,7 @@ early_exit:
ibuf_enter(&mtr);
mysql_mutex_lock(&ibuf_mutex);
-
- root = ibuf_tree_root_get(&mtr, &err);
- if (UNIV_UNLIKELY(!root)) {
- mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
- goto func_exit;
- }
-
- ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
- + root->page.frame).page);
+ mtr.upgrade_buffer_fix(root_savepoint, RW_X_LATCH);
/* Remove the page from the free list and update the ibuf size data */
if (buf_block_t* block =
@@ -1950,7 +1944,7 @@ early_exit:
err = flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
block,
PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- &mtr);
+ fil_system.sys_space->free_limit, &mtr);
}
mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
@@ -2332,7 +2326,8 @@ func_exit:
/** Merge the change buffer to some pages. */
static void ibuf_read_merge_pages(const uint32_t* space_ids,
- const uint32_t* page_nos, ulint n_stored)
+ const uint32_t* page_nos, ulint n_stored,
+ bool slow_shutdown_cleanup)
{
for (ulint i = 0; i < n_stored; i++) {
const uint32_t space_id = space_ids[i];
@@ -2357,30 +2352,28 @@ tablespace_deleted:
if (UNIV_LIKELY(page_nos[i] < size)) {
mtr.start();
dberr_t err;
- buf_block_t *block =
+ /* Load the page and apply change buffers. */
+ std::ignore =
buf_page_get_gen(page_id_t(space_id, page_nos[i]),
zip_size, RW_X_LATCH, nullptr,
BUF_GET_POSSIBLY_FREED,
&mtr, &err, true);
- bool remove = !block
- || fil_page_get_type(block->page.frame)
- != FIL_PAGE_INDEX
- || !page_is_leaf(block->page.frame);
mtr.commit();
if (err == DB_TABLESPACE_DELETED) {
s->x_unlock();
goto tablespace_deleted;
}
- if (!remove) {
- s->x_unlock();
- continue;
- }
}
s->x_unlock();
- if (srv_shutdown_state == SRV_SHUTDOWN_NONE
- || srv_fast_shutdown) {
+ /* During slow shutdown cleanup, we apply all pending IBUF
+ changes and need to clean up any left-over IBUF records. There
+ are a few cases where the changes are already discarded and the
+ IBUF bitmap is cleaned but we fail to delete the record. Even after
+ the issues are fixed, we need to keep this safety measure for
+ upgraded DBs with such left-over records. */
+ if (!slow_shutdown_cleanup) {
continue;
}
@@ -2451,7 +2444,7 @@ ATTRIBUTE_COLD ulint ibuf_contract()
space_ids, page_nos, &n_pages);
ibuf_mtr_commit(&mtr);
- ibuf_read_merge_pages(space_ids, page_nos, n_pages);
+ ibuf_read_merge_pages(space_ids, page_nos, n_pages, true);
return(sum_sizes + 1);
}
@@ -2532,7 +2525,7 @@ ibuf_merge_space(
}
#endif /* UNIV_DEBUG */
- ibuf_read_merge_pages(spaces, pages, n_pages);
+ ibuf_read_merge_pages(spaces, pages, n_pages, false);
}
return(n_pages);
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index b42c543c..83bdaa97 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -189,13 +189,16 @@ btr_read_autoinc(dict_index_t* index)
/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
or fall back to MAX(auto_increment_column).
-@param[in] table table containing an AUTO_INCREMENT column
-@param[in] col_no index of the AUTO_INCREMENT column
-@return the AUTO_INCREMENT value
-@retval 0 on error or if no AUTO_INCREMENT value was used yet */
-ib_uint64_t
-btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@param table table containing an AUTO_INCREMENT column
+@param col_no index of the AUTO_INCREMENT column
+@param mysql_version TABLE_SHARE::mysql_version
+@param max the maximum value of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+uint64_t btr_read_autoinc_with_fallback(const dict_table_t *table,
+ unsigned col_no, ulong mysql_version,
+ uint64_t max)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
@param[in,out] index clustered index
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index c66a3bfa..5f84328d 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -28,7 +28,6 @@ Created 2/23/1996 Heikki Tuuri
#include "dict0dict.h"
#include "btr0cur.h"
-#include "buf0block_hint.h"
#include "btr0btr.h"
#include "gis0rtree.h"
@@ -332,8 +331,8 @@ struct btr_pcur_t
/** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
whether cursor was on, before, or after the old_rec record */
btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
- /** buffer block when the position was stored */
- buf::Block_hint block_when_stored;
+ /** the page identifier of old_rec */
+ page_id_t old_page_id{0,0};
/** the modify clock value of the buffer block when the cursor position
was stored */
ib_uint64_t modify_clock= 0;
@@ -432,7 +431,8 @@ btr_pcur_open(
}
/** Open a cursor on the first user record satisfying the search condition;
-in case of no match, after the last index record. */
+in case of no match, after the last index record.
+@return DB_SUCCESS or error code */
MY_ATTRIBUTE((nonnull, warn_unused_result))
inline
dberr_t
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
deleted file mode 100644
index d4fee7c1..00000000
--- a/storage/innobase/include/buf0block_hint.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*****************************************************************************
-
-Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, MariaDB Corporation.
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License, version 2.0, as published by the
-Free Software Foundation.
-
-This program is also distributed with certain software (including but not
-limited to OpenSSL) that is licensed under separate terms, as designated in a
-particular file or component or in included license documentation. The authors
-of MySQL hereby grant you an additional permission to link the program and
-your derivative works with the separately licensed software that they have
-included with MySQL.
-
-This program is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
-for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-
-*****************************************************************************/
-#pragma once
-#include "buf0buf.h"
-
-namespace buf {
-class Block_hint {
-public:
- /** Stores the pointer to the block, which is currently buffer-fixed.
- @param block a pointer to a buffer-fixed block to be stored */
- inline void store(buf_block_t *block)
- {
- ut_ad(block->page.buf_fix_count());
- m_block= block;
- m_page_id= block->page.id();
- }
-
- /** Clears currently stored pointer. */
- inline void clear() { m_block= nullptr; }
-
- /** Invoke f on m_block(which may be null)
- @param f The function to be executed. It will be passed the pointer.
- If you wish to use the block pointer subsequently,
- you need to ensure you buffer-fix it before returning from f.
- @return the return value of f
- */
- template <typename F>
- bool run_with_hint(const F &f)
- {
- buffer_fix_block_if_still_valid();
- /* m_block could be changed during f() call, so we use local
- variable to remember which block we need to unfix */
- buf_block_t *block= m_block;
- bool res= f(block);
- if (block)
- block->page.unfix();
- return res;
- }
-
- buf_block_t *block() const { return m_block; }
-
- private:
- /** The block pointer stored by store(). */
- buf_block_t *m_block= nullptr;
- /** If m_block is non-null, the m_block->page.id at time it was stored. */
- page_id_t m_page_id{0, 0};
-
- /** A helper function which checks if m_block is not a dangling pointer and
- still points to block with page with m_page_id and if so, buffer-fixes it,
- otherwise clear()s it */
- void buffer_fix_block_if_still_valid();
-};
-} // namespace buf
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index cd7cc294..c291615c 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -158,14 +158,25 @@ buf_block_free(
#define buf_page_get(ID, SIZE, LA, MTR) \
buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR)
-/** Try to acquire a page latch.
-@param rw_latch RW_S_LATCH or RW_X_LATCH
+/** Try to buffer-fix a page.
@param block guessed block
+@param id expected block->page.id()
+@return block if it was buffer-fixed
+@retval nullptr if the block no longer is valid */
+buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to acquire a page latch after buf_page_optimistic_fix().
+@param block buffer-fixed block
+@param rw_latch RW_S_LATCH or RW_X_LATCH
@param modify_clock expected value of block->modify_clock
@param mtr mini-transaction
-@return whether the latch was acquired (the page is an allocated file page) */
-bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
- uint64_t modify_clock, mtr_t *mtr);
+@return block if the latch was acquired
+@retval nullptr if block->unfix() was called because it no longer is valid */
+buf_block_t *buf_page_optimistic_get(buf_block_t *block,
+ rw_lock_type_t rw_latch,
+ uint64_t modify_clock, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Try to S-latch a page.
Suitable for using when holding the lock_sys latches (as it avoids deadlock).
@@ -292,15 +303,6 @@ void
buf_block_modify_clock_inc(
/*=======================*/
buf_block_t* block); /*!< in: block */
-/********************************************************************//**
-Returns the value of the modify clock. The caller must have an s-lock
-or x-lock on the block.
-@return value */
-UNIV_INLINE
-ib_uint64_t
-buf_block_get_modify_clock(
-/*=======================*/
- buf_block_t* block); /*!< in: block */
#endif /* !UNIV_INNOCHECKSUM */
/** Check if a buffer is all zeroes.
@@ -771,17 +773,16 @@ public:
@retval DB_FAIL if the page contains the wrong ID */
dberr_t read_complete(const fil_node_t &node);
- /** Note that a block is no longer dirty, while not removing
- it from buf_pool.flush_list
- @param temporary whether the page belongs to the temporary tablespace
- @param error whether an error may have occurred while writing */
- inline void write_complete(bool temporary, bool error);
+ /** Release a write fix after a page write was completed.
+ @param persistent whether the page belongs to a persistent tablespace
+ @param error whether an error may have occurred while writing
+ @param state recently read state() value with the correct io-fix */
+ void write_complete(bool persistent, bool error, uint32_t state);
/** Write a flushable page to a file or free a freeable block.
- @param evict whether to evict the page on write completion
@param space tablespace
@return whether a page write was initiated and buf_pool.mutex released */
- bool flush(bool evict, fil_space_t *space);
+ bool flush(fil_space_t *space);
/** Notify that a page in a temporary tablespace has been modified. */
void set_temp_modified()
@@ -1756,10 +1757,6 @@ public:
/** Decrement the number of pending LRU flush */
inline void n_flush_dec();
- /** Decrement the number of pending LRU flush
- while holding flush_list_mutex */
- inline void n_flush_dec_holding_mutex();
-
/** @return whether flush_list flushing is active */
bool flush_list_active() const
{
@@ -1912,6 +1909,9 @@ public:
/** Free a page whose underlying file page has been freed. */
ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept;
+ /** Issue a warning that we could not free up buffer pool pages. */
+ ATTRIBUTE_COLD void LRU_warn();
+
private:
/** Temporary memory for page_compressed and encrypted I/O */
struct io_buf_t
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
index b3158cf1..050c8493 100644
--- a/storage/innobase/include/buf0buf.inl
+++ b/storage/innobase/include/buf0buf.inl
@@ -116,17 +116,3 @@ buf_block_modify_clock_inc(
block->modify_clock++;
}
-
-/********************************************************************//**
-Returns the value of the modify clock. The caller must have an s-lock
-or x-lock on the block.
-@return value */
-UNIV_INLINE
-ib_uint64_t
-buf_block_get_modify_clock(
-/*=======================*/
- buf_block_t* block) /*!< in: block */
-{
- ut_ad(block->page.lock.have_any());
- return(block->modify_clock);
-}
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index 0cce514b..cc32a38a 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -85,16 +85,6 @@ buf_flush_init_for_writing(
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result));
-/** Write out dirty blocks from buf_pool.LRU,
-and move clean blocks to buf_pool.free.
-The caller must invoke buf_dblwr.flush_buffered_writes()
-after releasing buf_pool.mutex.
-@param max_n wished maximum mumber of blocks flushed
-@param evict whether to evict pages after flushing
-@return evict ? number of processed pages : number of pages written
-@retval 0 if a buf_pool.LRU batch is already running */
-ulint buf_flush_LRU(ulint max_n, bool evict);
-
/** Wait until a LRU flush batch ends. */
void buf_flush_wait_LRU_batch_end();
/** Wait until all persistent pages are flushed up to a limit.
diff --git a/storage/innobase/include/cache.h b/storage/innobase/include/cache.h
new file mode 100644
index 00000000..0647cbe6
--- /dev/null
+++ b/storage/innobase/include/cache.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+
+Copyright (c) 2024, MariaDB plc
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <cstddef>
+
+#if defined __x86_64__ || defined __aarch64__ || defined __powerpc64__
+struct pmem_control /* holds the function pointer behind the pmem_persist() macro */
+{
+ void (*persist)(const void *, size_t); /* called as persist(buf, size); NOTE(review): presumably makes that byte range durable - confirm in the implementation */
+public: /* NOTE(review): redundant, struct members are public by default */
+ pmem_control(); /* NOTE(review): defined elsewhere; presumably selects the persist implementation - confirm */
+};
+extern const pmem_control pmem; /* instance dereferenced by the pmem_persist() macro below */
+# define pmem_persist(buf, size) pmem.persist(buf, size)
+#else
+void pmem_persist(const void *buf, size_t size); /* on other targets pmem_persist() is a plain function */
+#endif
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
index a5356e0d..fcb543eb 100644
--- a/storage/innobase/include/data0data.h
+++ b/storage/innobase/include/data0data.h
@@ -339,15 +339,12 @@ dtuple_set_types_binary(
dtuple_t* tuple, /*!< in: data tuple */
ulint n) /*!< in: number of fields to set */
MY_ATTRIBUTE((nonnull));
-/**********************************************************************//**
-Checks if a dtuple contains an SQL null value.
-@return TRUE if some field is SQL null */
+/** Checks if a dtuple contains an SQL null value.
+@param tuple tuple
+@param fields_number number of leading fields to check, or 0 to check all fields
+@return true if some field is SQL null */
UNIV_INLINE
-ibool
-dtuple_contains_null(
-/*=================*/
- const dtuple_t* tuple) /*!< in: dtuple */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+bool dtuple_contains_null(const dtuple_t *tuple, ulint fields_number = 0);
/**********************************************************//**
Checks that a data field is typed. Asserts an error if not.
@return TRUE if ok */
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
index 2d1bf5a2..b6c6ace8 100644
--- a/storage/innobase/include/data0data.inl
+++ b/storage/innobase/include/data0data.inl
@@ -596,28 +596,18 @@ data_write_sql_null(
memset(data, 0, len);
}
-/**********************************************************************//**
-Checks if a dtuple contains an SQL null value.
-@return TRUE if some field is SQL null */
+/** Checks if a dtuple contains an SQL null value.
+@param tuple tuple
+@param fields_number number of leading fields to check, or 0 to check all fields
+@return true if some field is SQL null */
UNIV_INLINE
-ibool
-dtuple_contains_null(
-/*=================*/
- const dtuple_t* tuple) /*!< in: dtuple */
+bool dtuple_contains_null(const dtuple_t *tuple, ulint fields_number)
{
- ulint n;
- ulint i;
-
- n = dtuple_get_n_fields(tuple);
-
- for (i = 0; i < n; i++) {
- if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
-
- return(TRUE);
- }
- }
-
- return(FALSE);
+ ulint n= fields_number ? fields_number : dtuple_get_n_fields(tuple);
+ for (ulint i= 0; i < n; i++)
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i)))
+ return true;
+ return false;
}
/**************************************************************//**
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
index 64182aab..960ec390 100644
--- a/storage/innobase/include/db0err.h
+++ b/storage/innobase/include/db0err.h
@@ -32,23 +32,25 @@ Created 5/24/1996 Heikki Tuuri
enum dberr_t {
DB_SUCCESS,
- DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ DB_SUCCESS_LOCKED_REC= 9, /*!< like DB_SUCCESS, but a new
explicit record lock was created */
/* The following are error codes */
- DB_ERROR = 11,
+ DB_RECORD_CHANGED,
+ DB_ERROR,
DB_INTERRUPTED,
DB_OUT_OF_MEMORY,
DB_OUT_OF_FILE_SPACE,
DB_LOCK_WAIT,
DB_DEADLOCK,
- DB_ROLLBACK,
DB_DUPLICATE_KEY,
DB_MISSING_HISTORY, /*!< required history data has been
deleted due to lack of space in
rollback segment */
- DB_CLUSTER_NOT_FOUND = 30,
- DB_TABLE_NOT_FOUND,
+#ifdef WITH_WSREP
+ DB_ROLLBACK,
+#endif
+ DB_TABLE_NOT_FOUND= 31,
DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
on a compressed page, or it would
become bigger than 1/2 free space in
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 5fafb2c5..3baac658 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -35,6 +35,7 @@ Created 1/8/1996 Heikki Tuuri
#include <my_sys.h>
#include <deque>
+class MDL_context;
class MDL_ticket;
/** the first table or index ID for other than hard-coded system tables */
@@ -139,6 +140,21 @@ dict_acquire_mdl_shared(dict_table_t *table,
MDL_ticket **mdl,
dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
+/** Acquire MDL shared for the table name.
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] mdl_context MDL context
+@param[out] mdl MDL ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+__attribute__((nonnull, warn_unused_result))
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ MDL_context *mdl_context, MDL_ticket **mdl,
+ dict_table_op_t table_op);
+
/** Look up a table by numeric identifier.
@param[in] table_id table identifier
@param[in] dict_locked data dictionary locked
@@ -1314,13 +1330,7 @@ class dict_sys_t
std::atomic<ulonglong> latch_ex_wait_start;
/** the rw-latch protecting the data dictionary cache */
- alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
-#ifdef UNIV_DEBUG
- /** whether latch is being held in exclusive mode (by any thread) */
- Atomic_relaxed<pthread_t> latch_ex;
- /** number of S-latch holders */
- Atomic_counter<uint32_t> latch_readers;
-#endif
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) IF_DBUG(srw_lock_debug,srw_lock) latch;
public:
/** Indexes of SYS_TABLE[] */
enum
@@ -1488,15 +1498,12 @@ public:
}
#ifdef UNIV_DEBUG
- /** @return whether any thread (not necessarily the current thread)
- is holding the latch; that is, this check may return false
- positives */
- bool frozen() const { return latch_readers || latch_ex; }
- /** @return whether any thread (not necessarily the current thread)
- is holding a shared latch */
- bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread is holding the latch */
+ bool frozen() const { return latch.have_any(); }
+ /** @return whether the current thread is holding a shared latch */
+ bool frozen_not_locked() const { return latch.have_rd(); }
/** @return whether the current thread holds the exclusive latch */
- bool locked() const { return latch_ex == pthread_self(); }
+ bool locked() const { return latch.have_wr(); }
#endif
private:
/** Acquire the exclusive latch */
@@ -1511,13 +1518,7 @@ public:
/** Exclusively lock the dictionary cache. */
void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
{
- if (latch.wr_lock_try())
- {
- ut_ad(!latch_readers);
- ut_ad(!latch_ex);
- ut_d(latch_ex= pthread_self());
- }
- else
+ if (!latch.wr_lock_try())
lock_wait(SRW_LOCK_ARGS(file, line));
}
@@ -1530,27 +1531,11 @@ public:
ATTRIBUTE_NOINLINE void unfreeze();
#else
/** Unlock the data dictionary cache. */
- void unlock()
- {
- ut_ad(latch_ex == pthread_self());
- ut_ad(!latch_readers);
- ut_d(latch_ex= 0);
- latch.wr_unlock();
- }
+ void unlock() { latch.wr_unlock(); }
/** Acquire a shared lock on the dictionary cache. */
- void freeze()
- {
- latch.rd_lock();
- ut_ad(!latch_ex);
- ut_d(latch_readers++);
- }
+ void freeze() { latch.rd_lock(); }
/** Release a shared lock on the dictionary cache. */
- void unfreeze()
- {
- ut_ad(!latch_ex);
- ut_ad(latch_readers--);
- latch.rd_unlock();
- }
+ void unfreeze() { latch.rd_unlock(); }
#endif
/** Estimate the used memory occupied by the data dictionary
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index fde2a714..0268a280 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -1010,8 +1010,6 @@ struct dict_index_t {
/*!< number of columns the user defined to
be in the index: in the internal
representation we add more columns */
- unsigned nulls_equal:1;
- /*!< if true, SQL NULL == SQL NULL */
unsigned n_uniq:10;/*!< number of fields from the beginning
which are enough to determine an index
entry uniquely */
@@ -2448,6 +2446,9 @@ public:
/** @return number of unique columns in FTS_DOC_ID index */
unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+ /** @return the index that starts with a specific column */
+ dict_index_t *get_index(const dict_col_t &col) const;
+
/** Create metadata.
@param name table name
@param space tablespace
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
index d60ee5d9..edb7cf92 100644
--- a/storage/innobase/include/dict0mem.inl
+++ b/storage/innobase/include/dict0mem.inl
@@ -63,6 +63,5 @@ dict_mem_fill_index_struct(
& index->MAX_N_FIELDS;
/* The '1 +' above prevents allocation
of an empty mem block */
- index->nulls_equal = false;
ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
}
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
index 06af4dcc..7a4e6760 100644
--- a/storage/innobase/include/dyn0buf.h
+++ b/storage/innobase/include/dyn0buf.h
@@ -57,11 +57,7 @@ public:
/**
Gets the number of used bytes in a block.
@return number of bytes used */
- ulint used() const
- MY_ATTRIBUTE((warn_unused_result))
- {
- return(static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG));
- }
+ uint32_t used() const { return m_used; }
/**
Gets pointer to the start of data.
@@ -153,8 +149,7 @@ public:
/** Storage */
byte m_data[MAX_DATA_SIZE];
- /** number of data bytes used in this block;
- DYN_BLOCK_FULL_FLAG is set when the block becomes full */
+ /** number of data bytes used in this block */
uint32_t m_used;
friend class mtr_buf_t;
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
index 83d0b0d6..af7f663d 100644
--- a/storage/innobase/include/dyn0types.h
+++ b/storage/innobase/include/dyn0types.h
@@ -33,7 +33,4 @@ Created 2013-03-16 Sunny Bains
/** This is the initial 'payload' size of a dynamic array */
#define DYN_ARRAY_DATA_SIZE 512
-/** Flag for dyn_block_t::used that indicates a full block */
-#define DYN_BLOCK_FULL_FLAG 0x1000000UL
-
#endif /* dyn0types_h */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index cdc32515..dfda1178 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -63,7 +63,7 @@ enum srv_flush_t
SRV_LITTLESYNC,
/** do not flush after writing */
SRV_NOSYNC,
- /** invoke os_file_set_nocache() on data files. This implies using
+ /** Open or create files with O_DIRECT. This implies using
unbuffered I/O but still fdatasync(), because some filesystems might
not flush meta-data on write completion */
SRV_O_DIRECT,
@@ -347,7 +347,6 @@ struct fil_space_t final
~fil_space_t()
{
ut_ad(!latch_owner);
- ut_ad(!latch_count);
latch.destroy();
}
@@ -411,9 +410,9 @@ private:
/** The reference count */
static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
/** latch protecting all page allocation bitmap pages */
- srw_lock latch;
+ IF_DBUG(srw_lock_debug, srw_lock) latch;
+ /** the thread that holds the exclusive latch, or 0 */
pthread_t latch_owner;
- ut_d(Atomic_relaxed<uint32_t> latch_count;)
public:
/** MariaDB encryption data */
fil_space_crypt_t *crypt_data;
@@ -1004,40 +1003,32 @@ public:
bool recheck, bool encrypt);
#ifdef UNIV_DEBUG
- bool is_latched() const { return latch_count != 0; }
+ bool is_latched() const { return latch.have_any(); }
#endif
- bool is_owner() const { return latch_owner == pthread_self(); }
+ bool is_owner() const
+ {
+ const bool owner{latch_owner == pthread_self()};
+ ut_ad(owner == latch.have_wr());
+ return owner;
+ }
/** Acquire the allocation latch in exclusive mode */
void x_lock()
{
latch.wr_lock(SRW_LOCK_CALL);
ut_ad(!latch_owner);
latch_owner= pthread_self();
- ut_ad(!latch_count.fetch_add(1));
}
/** Release the allocation latch from exclusive mode */
void x_unlock()
{
- ut_ad(latch_count.fetch_sub(1) == 1);
ut_ad(latch_owner == pthread_self());
latch_owner= 0;
latch.wr_unlock();
}
/** Acquire the allocation latch in shared mode */
- void s_lock()
- {
- ut_ad(!is_owner());
- latch.rd_lock(SRW_LOCK_CALL);
- ut_ad(!latch_owner);
- ut_d(latch_count.fetch_add(1));
- }
+ void s_lock() { latch.rd_lock(SRW_LOCK_CALL); }
/** Release the allocation latch from shared mode */
- void s_unlock()
- {
- ut_ad(latch_count.fetch_sub(1));
- ut_ad(!latch_owner);
- latch.rd_unlock();
- }
+ void s_unlock() { latch.rd_unlock(); }
typedef span<const char> name_type;
@@ -1637,17 +1628,34 @@ void fil_close_tablespace(uint32_t id);
/*******************************************************************//**
Allocates and builds a file name from a path, a table or tablespace name
and a suffix. The string must be freed by caller with ut_free().
-@param[in] path NULL or the directory path or the full path and filename.
+@param[in] path nullptr or the directory path or the full path and filename
@param[in] name {} if path is full, or Table/Tablespace name
-@param[in] ext the file extension to use
-@param[in] trim_name true if the last name on the path should be trimmed.
+@param[in] extension the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed
@return own: file name */
-char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
- ib_extention ext, bool trim_name);
+char* fil_make_filepath_low(const char *path,
+ const fil_space_t::name_type &name,
+ ib_extention extension, bool trim_name);
char *fil_make_filepath(const char* path, const table_name_t name,
ib_extention suffix, bool strip_name);
+/** Wrapper over fil_make_filepath_low() that asserts the trim_name precondition.
+@param path nullptr or the directory path or the full path and filename
+@param name {} if path is full, or Table/Tablespace name
+@param extension the file extension to use
+@param trim_name true if the last name on the path should be trimmed
+@return own: file name; the caller must free it with ut_free() */
+static inline char*
+fil_make_filepath(const char* path, const fil_space_t::name_type &name,
+ ib_extention extension, bool trim_name)
+{
+ /* If we are going to strip a name off the path, there had better be
+ both a path and a new name to put back on. */
+ ut_ad(!trim_name || (path && name.data()));
+ return fil_make_filepath_low(path, name, extension, trim_name);
+}
+
/** Create a tablespace file.
@param[in] space_id Tablespace ID
@param[in] name Tablespace name in dbname/tablename format.
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 26261554..99459bcb 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -209,24 +209,6 @@ typedef byte fseg_inode_t;
static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2};
-#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved
- pages in a segment is less than
- reserved pages / FSEG_FILLFACTOR,
- and there are
- at least FSEG_FRAG_LIMIT used pages,
- then we allow a new empty extent to
- be added to the segment in
- fseg_alloc_free_page_general().
- Otherwise, we
- use unused pages of the segment. */
-
-#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
- /* If the segment has >= this many
- used pages, it may be expanded by
- allocating extents to the segment;
- until that only individual fragment
- pages are allocated from the space */
-
#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment
is at least this many extents, we
allow extents to be put to the free
@@ -294,7 +276,7 @@ Determine if a page is marked free.
@param[in] descr extent descriptor
@param[in] offset page offset within extent
@return whether the page is free */
-inline bool xdes_is_free(const xdes_t *descr, ulint offset)
+inline bool xdes_is_free(const xdes_t *descr, uint32_t offset)
{
ut_ad(offset < FSP_EXTENT_SIZE);
ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset;
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index c0151b44..1d2b409b 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -163,6 +163,9 @@ struct fts_token_t;
struct fts_doc_ids_t;
struct fts_index_cache_t;
+/** Compare two DOC_ID. */
+int fts_doc_id_cmp(const void *p1, const void *p2)
+ __attribute__((nonnull, warn_unused_result));
/** Initialize the "fts_table" for internal query into FTS auxiliary
tables */
@@ -412,6 +415,9 @@ inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids)
mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg));
}
+/** Sort an array of doc_id */
+void fts_doc_ids_sort(ib_vector_t *doc_ids);
+
/******************************************************************//**
Notify the FTS system about an operation on an FTS-indexed table. */
void
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
index ae0bb036..04faceb9 100644
--- a/storage/innobase/include/fts0priv.h
+++ b/storage/innobase/include/fts0priv.h
@@ -271,27 +271,6 @@ fts_index_fetch_nodes(
word, /*!< in: the word to fetch */
fts_fetch_t* fetch) /*!< in: fetch callback.*/
MY_ATTRIBUTE((nonnull));
-/******************************************************************//**
-Compare two fts_trx_table_t instances, we actually compare the
-table id's here.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_trx_table_cmp(
-/*==============*/
- const void* v1, /*!< in: id1 */
- const void* v2) /*!< in: id2 */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-/******************************************************************//**
-Compare a table id with a trx_table_t table id.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_trx_table_id_cmp(
-/*=================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
#define fts_sql_rollback(trx) (trx)->rollback()
/******************************************************************//**
diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl
index 3cb09c92..3d937bb3 100644
--- a/storage/innobase/include/fts0priv.inl
+++ b/storage/innobase/include/fts0priv.inl
@@ -52,47 +52,3 @@ fts_read_object_id(
if the id is HEX or DEC and do the right thing with it. */
return(sscanf(str, UINT64PFx, id) == 1);
}
-
-/******************************************************************//**
-Compare two fts_trx_table_t instances.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_trx_table_cmp(
-/*==============*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const dict_table_t* table1
- = (*static_cast<const fts_trx_table_t* const*>(p1))->table;
-
- const dict_table_t* table2
- = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
-
- return((table1->id > table2->id)
- ? 1
- : (table1->id == table2->id)
- ? 0
- : -1);
-}
-
-/******************************************************************//**
-Compare a table id with a fts_trx_table_t table id.
-@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_trx_table_id_cmp(
-/*=================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const uintmax_t* table_id = static_cast<const uintmax_t*>(p1);
- const dict_table_t* table2
- = (*static_cast<const fts_trx_table_t* const*>(p2))->table;
-
- return((*table_id > table2->id)
- ? 1
- : (*table_id == table2->id)
- ? 0
- : -1);
-}
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
index fb278d54..7b95348b 100644
--- a/storage/innobase/include/fts0types.h
+++ b/storage/innobase/include/fts0types.h
@@ -278,44 +278,6 @@ struct fts_token_t {
extern const fts_index_selector_t fts_index_selector[];
/******************************************************************//**
-Compare two fts_trx_row_t instances doc_ids. */
-UNIV_INLINE
-int
-fts_trx_row_doc_id_cmp(
-/*===================*/
- /*!< out:
- < 0 if n1 < n2,
- 0 if n1 == n2,
- > 0 if n1 > n2 */
- const void* p1, /*!< in: id1 */
- const void* p2); /*!< in: id2 */
-
-/******************************************************************//**
-Compare two fts_ranking_t instances doc_ids. */
-UNIV_INLINE
-int
-fts_ranking_doc_id_cmp(
-/*===================*/
- /*!< out:
- < 0 if n1 < n2,
- 0 if n1 == n2,
- > 0 if n1 > n2 */
- const void* p1, /*!< in: id1 */
- const void* p2); /*!< in: id2 */
-
-/******************************************************************//**
-Compare two doc_ids. */
-UNIV_INLINE
-int fts_doc_id_cmp(
-/*==================*/
- /*!< out:
- < 0 if n1 < n2,
- 0 if n1 == n2,
- > 0 if n1 > n2 */
- const void* p1, /*!< in: id1 */
- const void* p2); /*!< in: id2 */
-
-/******************************************************************//**
Duplicate a string. */
UNIV_INLINE
void
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
index facc1e5c..5b57cad7 100644
--- a/storage/innobase/include/fts0types.inl
+++ b/storage/innobase/include/fts0types.inl
@@ -47,53 +47,6 @@ fts_string_dup(
}
/******************************************************************//**
-Compare two fts_trx_row_t doc_ids.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_trx_row_doc_id_cmp(
-/*===================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1;
- const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2;
-
- return((int)(tr1->doc_id - tr2->doc_id));
-}
-
-/******************************************************************//**
-Compare two fts_ranking_t doc_ids.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_ranking_doc_id_cmp(
-/*===================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const fts_ranking_t* rk1 = (const fts_ranking_t*) p1;
- const fts_ranking_t* rk2 = (const fts_ranking_t*) p2;
-
- return((int)(rk1->doc_id - rk2->doc_id));
-}
-
-/******************************************************************//**
-Compare two doc_ids.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int fts_doc_id_cmp(
-/*==================*/
- const void* p1, /*!< in: id1 */
- const void* p2) /*!< in: id2 */
-{
- const doc_id_t* up1 = static_cast<const doc_id_t*>(p1);
- const doc_id_t* up2 = static_cast<const doc_id_t*>(p2);
-
- return static_cast<int>(*up1 - *up2);
-}
-
-/******************************************************************//**
Get the first character's code position for FTS index partition */
extern
ulint
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index 746dab80..1adec365 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -78,34 +78,40 @@ void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
MY_ATTRIBUTE((nonnull));
/** Append a file list node to a list.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the node to be added
-@param[in,out] mtr mini-transaction
+@param base base node block
+@param boffset byte offset of the base node
+@param add block to be added
+@param aoffset byte offset of the node to be added
+@param limit fil_space_t::free_limit
+@param mtr mini-transaction
@return error code */
dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ buf_block_t *add, uint16_t aoffset,
+ uint32_t limit, mtr_t *mtr)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Prepend a file list node to a list.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] add block to be added
-@param[in] aoffset byte offset of the node to be added
-@param[in,out] mtr mini-transaction
+@param base base node block
+@param boffset byte offset of the base node
+@param add block to be added
+@param aoffset byte offset of the node to be added
+@param limit fil_space_t::free_limit
+@param mtr mini-transaction
@return error code */
dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ buf_block_t *add, uint16_t aoffset,
+ uint32_t limit, mtr_t *mtr)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Remove a file list node.
-@param[in,out] base base node block
-@param[in] boffset byte offset of the base node
-@param[in,out] cur block to be removed
-@param[in] coffset byte offset of the current record to be removed
-@param[in,out] mtr mini-transaction
+@param base base node block
+@param boffset byte offset of the base node
+@param cur block to be removed
+@param coffset byte offset of the current record to be removed
+@param limit fil_space_t::free_limit
+@param mtr mini-transaction
@return error code */
dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ buf_block_t *cur, uint16_t coffset,
+ uint32_t limit, mtr_t *mtr)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** @return the length of a list */
@@ -117,11 +123,9 @@ inline uint32_t flst_get_len(const flst_base_node_t *base)
/** @return a file address */
inline fil_addr_t flst_read_addr(const byte *faddr)
{
- fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE),
- mach_read_from_2(faddr + FIL_ADDR_BYTE) };
- ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
- ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
- return addr;
+ ut_ad(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
+ return fil_addr_t{mach_read_from_4(faddr + FIL_ADDR_PAGE),
+ mach_read_from_2(faddr + FIL_ADDR_BYTE)};
}
/** @return list first node address */
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
index d6a4ef67..2dc25a89 100644
--- a/storage/innobase/include/gis0type.h
+++ b/storage/innobase/include/gis0type.h
@@ -66,10 +66,7 @@ typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector;
/* Structure for matched records on the leaf page */
typedef struct matched_rec {
- byte* bufp; /*!< aligned buffer point */
- byte rec_buf[UNIV_PAGE_SIZE_MAX * 2];
- /*!< buffer used to copy matching rec */
- buf_block_t block; /*!< the shadow buffer block */
+ buf_block_t* block; /*!< the shadow buffer block */
ulint used; /*!< memory used */
rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */
mysql_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
@@ -107,7 +104,6 @@ typedef struct rtr_info{
/*!< mutex protect the "path" vector */
rtr_mbr_t mbr; /*!< the search MBR */
que_thr_t* thr; /*!< the search thread */
- mem_heap_t* heap; /*!< memory heap */
btr_cur_t* cursor; /*!< cursor used for search */
dict_index_t* index; /*!< index it is searching */
bool need_prdt_lock;
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 59ee7f55..08b9f4bc 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -438,6 +438,13 @@ dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
bool no_wait= false)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Lock the child tables of a table.
+@param table parent table
+@param trx transaction
+@return error code */
+dberr_t lock_table_children(dict_table_t *table, trx_t *trx)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/** Exclusively lock the data dictionary tables.
@param trx dictionary transaction
@return error code
@@ -724,13 +731,8 @@ private:
bool m_initialised;
/** mutex protecting the locks */
- alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
-#ifdef UNIV_DEBUG
- /** The owner of exclusive latch (0 if none); protected by latch */
- std::atomic<pthread_t> writer{0};
- /** Number of shared latches */
- std::atomic<ulint> readers{0};
-#endif
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ IF_DBUG(srw_lock_debug,srw_spin_lock) latch;
#ifdef SUX_LOCK_GENERIC
protected:
/** mutex for hash_latch::wait() */
@@ -789,71 +791,35 @@ public:
void wr_lock()
{
mysql_mutex_assert_not_owner(&wait_mutex);
- ut_ad(!is_writer());
latch.wr_lock();
- ut_ad(!writer.exchange(pthread_self(),
- std::memory_order_relaxed));
}
/** Release exclusive lock_sys.latch */
- void wr_unlock()
- {
- ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
- pthread_self());
- latch.wr_unlock();
- }
+ void wr_unlock() { latch.wr_unlock(); }
/** Acquire shared lock_sys.latch */
void rd_lock()
{
mysql_mutex_assert_not_owner(&wait_mutex);
- ut_ad(!is_writer());
latch.rd_lock();
- ut_ad(!writer.load(std::memory_order_relaxed));
- ut_d(readers.fetch_add(1, std::memory_order_relaxed));
}
/** Release shared lock_sys.latch */
- void rd_unlock()
- {
- ut_ad(!is_writer());
- ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
- latch.rd_unlock();
- }
+ void rd_unlock() { latch.rd_unlock(); }
#endif
/** Try to acquire exclusive lock_sys.latch
@return whether the latch was acquired */
- bool wr_lock_try()
- {
- ut_ad(!is_writer());
- if (!latch.wr_lock_try()) return false;
- ut_ad(!writer.exchange(pthread_self(),
- std::memory_order_relaxed));
- return true;
- }
+ bool wr_lock_try() { return latch.wr_lock_try(); }
/** Try to acquire shared lock_sys.latch
@return whether the latch was acquired */
- bool rd_lock_try()
- {
- ut_ad(!is_writer());
- if (!latch.rd_lock_try()) return false;
- ut_ad(!writer.load(std::memory_order_relaxed));
- ut_d(readers.fetch_add(1, std::memory_order_relaxed));
- return true;
- }
+ bool rd_lock_try() { return latch.rd_lock_try(); }
/** Assert that wr_lock() has been invoked by this thread */
- void assert_locked() const { ut_ad(is_writer()); }
+ void assert_locked() const { ut_ad(latch.have_wr()); }
/** Assert that wr_lock() has not been invoked by this thread */
- void assert_unlocked() const { ut_ad(!is_writer()); }
+ void assert_unlocked() const { ut_ad(!latch.have_wr()); }
#ifdef UNIV_DEBUG
/** @return whether the current thread is the lock_sys.latch writer */
- bool is_writer() const
- {
-# ifdef SUX_LOCK_GENERIC
- return writer.load(std::memory_order_relaxed) == pthread_self();
-# else
- return writer.load(std::memory_order_relaxed) == pthread_self() ||
- (xtest() && !latch.is_locked_or_waiting());
-# endif
- }
+ bool is_writer() const { return latch.have_wr(); }
+ /** @return whether the current thread is holding lock_sys.latch */
+ bool is_holder() const { return latch.have_any(); }
/** Assert that a lock shard is exclusively latched (by some thread) */
void assert_locked(const lock_t &lock) const;
/** Assert that a table lock shard is exclusively latched by this thread */
@@ -965,14 +931,14 @@ extern lock_sys_t lock_sys;
/** @return the index of an array element */
inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
{
- ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ ut_ad(lock_sys.is_holder());
return calc_hash(fold, n_cells);
}
/** Get a hash table cell. */
inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
{
- ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ ut_ad(lock_sys.is_holder());
return &array[calc_hash(fold)];
}
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
index 22c0c963..2500ac05 100644
--- a/storage/innobase/include/log0crypt.h
+++ b/storage/innobase/include/log0crypt.h
@@ -28,6 +28,9 @@ MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
#include "log0log.h"
+/** innodb_encrypt_log: whether to encrypt the redo log */
+extern my_bool srv_encrypt_log;
+
/** Initialize the redo log encryption key and random parameters
when creating a new redo log.
The random parameters will be persisted in the log header.
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 54851ca0..cef0dcae 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -132,6 +132,9 @@ public:
/** Redo log buffer */
struct log_t
{
+ /** The maximum buf_size */
+ static constexpr unsigned buf_size_max= os_file_request_size_max;
+
/** The original (not version-tagged) InnoDB redo log format */
static constexpr uint32_t FORMAT_3_23= 0;
/** The MySQL 5.7.9/MariaDB 10.2.2 log format */
@@ -165,60 +168,92 @@ struct log_t
static constexpr lsn_t FIRST_LSN= START_OFFSET;
private:
- /** The log sequence number of the last change of durable InnoDB files */
+ /** the lock bit in buf_free */
+ static constexpr size_t buf_free_LOCK= ~(~size_t{0} >> 1);
alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ /** first free offset within buf used;
+ the most significant bit is set by lock_lsn() to protect this field
+ as well as write_to_buf, waits */
+ std::atomic<size_t> buf_free;
+public:
+ /** number of write requests (to buf); protected by lock_lsn() or lsn_lock */
+ size_t write_to_buf;
+ /** log record buffer, written to by mtr_t::commit() */
+ byte *buf;
+private:
+ /** The log sequence number of the last change of durable InnoDB files;
+ protected by lock_lsn() or lsn_lock or latch.wr_lock() */
std::atomic<lsn_t> lsn;
/** the first guaranteed-durable log sequence number */
std::atomic<lsn_t> flushed_to_disk_lsn;
- /** log sequence number when log resizing was initiated, or 0 */
- std::atomic<lsn_t> resize_lsn;
- /** set when there may be need to initiate a log checkpoint.
- This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
- std::atomic<bool> need_checkpoint;
+public:
+ /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */
+ size_t waits;
+ /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
+ unsigned buf_size;
+ /** log file size in bytes, including the header */
+ lsn_t file_size;
+
+#ifdef LOG_LATCH_DEBUG
+ typedef srw_lock_debug log_rwlock;
+ typedef srw_mutex log_lsn_lock;
-#if defined(__aarch64__)
- /* On ARM, we do more spinning */
+ bool latch_have_wr() const { return latch.have_wr(); }
+ bool latch_have_rd() const { return latch.have_rd(); }
+ bool latch_have_any() const { return latch.have_any(); }
+#else
+# ifndef UNIV_DEBUG
+# elif defined SUX_LOCK_GENERIC
+ bool latch_have_wr() const { return true; }
+ bool latch_have_rd() const { return true; }
+ bool latch_have_any() const { return true; }
+# else
+ bool latch_have_wr() const { return latch.is_write_locked(); }
+ bool latch_have_rd() const { return latch.is_locked(); }
+ bool latch_have_any() const { return latch.is_locked(); }
+# endif
+# ifdef __aarch64__
+ /* On ARM, we spin more */
typedef srw_spin_lock log_rwlock;
typedef pthread_mutex_wrapper<true> log_lsn_lock;
-#else
+# else
typedef srw_lock log_rwlock;
typedef srw_mutex log_lsn_lock;
+# endif
#endif
-
-public:
- /** rw-lock protecting writes to buf; normal mtr_t::commit()
- outside any log checkpoint is covered by a shared latch */
+ /** exclusive latch for checkpoint, shared for mtr_t::commit() to buf */
alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch;
-private:
- /** mutex protecting buf_free et al, together with latch */
- log_lsn_lock lsn_lock;
-public:
- /** first free offset within buf use; protected by lsn_lock */
- Atomic_relaxed<size_t> buf_free;
- /** number of write requests (to buf); protected by lsn_lock */
- size_t write_to_buf;
- /** number of append_prepare_wait(); protected by lsn_lock */
- size_t waits;
-private:
+
+ /** number of std::swap(buf, flush_buf) and writes from buf to log;
+ protected by latch.wr_lock() */
+ ulint write_to_log;
+
/** Last written LSN */
lsn_t write_lsn;
-public:
- /** log record buffer, written to by mtr_t::commit() */
- byte *buf;
+
/** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
In write_buf(), buf and flush_buf are swapped */
byte *flush_buf;
- /** number of std::swap(buf, flush_buf) and writes from buf to log;
- protected by latch.wr_lock() */
- ulint write_to_log;
-
+ /** set when there may be need to initiate a log checkpoint.
+ This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+ std::atomic<bool> need_checkpoint;
+ /** whether a checkpoint is pending; protected by latch.wr_lock() */
+ Atomic_relaxed<bool> checkpoint_pending;
+ /** next checkpoint number (protected by latch.wr_lock()) */
+ byte next_checkpoint_no;
+ /** recommended maximum buf_free size, after which the buffer is flushed */
+ unsigned max_buf_free;
/** Log sequence number when a log file overwrite (broken crash recovery)
was noticed. Protected by latch.wr_lock(). */
lsn_t overwrite_warned;
- /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
- size_t buf_size;
+ /** latest completed checkpoint (protected by latch.wr_lock()) */
+ Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+ /** next checkpoint LSN (protected by latch.wr_lock()) */
+ lsn_t next_checkpoint_lsn;
+ /** Log file */
+ log_file_t log;
private:
/** Log file being constructed during resizing; protected by latch */
log_file_t resize_log;
@@ -229,18 +264,14 @@ private:
/** Buffer for writing to resize_log; @see flush_buf */
byte *resize_flush_buf;
- void init_lsn_lock() {lsn_lock.init(); }
- void lock_lsn() { lsn_lock.wr_lock(); }
- void unlock_lsn() {lsn_lock.wr_unlock(); }
- void destroy_lsn_lock() { lsn_lock.destroy(); }
-
-public:
- /** recommended maximum size of buf, after which the buffer is flushed */
- size_t max_buf_free;
+ /** Special implementation of lock_lsn() for IA-32 and AMD64 */
+ void lsn_lock_bts() noexcept;
+ /** Acquire a lock for updating buf_free and related fields.
+ @return the value of buf_free */
+ size_t lock_lsn() noexcept;
- /** log file size in bytes, including the header */
- lsn_t file_size;
-private:
+ /** log sequence number when log resizing was initiated, or 0 */
+ std::atomic<lsn_t> resize_lsn;
/** the log sequence number at the start of the log file */
lsn_t first_lsn;
#if defined __linux__ || defined _WIN32
@@ -250,8 +281,6 @@ private:
public:
/** format of the redo log: e.g., FORMAT_10_8 */
uint32_t format;
- /** Log file */
- log_file_t log;
#if defined __linux__ || defined _WIN32
/** whether file system caching is enabled for the log */
my_bool log_buffered;
@@ -279,21 +308,29 @@ public:
/*!< this is the maximum allowed value
for lsn - last_checkpoint_lsn when a
new query step is started */
- /** latest completed checkpoint (protected by latch.wr_lock()) */
- Atomic_relaxed<lsn_t> last_checkpoint_lsn;
- /** next checkpoint LSN (protected by log_sys.latch) */
- lsn_t next_checkpoint_lsn;
- /** next checkpoint number (protected by latch.wr_lock()) */
- ulint next_checkpoint_no;
- /** whether a checkpoint is pending */
- Atomic_relaxed<bool> checkpoint_pending;
/** buffer for checkpoint header */
byte *checkpoint_buf;
/* @} */
+private:
+ /** The lock used when the spin-only lock_lsn() is not being used */
+ log_lsn_lock lsn_lock;
+public:
+
bool is_initialised() const noexcept { return max_buf_free != 0; }
+ /** whether there is capacity in the log buffer */
+ bool buf_free_ok() const noexcept
+ {
+ ut_ad(!is_pmem());
+ return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) <
+ max_buf_free;
+ }
+
+ void set_buf_free(size_t f) noexcept
+ { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); }
+
#ifdef HAVE_PMEM
bool is_pmem() const noexcept { return !flush_buf; }
#else
@@ -302,7 +339,7 @@ public:
bool is_opened() const noexcept { return log.is_opened(); }
- /** @return target write LSN to react on buf_free >= max_buf_free */
+ /** @return target write LSN to react on !buf_free_ok() */
inline lsn_t get_write_target() const;
/** @return LSN at which log resizing was started and is still in progress
@@ -402,9 +439,7 @@ public:
void set_recovered_lsn(lsn_t lsn) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_write_locked());
-#endif /* SUX_LOCK_GENERIC */
+ ut_ad(latch_have_wr());
write_lsn= lsn;
this->lsn.store(lsn, std::memory_order_relaxed);
flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
@@ -444,17 +479,23 @@ public:
private:
/** Wait in append_prepare() for buffer to become available
- @param lsn log sequence number to write up to
- @param ex whether log_sys.latch is exclusively locked */
- ATTRIBUTE_COLD void append_prepare_wait(lsn_t lsn, bool ex) noexcept;
+ @tparam spin whether to use the spin-only lock_lsn()
+ @param b the value of buf_free
+ @param ex whether log_sys.latch is exclusively locked
+ @param lsn log sequence number to write up to
+ @return the new value of buf_free */
+ template<bool spin>
+ ATTRIBUTE_COLD size_t append_prepare_wait(size_t b, bool ex, lsn_t lsn)
+ noexcept;
public:
/** Reserve space in the log buffer for appending data.
+ @tparam spin whether to use the spin-only lock_lsn()
@tparam pmem log_sys.is_pmem()
@param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */
- template<bool pmem>
- inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
+ template<bool spin,bool pmem>
+ std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
/** Append a string of bytes to the redo log.
@param d destination
@@ -462,9 +503,7 @@ public:
@param size length of str, in bytes */
void append(byte *&d, const void *s, size_t size) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_locked());
-#endif
+ ut_ad(latch_have_any());
ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
memcpy(d, s, size);
d+= size;
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index c916edc9..bfa66216 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -695,14 +695,40 @@ private:
/** Encrypt the log */
ATTRIBUTE_NOINLINE void encrypt();
+ /** Commit the mini-transaction log.
+ @tparam pmem log_sys.is_pmem()
+ @param mtr mini-transaction
+ @param lsns {start_lsn,flush_ahead} */
+ template<bool pmem>
+ static void commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns);
+
/** Append the redo log records to the redo log buffer.
@return {start_lsn,flush_ahead} */
std::pair<lsn_t,page_flush_ahead> do_write();
/** Append the redo log records to the redo log buffer.
+ @tparam spin whether to use the spin-only log_sys.lock_lsn()
+ @tparam pmem log_sys.is_pmem()
+ @param mtr mini-transaction
@param len number of bytes to write
@return {start_lsn,flush_ahead} */
- std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
+ template<bool spin,bool pmem> static
+ std::pair<lsn_t,page_flush_ahead> finish_writer(mtr_t *mtr, size_t len);
+
+ /** The applicable variant of commit_log() */
+ static void (*commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>);
+ /** The applicable variant of finish_writer() */
+ static std::pair<lsn_t,page_flush_ahead> (*finisher)(mtr_t *, size_t);
+
+ std::pair<lsn_t,page_flush_ahead> finish_write(size_t len)
+ { return finisher(this, len); }
+public:
+ /** Poll interval in log_sys.lock_lsn(); 0 to use log_sys.lsn_lock.
+ Protected by LOCK_global_system_variables and log_sys.latch. */
+ static unsigned spin_wait_delay;
+ /** Update finisher when spin_wait_delay is changing to or from 0. */
+ static void finisher_update();
+private:
/** Release all latches. */
void release();
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index c8374515..7eba359f 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -46,6 +46,18 @@ Created 10/21/1995 Heikki Tuuri
#include <time.h>
#endif /* !_WIN32 */
+/** The maximum size of a read or write request.
+
+According to Linux "man 2 read" and "man 2 write" this applies to
+both 32-bit and 64-bit systems.
+
+On FreeBSD, the limit is close to the Linux one, INT_MAX.
+
+On Microsoft Windows, the limit is UINT_MAX (4 GiB - 1).
+
+On other systems, the limit typically is up to SSIZE_T_MAX. */
+static constexpr unsigned os_file_request_size_max= 0x7ffff000;
+
extern bool os_has_said_disk_full;
/** File offset in bytes */
@@ -109,25 +121,21 @@ struct pfs_os_file_t
/** Options for os_file_create_func @{ */
enum os_file_create_t {
- OS_FILE_OPEN = 51, /*!< to open an existing file (if
- doesn't exist, error) */
- OS_FILE_CREATE, /*!< to create new file (if
- exists, error) */
- OS_FILE_OVERWRITE, /*!< to create a new file, if exists
- the overwrite old file */
- OS_FILE_OPEN_RAW, /*!< to open a raw device or disk
- partition */
- OS_FILE_CREATE_PATH, /*!< to create the directories */
- OS_FILE_OPEN_RETRY, /*!< open with retry */
-
- /** Flags that can be combined with the above values. Please ensure
- that the above values stay below 128. */
-
- OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */
- OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to
- the log unless it is a fatal error,
- this flag is only used if
- ON_ERROR_NO_EXIT is set */
+ /** create a new file */
+ OS_FILE_CREATE= 0,
+ /** open an existing file */
+ OS_FILE_OPEN,
+ /** retry opening an existing file */
+ OS_FILE_OPEN_RETRY,
+ /** open a raw block device */
+ OS_FILE_OPEN_RAW,
+
+ /** do not display diagnostic messages */
+ OS_FILE_ON_ERROR_SILENT= 4,
+
+ OS_FILE_CREATE_SILENT= OS_FILE_CREATE | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_OPEN_SILENT= OS_FILE_OPEN | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_OPEN_RETRY_SILENT= OS_FILE_OPEN_RETRY | OS_FILE_ON_ERROR_SILENT
};
static const ulint OS_FILE_READ_ONLY = 333;
@@ -144,7 +152,7 @@ static const ulint OS_FILE_NORMAL = 62;
/** Types for file create @{ */
static constexpr ulint OS_DATA_FILE = 100;
static constexpr ulint OS_LOG_FILE = 101;
-#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+#if defined _WIN32 || defined O_DIRECT
static constexpr ulint OS_DATA_FILE_NO_O_DIRECT = 103;
#endif
/* @} */
@@ -191,14 +199,10 @@ public:
WRITE_ASYNC= WRITE_SYNC | 1,
/** A doublewrite batch */
DBLWR_BATCH= WRITE_ASYNC | 8,
- /** Write data; evict the block on write completion */
- WRITE_LRU= WRITE_ASYNC | 32,
/** Write data and punch hole for the rest */
- PUNCH= WRITE_ASYNC | 64,
- /** Write data and punch hole; evict the block on write completion */
- PUNCH_LRU= PUNCH | WRITE_LRU,
+ PUNCH= WRITE_ASYNC | 16,
/** Zero out a range of bytes in fil_space_t::io() */
- PUNCH_RANGE= WRITE_SYNC | 128,
+ PUNCH_RANGE= WRITE_SYNC | 32,
};
constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
@@ -211,7 +215,6 @@ public:
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
- bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
void write_complete(int io_error) const;
@@ -349,7 +352,7 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success);
@@ -358,7 +361,7 @@ os_file_create_simple_func(
os_file_create_simple_no_error_handling(), not directly this function!
A simple function to open or create a file.
@param[in] name name of the file or path as a null-terminated string
-@param[in] create_mode create mode
+@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN
@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option
is used by a backup program reading the file
@@ -369,28 +372,12 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_no_error_handling_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success)
MY_ATTRIBUTE((warn_unused_result));
-#ifndef HAVE_FCNTL_DIRECT
-#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
-#else
-/** Tries to disable OS caching on an opened file descriptor.
-@param[in] fd file descriptor to alter
-@param[in] file_name file name, used in the diagnostic message
-@param[in] name "open" or "create"; used in the diagnostic
- message */
-void
-os_file_set_nocache(
-/*================*/
- int fd, /*!< in: file descriptor to alter */
- const char* file_name,
- const char* operation_name);
-#endif
-
#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
/** Obtain an exclusive lock on a file.
@param fd file descriptor
@@ -419,7 +406,7 @@ Opens an existing file or creates a new.
pfs_os_file_t
os_file_create_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint purpose,
ulint type,
bool read_only,
@@ -617,7 +604,7 @@ pfs_os_file_t
pfs_os_file_create_simple_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success,
@@ -633,7 +620,7 @@ monitor file creation/open.
@param[in] key Performance Schema Key
@param[in] name name of the file or path as a null-terminated
string
-@param[in] create_mode create mode
+@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN
@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file
@@ -648,7 +635,7 @@ pfs_os_file_t
pfs_os_file_create_simple_no_error_handling_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success,
@@ -681,7 +668,7 @@ pfs_os_file_t
pfs_os_file_create_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint purpose,
ulint type,
bool read_only,
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
index 7de31505..a7603028 100644
--- a/storage/innobase/include/os0file.inl
+++ b/storage/innobase/include/os0file.inl
@@ -45,7 +45,7 @@ pfs_os_file_t
pfs_os_file_create_simple_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success,
@@ -80,7 +80,7 @@ monitor file creation/open.
@param[in] key Performance Schema Key
@param[in] name name of the file or path as a null-terminated
string
-@param[in] create_mode create mode
+@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN
@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file
@@ -95,7 +95,7 @@ pfs_os_file_t
pfs_os_file_create_simple_no_error_handling_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success,
@@ -146,7 +146,7 @@ pfs_os_file_t
pfs_os_file_create_func(
mysql_pfs_key_t key,
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint purpose,
ulint type,
bool read_only,
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index 93ea650d..1c2af128 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -165,14 +165,11 @@ row_merge_drop_indexes(
prepare_inplace_alter_table_dict(). */
void row_merge_drop_temp_indexes();
-/** Create temporary merge files in the given paramater path, and if
-UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
-@param[in] path location for creating temporary merge files, or NULL
+/** Create a temporary file at the specified path.
+@param path location for creating temporary merge files, or nullptr
@return File descriptor */
-pfs_os_file_t
-row_merge_file_create_low(
- const char* path)
- MY_ATTRIBUTE((warn_unused_result));
+pfs_os_file_t row_merge_file_create_low(const char *path)
+ MY_ATTRIBUTE((warn_unused_result));
/*********************************************************************//**
Destroy a merge file. And de-register the file from Performance Schema
if UNIV_PFS_IO is defined. */
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index a1350740..7056c77f 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -370,6 +370,12 @@ row_search_index_entry(
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Get the byte offset of the DB_TRX_ID column
+@param[in] rec clustered index record
+@param[in] index clustered index
+@return the byte offset of DB_TRX_ID, from the start of rec */
+ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index);
+
#define ROW_COPY_DATA 1
#define ROW_COPY_POINTERS 2
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index 8134c60f..54e4a1d2 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -115,8 +115,8 @@ row_sel_convert_mysql_key_to_innobase(
ulint buf_len, /*!< in: buffer length */
dict_index_t* index, /*!< in: index of the key value */
const byte* key_ptr, /*!< in: MySQL key value */
- ulint key_len); /*!< in: MySQL key value length */
-
+ ulint key_len) /*!< in: MySQL key value length */
+ MY_ATTRIBUTE((nonnull(1,4,5)));
/** Search for rows in the database using cursor.
Function is mainly used for tables that are shared across connections and
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index 51f3049b..2ed26748 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -194,7 +194,6 @@ enum monitor_id_t {
MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
MONITOR_LRU_GET_FREE_LOOPS,
- MONITOR_LRU_GET_FREE_WAITS,
MONITOR_FLUSH_AVG_PAGE_RATE,
MONITOR_FLUSH_LSN_AVG_RATE,
@@ -215,7 +214,6 @@ enum monitor_id_t {
MONITOR_LRU_BATCH_SCANNED_PER_CALL,
MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
MONITOR_LRU_GET_FREE_SEARCH,
MONITOR_LRU_SEARCH_SCANNED,
MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 457d9ab5..5e6bfc33 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -121,10 +121,6 @@ struct srv_stats_t
ulint_ctr_n_t n_temp_blocks_decrypted;
};
-/** We are prepared for a situation that we have this many threads waiting for
-a transactional lock inside InnoDB. srv_start() sets the value. */
-extern ulint srv_max_n_threads;
-
extern const char* srv_main_thread_op_info;
/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
index 01067322..98c256d3 100644
--- a/storage/innobase/include/srw_lock.h
+++ b/storage/innobase/include/srw_lock.h
@@ -153,7 +153,7 @@ template<bool spinloop> class srw_lock_impl;
/** Slim shared-update-exclusive lock with no recursion */
template<bool spinloop>
-class ssux_lock_impl final
+class ssux_lock_impl
{
#ifdef UNIV_PFS_RWLOCK
friend class ssux_lock;
@@ -550,3 +550,51 @@ typedef srw_lock_impl<false> srw_lock;
typedef srw_lock_impl<true> srw_spin_lock;
#endif
+
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+
+class srw_lock_debug : private srw_lock
+{
+ /** The owner of the exclusive lock (0 if none) */
+ std::atomic<pthread_t> writer;
+ /** Protects readers */
+ mutable srw_mutex readers_lock;
+ /** Threads that hold the lock in shared mode */
+ std::atomic<std::unordered_multiset<pthread_t>*> readers;
+
+ /** Register a read lock. */
+ void readers_register();
+
+public:
+ void SRW_LOCK_INIT(mysql_pfs_key_t key);
+ void destroy();
+
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return srw_lock::is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept { return srw_lock::is_write_locked(); }
+#endif
+
+ /** Acquire an exclusive lock */
+ void wr_lock(SRW_LOCK_ARGS(const char *file, unsigned line));
+ /** @return whether an exclusive lock was acquired */
+ bool wr_lock_try();
+ /** Release after wr_lock() */
+ void wr_unlock();
+ /** Acquire a shared lock */
+ void rd_lock(SRW_LOCK_ARGS(const char *file, unsigned line));
+ /** @return whether a shared lock was acquired */
+ bool rd_lock_try();
+ /** Release after rd_lock() */
+ void rd_unlock();
+ /** @return whether this thread is between rd_lock() and rd_unlock() */
+ bool have_rd() const noexcept;
+ /** @return whether this thread is between wr_lock() and wr_unlock() */
+ bool have_wr() const noexcept;
+ /** @return whether this thread is holding rd_lock() or wr_lock() */
+ bool have_any() const noexcept;
+};
+#endif
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 0f4f8afa..1fb6cd68 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -55,80 +55,74 @@ Run a purge batch.
@return number of undo log pages handled in the batch */
ulint trx_purge(ulint n_tasks, ulint history_size);
-/** Rollback segements from a given transaction with trx-no
-scheduled for purge. */
-class TrxUndoRsegs {
-private:
- typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
- trx_rsegs_t;
-public:
- typedef trx_rsegs_t::iterator iterator;
- typedef trx_rsegs_t::const_iterator const_iterator;
-
- TrxUndoRsegs() = default;
-
- /** Constructor */
- TrxUndoRsegs(trx_rseg_t& rseg)
- : trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
- /** Constructor */
- TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
- : trx_no(trx_no), m_rsegs(1, &rseg) {}
-
- bool operator!=(const TrxUndoRsegs& other) const
- { return trx_no != other.trx_no; }
- bool empty() const { return m_rsegs.empty(); }
- void erase(iterator& it) { m_rsegs.erase(it); }
- iterator begin() { return(m_rsegs.begin()); }
- iterator end() { return(m_rsegs.end()); }
- const_iterator begin() const { return m_rsegs.begin(); }
- const_iterator end() const { return m_rsegs.end(); }
-
- /** Compare two TrxUndoRsegs based on trx_no.
- @param elem1 first element to compare
- @param elem2 second element to compare
- @return true if elem1 > elem2 else false.*/
- bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs)
- {
- return(lhs.trx_no > rhs.trx_no);
- }
-
- /** Copy of trx_rseg_t::last_trx_no() */
- trx_id_t trx_no= 0;
-private:
- /** Rollback segments of a transaction, scheduled for purge. */
- trx_rsegs_t m_rsegs{};
-};
-
-typedef std::priority_queue<
- TrxUndoRsegs,
- std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
- TrxUndoRsegs> purge_pq_t;
-
-/** Chooses the rollback segment with the oldest committed transaction */
-struct TrxUndoRsegsIterator {
- /** Constructor */
- TrxUndoRsegsIterator();
- /** Sets the next rseg to purge in purge_sys.
- Executed in the purge coordinator thread.
- @retval false when nothing is to be purged
- @retval true when purge_sys.rseg->latch was locked */
- inline bool set_next();
-
-private:
- // Disable copying
- TrxUndoRsegsIterator(const TrxUndoRsegsIterator&);
- TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&);
-
- /** The current element to process */
- TrxUndoRsegs m_rsegs;
- /** Track the current element in m_rsegs */
- TrxUndoRsegs::const_iterator m_iter;
-};
-
/** The control structure used in the purge operation */
class purge_sys_t
{
- friend TrxUndoRsegsIterator;
+ /** Min-heap based priority queue of (trx_no, trx_sys.rseg_array index)
+ pairs, ordered on trx_no. The highest 64-TRX_NO_SHIFT bits of each element are
+ trx_no; the lowest 8 bits are the rseg's index in trx_sys.rseg_array. */
+ class purge_queue
+ {
+ public:
+ typedef std::vector<uint64_t, ut_allocator<uint64_t>> container_type;
+ /** Number of bits reserved to shift trx_no in purge queue element */
+ static constexpr unsigned TRX_NO_SHIFT= 8;
+
+ bool empty() const { return m_array.empty(); }
+ void clear() { m_array.clear(); }
+
+ /** Push (trx_no, trx_sys.rseg_array index) into min-heap.
+ @param trx_no_rseg (trx_no << TRX_NO_SHIFT | (trx_sys.rseg_array index)) */
+ void push_trx_no_rseg(container_type::value_type trx_no_rseg)
+ {
+ m_array.push_back(trx_no_rseg);
+ std::push_heap(m_array.begin(), m_array.end(),
+ std::greater<container_type::value_type>());
+ }
+
+ /** Push rseg to priority queue.
+ @param trx_no trx_no of committed transaction
+ @param rseg rseg of committed transaction*/
+ void push(trx_id_t trx_no, const trx_rseg_t &rseg)
+ {
+ ut_ad(trx_no < 1ULL << (DATA_TRX_ID_LEN * CHAR_BIT));
+ ut_ad(&rseg >= trx_sys.rseg_array);
+ ut_ad(&rseg < trx_sys.rseg_array + TRX_SYS_N_RSEGS);
+ push_trx_no_rseg(trx_no << TRX_NO_SHIFT |
+ byte(&rseg - trx_sys.rseg_array));
+ }
+
+ /** Extracts rseg from (trx_no, trx_sys.rseg_array index) pair.
+ @param trx_no_rseg (trx_no << TRX_NO_SHIFT | (trx_sys.rseg_array index))
+ @return pointer to rseg in trx_sys.rseg_array */
+ static trx_rseg_t *rseg(container_type::value_type trx_no_rseg) {
+ byte i= static_cast<byte>(trx_no_rseg);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+ return &trx_sys.rseg_array[i];
+ }
+
+ /** Pop rseg from priority queue.
+ @return pointer to popped trx_rseg_t object */
+ trx_rseg_t *pop()
+ {
+ ut_ad(!empty());
+ std::pop_heap(m_array.begin(), m_array.end(),
+ std::greater<container_type::value_type>());
+ trx_rseg_t *r = rseg(m_array.back());
+ m_array.pop_back();
+ return r;
+ }
+
+ /** Clone m_array.
+ @return m_array clone */
+ container_type clone_container() const{ return m_array; }
+
+ private:
+ /** Array of (trx_no, trx_sys.rseg_array index) pairs. */
+ container_type m_array;
+ };
+
+
public:
/** latch protecting view, m_enabled */
alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
@@ -244,15 +238,36 @@ private:
record */
uint16_t hdr_offset; /*!< Header byte offset on the page */
+ /** Binary min-heap of (trx_no, trx_sys.rseg_array index) pairs, ordered on
+ trx_no. It is protected by the pq_mutex */
+ purge_queue purge_queue;
+
+ /** Mutex protecting purge_queue */
+ mysql_mutex_t pq_mutex;
- TrxUndoRsegsIterator
- rseg_iter; /*!< Iterator to get the next rseg
- to process */
public:
- purge_pq_t purge_queue; /*!< Binary min-heap, ordered on
- TrxUndoRsegs::trx_no. It is protected
- by the pq_mutex */
- mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */
+
+ void enqueue(trx_id_t trx_no, const trx_rseg_t &rseg) {
+ mysql_mutex_assert_owner(&pq_mutex);
+ purge_queue.push(trx_no, rseg);
+ }
+
+ /** Push to purge queue without acquiring pq_mutex; the caller must hold it.
+ @param rseg rseg to push */
+ void enqueue(const trx_rseg_t &rseg) { enqueue(rseg.last_trx_no(), rseg); }
+
+ /** Clone purge queue container.
+ @return purge queue container clone */
+ purge_queue::container_type clone_queue_container() const {
+ mysql_mutex_assert_owner(&pq_mutex);
+ return purge_queue.clone_container();
+ }
+
+ /** Acquire the purge queue mutex (pq_mutex) */
+ void queue_lock() { mysql_mutex_lock(&pq_mutex); }
+
+ /** Release purge queue mutex */
+ void queue_unlock() { mysql_mutex_unlock(&pq_mutex); }
/** innodb_undo_log_truncate=ON state;
only modified by purge_coordinator_callback() */
@@ -332,8 +347,9 @@ private:
/** Update the last not yet purged history log info in rseg when
we have purged a whole undo log. Advances also purge_trx_no
- past the purged log. */
- void rseg_get_next_history_log();
+ past the purged log.
+ @return whether anything is to be purged */
+ bool rseg_get_next_history_log();
public:
/**
@@ -438,6 +454,11 @@ public:
@param already_stopped True indicates purge threads were
already stopped */
void stop_FTS(const dict_table_t &table, bool already_stopped=false);
+
+ /** Cleanse purge queue to remove the rsegs that reside in the undo tablespace
+ marked for truncation.
+ @param space undo tablespace being truncated */
+ void cleanse_purge_queue(const fil_space_t &space);
};
/** The global data structure coordinating a purge */
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 7fa43047..e0051b2a 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -59,7 +59,7 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
/** tablespace containing the rollback segment; constant after init() */
fil_space_t *space;
/** latch protecting everything except page_no, space */
- srw_spin_lock latch;
+ IF_DBUG(srw_lock_debug,srw_spin_lock) latch;
/** rollback segment header page number; constant after init() */
uint32_t page_no;
/** length of the TRX_RSEG_HISTORY list (number of transactions) */
@@ -170,19 +170,21 @@ public:
/** Last not yet purged undo log header; FIL_NULL if all purged */
uint32_t last_page_no;
- /** trx_t::no | last_offset << 48 */
+ /** trx_t::no << 16 | last_offset */
uint64_t last_commit_and_offset;
/** @return the commit ID of the last committed transaction */
trx_id_t last_trx_no() const
- { return last_commit_and_offset & ((1ULL << 48) - 1); }
+ { return last_commit_and_offset >> 16; }
/** @return header offset of the last committed transaction */
uint16_t last_offset() const
- { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+ {
+ return static_cast<uint16_t>(last_commit_and_offset);
+ }
void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
{
- last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+ last_commit_and_offset= trx_no << 16 | static_cast<uint64_t>(last_offset);
}
/** @return the page identifier */
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 0a3e0d62..15255354 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -785,13 +785,19 @@ public:
const char* op_info; /*!< English text describing the
current operation, or an empty
string */
- uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
- bool check_foreigns; /*!< normally TRUE, but if the user
- wants to suppress foreign key checks,
- (in table imports, for example) we
- set this FALSE */
+ /** TRX_ISO_REPEATABLE_READ, ... */
+ unsigned isolation_level:2;
+ /** when set, REPEATABLE READ will actually be Snapshot Isolation, due to
+ detecting write/write conflicts and disabling "semi-consistent read" */
+ unsigned snapshot_isolation:1;
+ /** normally set; "SET foreign_key_checks=0" can be issued to suppress
+ foreign key checks, in table imports, for example */
+ unsigned check_foreigns:1;
+ /** normally set; "SET unique_checks=0, foreign_key_checks=0"
+ enables bulk insert into an empty table */
+ unsigned check_unique_secondary:1;
/** whether an insert into an empty table is active */
- bool bulk_insert;
+ unsigned bulk_insert:1;
/*------------------------------*/
/* MySQL has a transaction coordinator to coordinate two phase
commit between multiple storage engines and the binary log. When
@@ -805,13 +811,6 @@ public:
/** whether this is holding the prepare mutex */
bool active_commit_ordered;
/*------------------------------*/
- bool check_unique_secondary;
- /*!< normally TRUE, but if the user
- wants to speed up inserts by
- suppressing unique key checks
- for secondary indexes when we decide
- if we can use the insert buffer for
- them, we set this FALSE */
bool flush_log_later;/* In 2PC, we hold the
prepare_commit mutex across
both phases. In that case, we
@@ -1189,10 +1188,16 @@ public:
return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS;
}
+ /** Do the bulk insert for the buffered insert operation of a table.
+ @param table table whose buffered insert operation is to be applied
+ @return DB_SUCCESS or error code. */
+ dberr_t bulk_insert_apply_for_table(dict_table_t *table);
private:
/** Apply the buffered bulk inserts. */
dberr_t bulk_insert_apply_low();
+ /** Rollback the bulk insert operation for the transaction */
+ void bulk_rollback_low();
/** Assign a rollback segment for modifying temporary tables.
@return the assigned rollback segment */
trx_rseg_t *assign_temp_rseg();
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
index 9f05989f..023e2b98 100644
--- a/storage/innobase/include/trx0undo.inl
+++ b/storage/innobase/include/trx0undo.inl
@@ -125,5 +125,6 @@ trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
{
uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
uint16_t next= mach_read_from_2(undo_page->page.frame + rec);
- return next == end ? nullptr : undo_page->page.frame + next;
+ ut_ad(next <= end);
+ return next >= end ? nullptr : undo_page->page.frame + next;
}
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
index f4183e4c..3ff5f885 100644
--- a/storage/innobase/include/ut0new.h
+++ b/storage/innobase/include/ut0new.h
@@ -1071,9 +1071,8 @@ static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
{
void *ptr = my_large_malloc(&n_bytes, MYF(0));
- ut_dontdump(ptr, n_bytes, true);
-
if (ptr) {
+ ut_dontdump(ptr, n_bytes, true);
os_total_large_mem_allocated += n_bytes;
}
return ptr;
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index fe16ce14..500b6455 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -242,20 +242,6 @@ ut_print_name(
FILE* ef, /*!< in: stream */
const trx_t* trx, /*!< in: transaction */
const char* name); /*!< in: table name to print */
-/** Format a table name, quoted as an SQL identifier.
-If the name contains a slash '/', the result will contain two
-identifiers separated by a period (.), as in SQL
-database_name.table_name.
-@see table_name_t
-@param[in] name table or index name
-@param[out] formatted formatted result, will be NUL-terminated
-@param[in] formatted_size size of the buffer in bytes
-@return pointer to 'formatted' */
-char*
-ut_format_name(
- const char* name,
- char* formatted,
- ulint formatted_size);
/**********************************************************************//**
Catenate files. */
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
index f4660f96..ad43e1c8 100644
--- a/storage/innobase/include/ut0vec.h
+++ b/storage/innobase/include/ut0vec.h
@@ -201,15 +201,6 @@ ib_vector_last_const(
const ib_vector_t* vec); /* in: vector */
/********************************************************************
-Sort the vector elements. */
-UNIV_INLINE
-void
-ib_vector_sort(
-/*===========*/
- ib_vector_t* vec, /* in/out: vector */
- ib_compare_t compare); /* in: the comparator to use for sort */
-
-/********************************************************************
The default ib_vector_t heap free. Does nothing. */
UNIV_INLINE
void
diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl
index 531f0f22..1a844dd8 100644
--- a/storage/innobase/include/ut0vec.inl
+++ b/storage/innobase/include/ut0vec.inl
@@ -305,19 +305,6 @@ ib_vector_remove(
}
/********************************************************************
-Sort the vector elements. */
-UNIV_INLINE
-void
-ib_vector_sort(
-/*===========*/
- /* out: void */
- ib_vector_t* vec, /* in: vector */
- ib_compare_t compare)/* in: the comparator to use for sort */
-{
- qsort(vec->data, vec->used, vec->sizeof_value, compare);
-}
-
-/********************************************************************
Destroy the vector. Make sure the vector owns the allocator, e.g.,
the heap in the the heap allocator. */
UNIV_INLINE
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index df51ceb1..a2107007 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -47,6 +47,7 @@ Created 5/7/1996 Heikki Tuuri
#include "que0que.h"
#include "scope.h"
#include <debug_sync.h>
+#include <mysql/service_thd_mdl.h>
#include <set>
@@ -173,7 +174,7 @@ void lock_sys_t::assert_locked(const dict_table_t &table) const
ut_ad(!table.is_temporary());
if (is_writer())
return;
- ut_ad(readers);
+ ut_ad(latch.have_rd());
ut_ad(table.lock_mutex_is_owner());
}
@@ -182,7 +183,7 @@ void lock_sys_t::hash_table::assert_locked(const page_id_t id) const
{
if (lock_sys.is_writer())
return;
- ut_ad(lock_sys.readers);
+ ut_ad(lock_sys.is_holder());
ut_ad(latch(cell_get(id.fold()))->is_locked());
}
@@ -191,7 +192,7 @@ void lock_sys_t::assert_locked(const hash_cell_t &cell) const
{
if (is_writer())
return;
- ut_ad(lock_sys.readers);
+ ut_ad(lock_sys.is_holder());
ut_ad(hash_table::latch(const_cast<hash_cell_t*>(&cell))->is_locked());
}
#endif
@@ -426,13 +427,10 @@ void lock_sys_t::wr_lock(const char *file, unsigned line)
{
mysql_mutex_assert_not_owner(&wait_mutex);
latch.wr_lock(file, line);
- ut_ad(!writer.exchange(pthread_self(), std::memory_order_relaxed));
}
/** Release exclusive lock_sys.latch */
void lock_sys_t::wr_unlock()
{
- ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
- pthread_self());
latch.wr_unlock();
}
@@ -441,15 +439,11 @@ void lock_sys_t::rd_lock(const char *file, unsigned line)
{
mysql_mutex_assert_not_owner(&wait_mutex);
latch.rd_lock(file, line);
- ut_ad(!writer.load(std::memory_order_relaxed));
- ut_d(readers.fetch_add(1, std::memory_order_relaxed));
}
/** Release shared lock_sys.latch */
void lock_sys_t::rd_unlock()
{
- ut_ad(!writer.load(std::memory_order_relaxed));
- ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
latch.rd_unlock();
}
#endif
@@ -976,8 +970,31 @@ func_exit:
for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock;
lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
{
- /* if victim has also BF status, but has earlier seqno, we have to wait */
- if (lock->trx != trx &&
+ /* Victim trx needs to be different from BF trx and it has to have a
+ THD so that we can kill it. Victim might not have THD in two cases:
+
+ (1) An incomplete transaction that was recovered from undo logs
+ on server startup (and not yet rolled back).
+
+ (2) Transaction that is in XA PREPARE state and whose client
+ connection was disconnected.
+
+ Neither of these can complete before lock_wait_wsrep() releases
+ lock_sys.latch.
+
+ (1) trx_t::commit_in_memory() is clearing both
+ trx_t::state and trx_t::is_recovered before it invokes
+ lock_release(trx_t*) (which would be blocked by the exclusive
+ lock_sys.latch that we are holding here). Hence, it is not
+ possible to write a debug assertion to document this scenario.
+
+ (2) If it is in XA PREPARE state, it would eventually be rolled
+ back and the lock conflict would be resolved when an XA COMMIT
+ or XA ROLLBACK statement is executed in some other connection.
+
+ If victim has also BF status, but has earlier seqno, we have to wait.
+ */
+ if (lock->trx != trx && lock->trx->mysql_thd &&
!(wsrep_thd_is_BF(lock->trx->mysql_thd, false) &&
wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd)))
{
@@ -1009,8 +1026,11 @@ func_exit:
lock= lock_rec_get_next(heap_no, lock);
do
{
- /* if victim has also BF status, but has earlier seqno, we have to wait */
- if (lock->trx != trx &&
+ /* This is a similar case to the one above, except that here we have
+ record locks instead of table locks. See the comment above for
+ details.
+ */
+ if (lock->trx != trx && lock->trx->mysql_thd &&
!(wsrep_thd_is_BF(lock->trx->mysql_thd, false) &&
wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd)))
{
@@ -1036,8 +1056,12 @@ func_exit:
std::vector<std::pair<ulong,trx_id_t>> victim_id;
for (trx_t *v : victims)
+ {
+ /* Victim must have THD */
+ ut_ad(v->mysql_thd);
victim_id.emplace_back(std::pair<ulong,trx_id_t>
{thd_get_thread_id(v->mysql_thd), v->id});
+ }
DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
{
@@ -3940,6 +3964,8 @@ static void lock_table_dequeue(lock_t *in_lock, bool owns_wait_mutex)
dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
bool no_wait)
{
+ ut_ad(!dict_sys.frozen());
+
mem_heap_t *heap= mem_heap_create(512);
sel_node_t *node= sel_node_create(heap);
que_thr_t *thr= pars_complete_graph_for_exec(node, trx, heap, nullptr);
@@ -3976,6 +4002,67 @@ run_again:
return err;
}
+/** Lock the child tables of a table.
+@param table parent table
+@param trx transaction
+@return error code */
+dberr_t lock_table_children(dict_table_t *table, trx_t *trx)
+{
+ MDL_context *mdl_context=
+ static_cast<MDL_context*>(thd_mdl_context(trx->mysql_thd));
+ ut_ad(mdl_context);
+ struct table_mdl{dict_table_t* table; MDL_ticket *mdl;};
+ std::vector<table_mdl> children;
+ children.emplace_back(table_mdl{table, nullptr});
+
+ dberr_t err= DB_SUCCESS;
+ dict_sys.freeze(SRW_LOCK_CALL);
+
+ rescan:
+ for (auto f : table->referenced_set)
+ if (dict_table_t *child= f->foreign_table)
+ {
+ if (std::find_if(children.begin(), children.end(),
+ [&](const table_mdl &c){ return c.table == child; }) !=
+ children.end())
+ continue; /* We already acquired MDL on this child table. */
+ MDL_ticket *mdl= nullptr;
+ child->acquire();
+ child= dict_acquire_mdl_shared<false>(child, mdl_context, &mdl,
+ DICT_TABLE_OP_NORMAL);
+ if (child)
+ {
+ if (!mdl)
+ child->release();
+ children.emplace_back(table_mdl{child, mdl});
+ goto rescan;
+ }
+ err= DB_LOCK_WAIT_TIMEOUT;
+ break;
+ }
+ dict_sys.unfreeze();
+
+ if (err == DB_SUCCESS)
+ for (const table_mdl &child : children)
+ if (child.mdl)
+ if ((err= lock_table_for_trx(child.table, trx, LOCK_X)) != DB_SUCCESS)
+ break;
+
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (table_mdl &child : children)
+ {
+ if (child.mdl)
+ {
+ child.table->release();
+ mdl_context->release_lock(child.mdl);
+ }
+ }
+ dict_sys.unfreeze();
+
+ return err;
+}
+
+
/** Exclusively lock the data dictionary tables.
@param trx dictionary transaction
@return error code
@@ -4125,7 +4212,7 @@ restart:
ulint count= 1000;
/* We will not attempt hardware lock elision (memory transaction)
here. Both lock_rec_dequeue_from_page() and lock_table_dequeue()
- would likely lead to a memory transaction due to a system call, to
+ would likely lead to a memory transaction abort due to a system call, to
wake up a waiting transaction. */
lock_sys.rd_lock(SRW_LOCK_CALL);
trx->mutex_lock();
@@ -4295,28 +4382,82 @@ void lock_release_on_drop(trx_t *trx)
}
}
-/** Reset lock bit for supremum and rebuild waiting queue.
+/** Reset a lock bit and rebuild waiting queue.
@param cell rec hash cell of in_lock
@param lock the lock with supemum bit set */
-static void lock_rec_unlock_supremum(hash_cell_t &cell, lock_t *lock)
+static void lock_rec_unlock(hash_cell_t &cell, lock_t *lock, ulint heap_no)
{
- ut_ad(lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+ ut_ad(lock_rec_get_nth_bit(lock, heap_no));
#ifdef SAFE_MUTEX
ut_ad(!mysql_mutex_is_owner(&lock_sys.wait_mutex));
#endif /* SAFE_MUTEX */
ut_ad(!lock->is_table());
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
- lock_rec_reset_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_reset_nth_bit(lock, heap_no);
- lock_t *first_lock= lock_sys_t::get_first(
- cell, lock->un_member.rec_lock.page_id, PAGE_HEAP_NO_SUPREMUM);
+ lock_t *first_lock=
+ lock_sys_t::get_first(cell, lock->un_member.rec_lock.page_id, heap_no);
lock_rec_rebuild_waiting_queue(
#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
lock->trx,
#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
- cell, first_lock, PAGE_HEAP_NO_SUPREMUM);
+ cell, first_lock, heap_no);
+}
+
+/** Release locks to unmodified records on an index leaf page.
+@param cell lock_sys.rec_hash cell of lock
+@param lock record lock
+@param offsets storage for rec_get_offsets()
+@param heap storage for rec_get_offsets()
+@param mtr mini-transaction (will be started and committed) */
+static void lock_rec_unlock_unmodified(hash_cell_t &cell, lock_t *lock,
+ rec_offs *&offsets, mem_heap_t *&heap,
+ mtr_t &mtr)
+{
+ ut_ad(!lock->is_waiting());
+
+ dict_index_t *const index= lock->index;
+
+ mtr.start();
+ if (buf_block_t *block=
+ btr_block_get(*index, lock->un_member.rec_lock.page_id.page_no(),
+ RW_S_LATCH, true, &mtr))
+ {
+ if (UNIV_UNLIKELY(!page_is_leaf(block->page.frame)))
+ {
+ ut_ad("corrupted lock system" == 0);
+ goto func_exit;
+ }
+
+ for (ulint i= PAGE_HEAP_NO_USER_LOW; i < lock_rec_get_n_bits(lock); ++i)
+ {
+ if (!lock_rec_get_nth_bit(lock, i));
+ else if (const rec_t *rec=
+ page_find_rec_with_heap_no(block->page.frame, i))
+ {
+ if (index->is_clust())
+ {
+ if (trx_read_trx_id(rec + row_trx_id_offset(rec, index)) ==
+ lock->trx->id)
+ continue;
+ unlock_rec:
+ lock_rec_unlock(cell, lock, i);
+ }
+ else
+ {
+ offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ if (lock->trx !=
+ lock_sec_rec_some_has_impl(lock->trx, rec, index, offsets))
+ goto unlock_rec;
+ }
+ }
+ }
+ }
+func_exit:
+ mtr.commit();
}
/** Release non-exclusive locks on XA PREPARE,
@@ -4334,6 +4475,12 @@ static bool lock_release_on_prepare_try(trx_t *trx)
DBUG_ASSERT(trx->state == TRX_STATE_PREPARED);
bool all_released= true;
+ mtr_t mtr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ mem_heap_t *heap= nullptr;
+ rec_offs_init(offsets_);
+
lock_sys.rd_lock(SRW_LOCK_CALL);
trx->mutex_lock();
@@ -4350,20 +4497,24 @@ static bool lock_release_on_prepare_try(trx_t *trx)
if (!lock->is_table())
{
ut_ad(!lock->index->table->is_temporary());
- bool supremum_bit = lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
- bool rec_granted_exclusive_not_gap =
+ bool supremum_bit= lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+ bool rec_granted_exclusive_not_gap=
lock->is_rec_granted_exclusive_not_gap();
if (!supremum_bit && rec_granted_exclusive_not_gap)
continue;
- auto &lock_hash= lock_sys.hash_get(lock->type_mode);
- auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+ if (UNIV_UNLIKELY(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))
+ continue; /* SPATIAL INDEX locking is broken. */
+ auto cell=
+ lock_sys.rec_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
auto latch= lock_sys_t::hash_table::latch(cell);
if (latch->try_acquire())
{
if (!rec_granted_exclusive_not_gap)
lock_rec_dequeue_from_page(lock, false);
else if (supremum_bit)
- lock_rec_unlock_supremum(*cell, lock);
+ lock_rec_unlock(*cell, lock, PAGE_HEAP_NO_SUPREMUM);
+ else
+ lock_rec_unlock_unmodified(*cell, lock, offsets, heap, mtr);
latch->release();
}
else
@@ -4396,6 +4547,8 @@ static bool lock_release_on_prepare_try(trx_t *trx)
lock_sys.rd_unlock();
trx->mutex_unlock();
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
return all_released;
}
@@ -4409,52 +4562,71 @@ void lock_release_on_prepare(trx_t *trx)
if (lock_release_on_prepare_try(trx))
return;
- LockMutexGuard g{SRW_LOCK_CALL};
- trx->mutex_lock();
+ mtr_t mtr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(offsets_);
- for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
- lock= prev)
{
- ut_ad(lock->trx == trx);
- prev= UT_LIST_GET_PREV(trx_locks, lock);
- if (!lock->is_table())
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx->mutex_lock();
+
+ for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+ lock= prev)
{
- ut_ad(!lock->index->table->is_temporary());
- if (!lock->is_rec_granted_exclusive_not_gap())
- lock_rec_dequeue_from_page(lock, false);
- else if (lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM))
+ ut_ad(lock->trx == trx);
+ prev= UT_LIST_GET_PREV(trx_locks, lock);
+ if (!lock->is_table())
{
- auto &lock_hash= lock_sys.hash_get(lock->type_mode);
- auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
- lock_rec_unlock_supremum(*cell, lock);
+ ut_ad(!lock->index->table->is_temporary());
+ if (!lock->is_rec_granted_exclusive_not_gap())
+ lock_rec_dequeue_from_page(lock, false);
+ else if (UNIV_UNLIKELY(lock->type_mode &
+ (LOCK_PREDICATE | LOCK_PRDT_PAGE)))
+ /* SPATIAL INDEX locking is broken. */;
+ else
+ {
+ auto cell= lock_sys.rec_hash.cell_get(lock->un_member.rec_lock.
+ page_id.fold());
+ if (lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM))
+ lock_rec_unlock(*cell, lock, PAGE_HEAP_NO_SUPREMUM);
+ else
+ {
+ ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
+ /* Insert-intention lock is valid for supremum for isolation
+ level > TRX_ISO_READ_COMMITTED */
+ lock->mode() == LOCK_X ||
+ !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+ lock_rec_unlock_unmodified(*cell, lock, offsets, heap, mtr);
+ }
+ }
}
else
- ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
- /* Insert-intention lock is valid for supremum for isolation
- level > TRX_ISO_READ_COMMITTED */
- lock->mode() == LOCK_X ||
- !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
- }
- else
- {
- ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
- ut_ad(!table->is_temporary());
- switch (lock->mode()) {
- case LOCK_IS:
- case LOCK_S:
- lock_table_dequeue(lock, false);
- break;
- case LOCK_IX:
- case LOCK_X:
- ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
- /* fall through */
- default:
- break;
+ {
+ ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+ ut_ad(!table->is_temporary());
+ switch (lock->mode()) {
+ case LOCK_IS:
+ case LOCK_S:
+ lock_table_dequeue(lock, false);
+ break;
+ case LOCK_IX:
+ case LOCK_X:
+ ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+ /* fall through */
+ default:
+ break;
+ }
}
}
}
trx->mutex_unlock();
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
}
/** Release locks on a table whose creation is being rolled back */
@@ -5414,47 +5586,43 @@ lock_rec_insert_check_and_lock(
return err;
}
-/*********************************************************************//**
-Creates an explicit record lock for a running transaction that currently only
-has an implicit lock on the record. The transaction instance must have a
-reference count > 0 so that it can't be committed and freed before this
-function has completed. */
-static
-bool
-lock_rec_convert_impl_to_expl_for_trx(
-/*==================================*/
- trx_t* trx, /*!< in/out: active transaction */
- const page_id_t id, /*!< in: page identifier */
- const rec_t* rec, /*!< in: user record on page */
- dict_index_t* index) /*!< in: index of record */
+/** Create an explicit record lock for a transaction that currently only
+has an implicit lock on the record.
+@param trx referenced, active transaction, or nullptr
+@param id page identifier
+@param rec record in the page
+@param index the index B-tree that the record belongs to
+@return trx, with the reference released */
+static trx_t *lock_rec_convert_impl_to_expl_for_trx(trx_t *trx,
+ const page_id_t id,
+ const rec_t *rec,
+ dict_index_t *index)
{
- if (!trx)
- return false;
-
- ut_ad(trx->is_referenced());
- ut_ad(page_rec_is_leaf(rec));
- ut_ad(!rec_is_metadata(rec, *index));
+ if (trx)
+ {
+ ut_ad(trx->is_referenced());
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
- DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx");
- ulint heap_no= page_rec_get_heap_no(rec);
+ ulint heap_no= page_rec_get_heap_no(rec);
- {
- LockGuard g{lock_sys.rec_hash, id};
- trx->mutex_lock();
- ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ {
+ LockGuard g{lock_sys.rec_hash, id};
+ trx->mutex_lock();
+ ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+
+ if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) &&
+ !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no,
+ trx))
+ lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id,
+ page_align(rec), heap_no, index, trx, true);
+ }
- if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) &&
- !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no,
- trx))
- lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id,
- page_align(rec), heap_no, index, trx, true);
+ trx->release_reference();
+ trx->mutex_unlock();
}
- trx->mutex_unlock();
- trx->release_reference();
-
- DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx");
- return false;
+ return trx;
}
@@ -5545,10 +5713,11 @@ should be created.
@param[in] rec record on the leaf page
@param[in] index the index of the record
@param[in] offsets rec_get_offsets(rec,index)
-@return whether caller_trx already holds an exclusive lock on rec */
+@return unsafe pointer to a transaction that held an exclusive lock on rec
+@retval nullptr if no transaction held an exclusive lock */
template<bool is_primary>
static
-bool
+const trx_t *
lock_rec_convert_impl_to_expl(
trx_t* caller_trx,
page_id_t id,
@@ -5572,10 +5741,10 @@ lock_rec_convert_impl_to_expl(
trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
if (trx_id == 0) {
- return false;
+ return nullptr;
}
if (UNIV_UNLIKELY(trx_id == caller_trx->id)) {
- return true;
+ return caller_trx;
}
trx = trx_sys.find(caller_trx, trx_id);
@@ -5586,7 +5755,7 @@ lock_rec_convert_impl_to_expl(
offsets);
if (trx == caller_trx) {
trx->release_reference();
- return true;
+ return trx;
}
ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id));
@@ -5631,11 +5800,18 @@ lock_clust_rec_modify_check_and_lock(
/* If a transaction has no explicit x-lock set on the record, set one
for it */
- if (lock_rec_convert_impl_to_expl<true>(thr_get_trx(thr),
- block->page.id(),
+ trx_t *trx = thr_get_trx(thr);
+ if (const trx_t *owner =
+ lock_rec_convert_impl_to_expl<true>(trx, block->page.id(),
rec, index, offsets)) {
- /* We already hold an implicit exclusive lock. */
- return DB_SUCCESS;
+ if (owner == trx) {
+ /* We already hold an exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ if (trx->snapshot_isolation && trx->read_view.is_open()) {
+ return DB_RECORD_CHANGED;
+ }
}
err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
@@ -5798,12 +5974,19 @@ lock_sec_rec_read_check_and_lock(
return DB_SUCCESS;
}
- if (!page_rec_is_supremum(rec)
- && lock_rec_convert_impl_to_expl<false>(
- trx, block->page.id(), rec, index, offsets)
- && gap_mode == LOCK_REC_NOT_GAP) {
- /* We already hold an implicit exclusive lock. */
- return DB_SUCCESS;
+ if (page_rec_is_supremum(rec)) {
+ } else if (const trx_t *owner =
+ lock_rec_convert_impl_to_expl<false>(trx, block->page.id(),
+ rec, index, offsets)) {
+ if (owner == trx) {
+ if (gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an exclusive lock. */
+ return DB_SUCCESS;
+ }
+ } else if (trx->snapshot_isolation
+ && trx->read_view.is_open()) {
+ return DB_RECORD_CHANGED;
+ }
}
#ifdef WITH_WSREP
@@ -5883,13 +6066,28 @@ lock_clust_rec_read_check_and_lock(
ulint heap_no = page_rec_get_heap_no(rec);
trx_t *trx = thr_get_trx(thr);
- if (!lock_table_has(trx, index->table, LOCK_X)
- && heap_no != PAGE_HEAP_NO_SUPREMUM
- && lock_rec_convert_impl_to_expl<true>(trx, id,
- rec, index, offsets)
- && gap_mode == LOCK_REC_NOT_GAP) {
- /* We already hold an implicit exclusive lock. */
- return DB_SUCCESS;
+ if (lock_table_has(trx, index->table, LOCK_X)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM) {
+ } else if (const trx_t *owner =
+ lock_rec_convert_impl_to_expl<true>(trx, id,
+ rec, index, offsets)) {
+ if (owner == trx) {
+ if (gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an exclusive lock. */
+ return DB_SUCCESS;
+ }
+ } else if (trx->snapshot_isolation
+ && trx->read_view.is_open()) {
+ return DB_RECORD_CHANGED;
+ }
+ }
+
+ if (heap_no > PAGE_HEAP_NO_SUPREMUM && gap_mode != LOCK_GAP
+ && trx->snapshot_isolation
+ && trx->read_view.is_open()
+ && !trx->read_view.changes_visible(
+ trx_read_trx_id(rec + row_trx_id_offset(rec, index)))) {
+ return DB_RECORD_CHANGED;
}
dberr_t err = lock_rec_lock(false, gap_mode | mode,
@@ -6656,6 +6854,7 @@ and less modified rows. Bit 0 is used to prefer orig_trx in case of a tie.
print(buf);
}
+ DBUG_EXECUTE_IF("innodb_deadlock_victim_self", victim= trx;);
ut_ad(victim->state == TRX_STATE_ACTIVE);
/* victim->lock.was_chosen_as_deadlock_victim must always be set before
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 9f39b303..ea717de2 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -69,9 +69,7 @@ log_t log_sys;
void log_t::set_capacity()
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
/* Margin for the free space in the smallest log, before a new query
step which modifies the database, is started */
@@ -134,7 +132,6 @@ bool log_t::create()
#endif
latch.SRW_LOCK_INIT(log_latch_key);
- init_lsn_lock();
last_checkpoint_lsn= FIRST_LSN;
log_capacity= 0;
@@ -143,7 +140,7 @@ bool log_t::create()
next_checkpoint_lsn= 0;
checkpoint_pending= false;
- buf_free= 0;
+ set_buf_free(0);
ut_ad(is_initialised());
#ifndef HAVE_PMEM
@@ -175,11 +172,13 @@ void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
ut_ad(is_opened());
if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file,
buf.data(), offset, buf.size()))
- ib::fatal() << "write(\"ib_logfile0\") returned " << err;
+ ib::fatal() << "write(\"ib_logfile0\") returned " << err
+ << ". Operating system error number "
+ << IF_WIN(GetLastError(), errno) << ".";
}
#ifdef HAVE_PMEM
-# include <libpmem.h>
+# include "cache.h"
/** Attempt to memory map a file.
@param file log file handle
@@ -236,12 +235,13 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
log.close();
mprotect(ptr, size_t(size), PROT_READ);
buf= static_cast<byte*>(ptr);
- max_buf_free= size;
+ max_buf_free= 1;
# if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
# endif
log_maybe_unbuffered= true;
log_buffered= false;
+ mtr_t::finisher_update();
return true;
}
}
@@ -276,6 +276,7 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
block_size);
#endif
+ mtr_t::finisher_update();
#ifdef HAVE_PMEM
checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size));
memset_aligned<64>(checkpoint_buf, 0, block_size);
@@ -311,9 +312,7 @@ void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted)
void log_t::create(lsn_t lsn) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_write_locked());
-#endif
+ ut_ad(latch_have_wr());
ut_ad(!recv_no_log_write);
ut_ad(is_latest());
ut_ad(this == &log_sys);
@@ -330,12 +329,12 @@ void log_t::create(lsn_t lsn) noexcept
{
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
memset_aligned<4096>(buf, 0, 4096);
- buf_free= START_OFFSET;
+ set_buf_free(START_OFFSET);
}
else
#endif
{
- buf_free= 0;
+ set_buf_free(0);
memset_aligned<4096>(flush_buf, 0, buf_size);
memset_aligned<4096>(buf, 0, buf_size);
}
@@ -464,8 +463,7 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
resize_lsn.store(1, std::memory_order_relaxed);
resize_target= 0;
resize_log.m_file=
- os_file_create_func(path.c_str(),
- OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ os_file_create_func(path.c_str(), OS_FILE_CREATE,
OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
if (success)
{
@@ -812,9 +810,7 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
@return the current log sequence number */
template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_write_locked());
-#endif
+ ut_ad(latch_have_wr());
ut_ad(!is_pmem());
ut_ad(!srv_read_only_mode);
@@ -930,7 +926,7 @@ wait and check if an already running write is covering the request.
void log_write_up_to(lsn_t lsn, bool durable,
const completion_callback *callback)
{
- ut_ad(!srv_read_only_mode || (log_sys.buf_free < log_sys.max_buf_free));
+ ut_ad(!srv_read_only_mode || log_sys.buf_free_ok());
ut_ad(lsn != LSN_MAX);
ut_ad(lsn != 0);
@@ -1067,7 +1063,7 @@ NOTE that this function may only be called while not holding
any synchronization objects except dict_sys.latch. */
void log_free_check()
{
- ut_ad(!lock_sys.is_writer());
+ ut_ad(!lock_sys.is_holder());
if (log_sys.check_for_checkpoint())
{
ut_ad(!recv_no_log_write);
@@ -1291,6 +1287,7 @@ log_print(
void log_t::close()
{
ut_ad(this == &log_sys);
+ ut_ad(!(buf_free & buf_free_LOCK));
if (!is_initialised()) return;
close_file();
@@ -1308,7 +1305,6 @@ void log_t::close()
#endif
latch.destroy();
- destroy_lsn_lock();
recv_sys.close();
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index e72f842f..6b6a6868 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -844,8 +844,7 @@ processed:
inside recv_sys_t::recover_deferred(). */
bool success;
handle= os_file_create(innodb_data_file_key, filename,
- OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT |
- OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_CREATE_SILENT,
OS_FILE_AIO, OS_DATA_FILE, false, &success);
}
space->add(filename, handle, size, false, false);
@@ -1714,7 +1713,7 @@ dberr_t recv_sys_t::find_checkpoint()
std::string path{get_log_file_path()};
bool success;
os_file_t file{os_file_create_func(path.c_str(),
- OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_OPEN,
OS_FILE_NORMAL, OS_LOG_FILE,
srv_read_only_mode, &success)};
if (file == OS_FILE_CLOSED)
@@ -1744,8 +1743,7 @@ dberr_t recv_sys_t::find_checkpoint()
{
path= get_log_file_path(LOG_FILE_NAME_PREFIX).append(std::to_string(i));
file= os_file_create_func(path.c_str(),
- OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT |
- OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_OPEN_SILENT,
OS_FILE_NORMAL, OS_LOG_FILE, true, &success);
if (file == OS_FILE_CLOSED)
break;
@@ -2520,11 +2518,9 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists)
noexcept
{
restart:
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked() ||
+ ut_ad(log_sys.latch_have_wr() ||
srv_operation == SRV_OPERATION_BACKUP ||
srv_operation == SRV_OPERATION_BACKUP_NO_DEFER);
-#endif
mysql_mutex_assert_owner(&mutex);
ut_ad(log_sys.next_checkpoint_lsn);
ut_ad(log_sys.is_latest());
@@ -4052,9 +4048,7 @@ static bool recv_scan_log(bool last_phase)
lsn_t rewound_lsn= 0;
for (ut_d(lsn_t source_offset= 0);;)
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
#ifdef UNIV_DEBUG
const bool wrap{source_offset + recv_sys.len == log_sys.file_size};
#endif
@@ -4120,9 +4114,10 @@ static bool recv_scan_log(bool last_phase)
const lsn_t end{recv_sys.file_checkpoint};
ut_ad(!end || end == recv_sys.lsn);
+ bool corrupt_fs= recv_sys.is_corrupt_fs();
mysql_mutex_unlock(&recv_sys.mutex);
- if (!end)
+ if (!end && !corrupt_fs)
{
recv_sys.set_corrupt_log();
sql_print_error("InnoDB: Missing FILE_CHECKPOINT(" LSN_PF
@@ -4448,9 +4443,7 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
static dberr_t recv_rename_files()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
dberr_t err= DB_SUCCESS;
@@ -4602,6 +4595,9 @@ read_only_recovery:
LSN_PF, recv_sys.lsn);
goto err_exit;
}
+ if (recv_sys.is_corrupt_fs()) {
+ goto err_exit;
+ }
ut_ad(recv_sys.file_checkpoint);
if (rewind) {
recv_sys.lsn = log_sys.next_checkpoint_lsn;
@@ -4640,9 +4636,9 @@ read_only_recovery:
do {
rescan = recv_scan_log(false);
- ut_ad(!recv_sys.is_corrupt_fs());
- if (recv_sys.is_corrupt_log()) {
+ if (recv_sys.is_corrupt_log() ||
+ recv_sys.is_corrupt_fs()) {
goto err_exit;
}
@@ -4730,7 +4726,7 @@ err_exit:
PROT_READ | PROT_WRITE);
#endif
}
- log_sys.buf_free = recv_sys.offset;
+ log_sys.set_buf_free(recv_sys.offset);
if (recv_needed_recovery
&& srv_operation <= SRV_OPERATION_EXPORT_RESTORED) {
/* Write a FILE_CHECKPOINT marker as the first thing,
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 01641f74..74d3adb2 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -37,6 +37,31 @@ Created 11/26/1995 Heikki Tuuri
#include "srv0start.h"
#include "log.h"
#include "mariadb_stats.h"
+#include "my_cpu.h"
+
+#ifdef HAVE_PMEM
+void (*mtr_t::commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>);
+#endif
+std::pair<lsn_t,mtr_t::page_flush_ahead> (*mtr_t::finisher)(mtr_t *, size_t);
+unsigned mtr_t::spin_wait_delay;
+
+void mtr_t::finisher_update()
+{
+ ut_ad(log_sys.latch_have_wr());
+#ifdef HAVE_PMEM
+ if (log_sys.is_pmem())
+ {
+ commit_logger= mtr_t::commit_log<true>;
+ finisher= spin_wait_delay
+ ? mtr_t::finish_writer<true,true> : mtr_t::finish_writer<false,true>;
+ return;
+ }
+ commit_logger= mtr_t::commit_log<false>;
+#endif
+ finisher=
+ (spin_wait_delay
+ ? mtr_t::finish_writer<true,false> : mtr_t::finish_writer<false,false>);
+}
void mtr_memo_slot_t::release() const
{
@@ -82,9 +107,7 @@ void mtr_memo_slot_t::release() const
inline buf_page_t *buf_pool_t::prepare_insert_into_flush_list(lsn_t lsn)
noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(recv_recovery_is_on() || log_sys.latch.is_locked());
-#endif
+ ut_ad(recv_recovery_is_on() || log_sys.latch_have_any());
ut_ad(lsn >= log_sys.last_checkpoint_lsn);
mysql_mutex_assert_owner(&flush_list_mutex);
static_assert(log_t::FIRST_LSN >= 2, "compatibility");
@@ -234,7 +257,14 @@ static void insert_imported(buf_block_t *block)
if (block->page.oldest_modification() <= 1)
{
log_sys.latch.rd_lock(SRW_LOCK_CALL);
- const lsn_t lsn= log_sys.last_checkpoint_lsn;
+ /* For unlogged mtrs (MTR_LOG_NO_REDO), we use the current system LSN. The
+ mtr that generated the LSN is either already committed or in mtr_t::commit.
+ Shared latch and relaxed atomics should be fine here as it is guaranteed
+ that both the current mtr and the mtr that generated the LSN would have
+ added the dirty pages to flush list before we access the minimum LSN during
+ checkpoint. log_checkpoint_low() acquires exclusive log_sys.latch before
+ commencing. */
+ const lsn_t lsn= log_sys.get_lsn();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_pool.insert_into_flush_list
(buf_pool.prepare_insert_into_flush_list(lsn), block, lsn);
@@ -310,12 +340,9 @@ void mtr_t::release()
inline lsn_t log_t::get_write_target() const
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_locked());
-#endif
- if (UNIV_LIKELY(buf_free < max_buf_free))
+ ut_ad(latch_have_any());
+ if (UNIV_LIKELY(buf_free_ok()))
return 0;
- ut_ad(!is_pmem());
/* The LSN corresponding to the end of buf is
write_lsn - (first_lsn & 4095) + buf_free,
but we use simpler arithmetics to return a smaller write target in
@@ -324,151 +351,161 @@ inline lsn_t log_t::get_write_target() const
return write_lsn + max_buf_free / 2;
}
-/** Commit a mini-transaction. */
-void mtr_t::commit()
+template<bool pmem>
+void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
{
- ut_ad(is_active());
- ut_ad(!is_inside_ibuf());
-
- /* This is a dirty read, for debugging. */
- ut_ad(!m_modifications || !recv_no_log_write);
- ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE);
- ut_ad(!m_latch_ex);
+ size_t modified= 0;
+ const lsn_t write_lsn= pmem ? 0 : log_sys.get_write_target();
- if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty()))
+ if (mtr->m_made_dirty)
{
- if (UNIV_UNLIKELY(!is_logged()))
+ auto it= mtr->m_memo.rbegin();
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ buf_page_t *const prev=
+ buf_pool.prepare_insert_into_flush_list(lsns.first);
+
+ while (it != mtr->m_memo.rend())
{
- release_unlogged();
- goto func_exit;
+ const mtr_memo_slot_t &slot= *it++;
+ if (slot.type & MTR_MEMO_MODIFY)
+ {
+ ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
+ slot.type == MTR_MEMO_PAGE_SX_MODIFY);
+ modified++;
+ buf_block_t *b= static_cast<buf_block_t*>(slot.object);
+ ut_ad(b->page.id() < end_page_id);
+ ut_d(const auto s= b->page.state());
+ ut_ad(s > buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <=
+ mtr->m_commit_lsn);
+ mach_write_to_8(b->page.frame + FIL_PAGE_LSN, mtr->m_commit_lsn);
+ if (UNIV_LIKELY_NULL(b->page.zip.data))
+ memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data,
+ FIL_PAGE_LSN + b->page.frame, 8);
+ buf_pool.insert_into_flush_list(prev, b, lsns.first);
+ }
}
- ut_ad(!srv_read_only_mode);
- std::pair<lsn_t,page_flush_ahead> lsns{do_write()};
- process_freed_pages();
- size_t modified= 0;
- const lsn_t write_lsn= log_sys.get_write_target();
+ ut_ad(modified);
+ buf_pool.flush_list_requests+= modified;
+ buf_pool.page_cleaner_wakeup();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (m_made_dirty)
+ if (mtr->m_latch_ex)
{
- auto it= m_memo.rbegin();
-
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ log_sys.latch.wr_unlock();
+ mtr->m_latch_ex= false;
+ }
+ else
+ log_sys.latch.rd_unlock();
- buf_page_t *const prev=
- buf_pool.prepare_insert_into_flush_list(lsns.first);
+ mtr->release();
+ }
+ else
+ {
+ if (mtr->m_latch_ex)
+ {
+ log_sys.latch.wr_unlock();
+ mtr->m_latch_ex= false;
+ }
+ else
+ log_sys.latch.rd_unlock();
- while (it != m_memo.rend())
- {
- const mtr_memo_slot_t &slot= *it++;
+ for (auto it= mtr->m_memo.rbegin(); it != mtr->m_memo.rend(); )
+ {
+ const mtr_memo_slot_t &slot= *it++;
+ ut_ad(slot.object);
+ switch (slot.type) {
+ case MTR_MEMO_S_LOCK:
+ static_cast<index_lock*>(slot.object)->s_unlock();
+ break;
+ case MTR_MEMO_SPACE_X_LOCK:
+ static_cast<fil_space_t*>(slot.object)->set_committed_size();
+ static_cast<fil_space_t*>(slot.object)->x_unlock();
+ break;
+ case MTR_MEMO_X_LOCK:
+ case MTR_MEMO_SX_LOCK:
+ static_cast<index_lock*>(slot.object)->
+ u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK);
+ break;
+ default:
+ buf_page_t *bpage= static_cast<buf_page_t*>(slot.object);
+ ut_d(const auto s=)
+ bpage->unfix();
if (slot.type & MTR_MEMO_MODIFY)
{
ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
slot.type == MTR_MEMO_PAGE_SX_MODIFY);
- modified++;
- buf_block_t *b= static_cast<buf_block_t*>(slot.object);
- ut_ad(b->page.id() < end_page_id);
- ut_d(const auto s= b->page.state());
- ut_ad(s > buf_page_t::FREED);
+ ut_ad(bpage->oldest_modification() > 1);
+ ut_ad(bpage->oldest_modification() < mtr->m_commit_lsn);
+ ut_ad(bpage->id() < end_page_id);
+ ut_ad(s >= buf_page_t::FREED);
ut_ad(s < buf_page_t::READ_FIX);
- ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <=
- m_commit_lsn);
- mach_write_to_8(b->page.frame + FIL_PAGE_LSN, m_commit_lsn);
- if (UNIV_LIKELY_NULL(b->page.zip.data))
- memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data,
- FIL_PAGE_LSN + b->page.frame, 8);
- buf_pool.insert_into_flush_list(prev, b, lsns.first);
+ ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <=
+ mtr->m_commit_lsn);
+ mach_write_to_8(bpage->frame + FIL_PAGE_LSN, mtr->m_commit_lsn);
+ if (UNIV_LIKELY_NULL(bpage->zip.data))
+ memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data,
+ FIL_PAGE_LSN + bpage->frame, 8);
+ modified++;
+ }
+ switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) {
+ case MTR_MEMO_PAGE_S_FIX:
+ bpage->lock.s_unlock();
+ continue;
+ case MTR_MEMO_PAGE_SX_FIX:
+ case MTR_MEMO_PAGE_X_FIX:
+ bpage->lock.u_or_x_unlock(latch == MTR_MEMO_PAGE_SX_FIX);
+ continue;
+ default:
+ ut_ad(latch == MTR_MEMO_BUF_FIX);
}
}
+ }
- ut_ad(modified);
- buf_pool.flush_list_requests+= modified;
- buf_pool.page_cleaner_wakeup();
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ buf_pool.add_flush_list_requests(modified);
+ mtr->m_memo.clear();
+ }
- if (m_latch_ex)
- {
- log_sys.latch.wr_unlock();
- m_latch_ex= false;
- }
- else
- log_sys.latch.rd_unlock();
+ mariadb_increment_pages_updated(modified);
- release();
- }
- else
- {
- if (m_latch_ex)
- {
- log_sys.latch.wr_unlock();
- m_latch_ex= false;
- }
- else
- log_sys.latch.rd_unlock();
+ if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
+ buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
- for (auto it= m_memo.rbegin(); it != m_memo.rend(); )
- {
- const mtr_memo_slot_t &slot= *it++;
- ut_ad(slot.object);
- switch (slot.type) {
- case MTR_MEMO_S_LOCK:
- static_cast<index_lock*>(slot.object)->s_unlock();
- break;
- case MTR_MEMO_SPACE_X_LOCK:
- static_cast<fil_space_t*>(slot.object)->set_committed_size();
- static_cast<fil_space_t*>(slot.object)->x_unlock();
- break;
- case MTR_MEMO_X_LOCK:
- case MTR_MEMO_SX_LOCK:
- static_cast<index_lock*>(slot.object)->
- u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK);
- break;
- default:
- buf_page_t *bpage= static_cast<buf_page_t*>(slot.object);
- ut_d(const auto s=)
- bpage->unfix();
- if (slot.type & MTR_MEMO_MODIFY)
- {
- ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
- slot.type == MTR_MEMO_PAGE_SX_MODIFY);
- ut_ad(bpage->oldest_modification() > 1);
- ut_ad(bpage->oldest_modification() < m_commit_lsn);
- ut_ad(bpage->id() < end_page_id);
- ut_ad(s >= buf_page_t::FREED);
- ut_ad(s < buf_page_t::READ_FIX);
- ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <=
- m_commit_lsn);
- mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn);
- if (UNIV_LIKELY_NULL(bpage->zip.data))
- memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data,
- FIL_PAGE_LSN + bpage->frame, 8);
- modified++;
- }
- switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) {
- case MTR_MEMO_PAGE_S_FIX:
- bpage->lock.s_unlock();
- continue;
- case MTR_MEMO_PAGE_SX_FIX:
- case MTR_MEMO_PAGE_X_FIX:
- bpage->lock.u_or_x_unlock(latch == MTR_MEMO_PAGE_SX_FIX);
- continue;
- default:
- ut_ad(latch == MTR_MEMO_BUF_FIX);
- }
- }
- }
+ if (!pmem && UNIV_UNLIKELY(write_lsn != 0))
+ log_write_up_to(write_lsn, false);
+}
- buf_pool.add_flush_list_requests(modified);
- m_memo.clear();
- }
+/** Commit a mini-transaction. */
+void mtr_t::commit()
+{
+ ut_ad(is_active());
+ ut_ad(!is_inside_ibuf());
- mariadb_increment_pages_updated(modified);
+ /* This is a dirty read, for debugging. */
+ ut_ad(!m_modifications || !recv_no_log_write);
+ ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE);
+ ut_ad(!m_latch_ex);
- if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
- buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
+ if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty()))
+ {
+ if (UNIV_UNLIKELY(!is_logged()))
+ {
+ release_unlogged();
+ goto func_exit;
+ }
- if (UNIV_UNLIKELY(write_lsn != 0))
- log_write_up_to(write_lsn, false);
+ ut_ad(!srv_read_only_mode);
+ std::pair<lsn_t,page_flush_ahead> lsns{do_write()};
+ process_freed_pages();
+#ifdef HAVE_PMEM
+ commit_logger(this, lsns);
+#else
+ commit_log<false>(this, lsns);
+#endif
}
else
{
@@ -513,10 +550,8 @@ void mtr_t::rollback_to_savepoint(ulint begin, ulint end)
/** Set create_lsn. */
inline void fil_space_t::set_create_lsn(lsn_t lsn)
{
-#ifndef SUX_LOCK_GENERIC
/* Concurrent log_checkpoint_low() must be impossible. */
- ut_ad(latch.is_write_locked());
-#endif
+ ut_ad(latch.have_wr());
create_lsn= lsn;
}
@@ -529,7 +564,6 @@ void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
ut_ad(!is_inside_ibuf());
ut_ad(!high_level_read_only);
ut_ad(m_modifications);
- ut_ad(m_made_dirty);
ut_ad(!m_memo.empty());
ut_ad(!recv_recovery_is_on());
ut_ad(m_log_mode == MTR_LOG_ALL);
@@ -554,9 +588,7 @@ void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
os_file_truncate(space.chain.start->name, space.chain.start->handle,
os_offset_t{size} << srv_page_size_shift, true);
@@ -713,9 +745,7 @@ This is to be used at log_checkpoint().
@return current LSN */
ATTRIBUTE_COLD lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_write_locked());
-#endif
+ ut_ad(log_sys.latch_have_wr());
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
ut_ad(m_log_mode == MTR_LOG_ALL);
@@ -870,13 +900,111 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn)
? ". Shutdown is in progress" : "");
}
-/** Wait in append_prepare() for buffer to become available
-@param lsn log sequence number to write up to
-@param ex whether log_sys.latch is exclusively locked */
-ATTRIBUTE_COLD void log_t::append_prepare_wait(lsn_t lsn, bool ex) noexcept
+static ATTRIBUTE_NOINLINE void lsn_delay(size_t delay, size_t mult) noexcept
+{
+ delay*= mult * 2; // GCC 13.2.0 -O2 targeting AMD64 wants to unroll twice
+ HMT_low();
+ do
+ MY_RELAX_CPU();
+ while (--delay);
+ HMT_medium();
+}
+
+#if defined __clang_major__ && __clang_major__ < 10
+/* Only clang-10 introduced support for asm goto */
+#elif defined __APPLE__
+/* At least some versions of Apple Xcode do not support asm goto */
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if SIZEOF_SIZE_T == 8
+# define LOCK_TSET \
+ __asm__ goto("lock btsq $63, %0\n\t" "jnc %l1" \
+ : : "m"(buf_free) : "cc", "memory" : got)
+# else
+# define LOCK_TSET \
+ __asm__ goto("lock btsl $31, %0\n\t" "jnc %l1" \
+ : : "m"(buf_free) : "cc", "memory" : got)
+# endif
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+# if SIZEOF_SIZE_T == 8
+# define LOCK_TSET \
+ if (!_interlockedbittestandset64 \
+ (reinterpret_cast<volatile LONG64*>(&buf_free), 63)) return
+# else
+# define LOCK_TSET \
+ if (!_interlockedbittestandset \
+ (reinterpret_cast<volatile long*>(&buf_free), 31)) return
+# endif
+#endif
+
+#ifdef LOCK_TSET
+ATTRIBUTE_NOINLINE
+void log_t::lsn_lock_bts() noexcept
+{
+ LOCK_TSET;
+ {
+ const size_t m= mtr_t::spin_wait_delay;
+ constexpr size_t DELAY= 10, MAX_ITERATIONS= 10;
+ for (size_t delay_count= DELAY, delay_iterations= 1;;
+ lsn_delay(delay_iterations, m))
+ {
+ if (!(buf_free.load(std::memory_order_relaxed) & buf_free_LOCK))
+ LOCK_TSET;
+ if (!delay_count);
+ else if (delay_iterations < MAX_ITERATIONS)
+ delay_count= DELAY, delay_iterations++;
+ else
+ delay_count--;
+ }
+ }
+
+# ifdef __GNUC__
+ got:
+ return;
+# endif
+}
+
+inline
+#else
+ATTRIBUTE_NOINLINE
+#endif
+size_t log_t::lock_lsn() noexcept
+{
+#ifdef LOCK_TSET
+ lsn_lock_bts();
+ return ~buf_free_LOCK & buf_free.load(std::memory_order_relaxed);
+# undef LOCK_TSET
+#else
+ size_t b= buf_free.fetch_or(buf_free_LOCK, std::memory_order_acquire);
+ if (b & buf_free_LOCK)
+ {
+ const size_t m= mtr_t::spin_wait_delay;
+ constexpr size_t DELAY= 10, MAX_ITERATIONS= 10;
+ for (size_t delay_count= DELAY, delay_iterations= 1;
+ ((b= buf_free.load(std::memory_order_relaxed)) & buf_free_LOCK) ||
+ (buf_free_LOCK & (b= buf_free.fetch_or(buf_free_LOCK,
+ std::memory_order_acquire)));
+ lsn_delay(delay_iterations, m))
+ if (!delay_count);
+ else if (delay_iterations < MAX_ITERATIONS)
+ delay_count= DELAY, delay_iterations++;
+ else
+ delay_count--;
+ }
+ return b;
+#endif
+}
+
+template<bool spin>
+ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn)
+ noexcept
{
waits++;
- unlock_lsn();
+ ut_ad(buf_free.load(std::memory_order_relaxed) ==
+ (spin ? (b | buf_free_LOCK) : b));
+ if (spin)
+ buf_free.store(b, std::memory_order_release);
+ else
+ lsn_lock.wr_unlock();
if (ex)
latch.wr_unlock();
@@ -890,51 +1018,57 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(lsn_t lsn, bool ex) noexcept
else
latch.rd_lock(SRW_LOCK_CALL);
- lock_lsn();
+ if (spin)
+ return lock_lsn();
+
+ lsn_lock.wr_lock();
+ return buf_free.load(std::memory_order_relaxed);
}
/** Reserve space in the log buffer for appending data.
+@tparam spin whether to use the spin-only lock_lsn()
@tparam pmem log_sys.is_pmem()
@param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */
-template<bool pmem>
+template<bool spin,bool pmem>
inline
std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_locked());
-# ifndef _WIN32 // there is no accurate is_write_locked() on SRWLOCK
- ut_ad(ex == latch.is_write_locked());
-# endif
-#endif
+ ut_ad(ex ? latch_have_wr() : latch_have_rd());
ut_ad(pmem == is_pmem());
- lock_lsn();
+ if (!spin)
+ lsn_lock.wr_lock();
+ size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)};
write_to_buf++;
const lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size};
- size_t b{buf_free};
if (UNIV_UNLIKELY(pmem
? (end_lsn -
get_flushed_lsn(std::memory_order_relaxed)) > capacity()
: b + size >= buf_size))
- {
- append_prepare_wait(l, ex);
- b= buf_free;
- }
+ b= append_prepare_wait<spin>(b, ex, l);
- lsn.store(end_lsn, std::memory_order_relaxed);
size_t new_buf_free= b + size;
if (pmem && new_buf_free >= file_size)
new_buf_free-= size_t(capacity());
- buf_free= new_buf_free;
- unlock_lsn();
+
+ lsn.store(end_lsn, std::memory_order_relaxed);
if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity))
- set_check_for_checkpoint();
+ set_check_for_checkpoint(true);
+
+ byte *our_buf= buf;
+ if (spin)
+ buf_free.store(new_buf_free, std::memory_order_release);
+ else
+ {
+ buf_free.store(new_buf_free, std::memory_order_relaxed);
+ lsn_lock.wr_unlock();
+ }
- return {l, &buf[b]};
+ return {l, our_buf + b};
}
/** Finish appending data to the log.
@@ -942,9 +1076,7 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
@return whether buf_flush_ahead() will have to be invoked */
static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(log_sys.latch.is_locked());
-#endif
+ ut_ad(log_sys.latch_have_any());
const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
@@ -1009,9 +1141,7 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
ut_ad(!recv_no_log_write);
ut_ad(is_logged());
ut_ad(m_log.size());
-#ifndef SUX_LOCK_GENERIC
- ut_ad(!m_latch_ex || log_sys.latch.is_write_locked());
-#endif
+ ut_ad(!m_latch_ex || log_sys.latch_have_wr());
#ifndef DBUG_OFF
do
@@ -1069,9 +1199,7 @@ func_exit:
inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
size_t seq) noexcept
{
-#ifndef SUX_LOCK_GENERIC
- ut_ad(latch.is_locked());
-#endif
+ ut_ad(latch_have_any());
if (UNIV_LIKELY_NULL(resize_buf))
{
@@ -1176,50 +1304,47 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
}
}
+template<bool spin,bool pmem>
std::pair<lsn_t,mtr_t::page_flush_ahead>
-mtr_t::finish_write(size_t len)
+mtr_t::finish_writer(mtr_t *mtr, size_t len)
{
+ ut_ad(log_sys.is_latest());
ut_ad(!recv_no_log_write);
- ut_ad(is_logged());
-#ifndef SUX_LOCK_GENERIC
-# ifndef _WIN32 // there is no accurate is_write_locked() on SRWLOCK
- ut_ad(m_latch_ex == log_sys.latch.is_write_locked());
-# endif
-#endif
+ ut_ad(mtr->is_logged());
+ ut_ad(mtr->m_latch_ex ? log_sys.latch_have_wr() : log_sys.latch_have_rd());
- const size_t size{m_commit_lsn ? 5U + 8U : 5U};
- std::pair<lsn_t, byte*> start;
+ const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U};
+ std::pair<lsn_t, byte*> start=
+ log_sys.append_prepare<spin,pmem>(len, mtr->m_latch_ex);
- if (!log_sys.is_pmem())
+ if (!pmem)
{
- start= log_sys.append_prepare<false>(len, m_latch_ex);
- m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
+ mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{ log_sys.append(start.second, b->begin(), b->used()); return true; });
#ifdef HAVE_PMEM
write_trailer:
#endif
*start.second++= log_sys.get_sequence_bit(start.first + len - size);
- if (m_commit_lsn)
+ if (mtr->m_commit_lsn)
{
- mach_write_to_8(start.second, m_commit_lsn);
- m_crc= my_crc32c(m_crc, start.second, 8);
+ mach_write_to_8(start.second, mtr->m_commit_lsn);
+ mtr->m_crc= my_crc32c(mtr->m_crc, start.second, 8);
start.second+= 8;
}
- mach_write_to_4(start.second, m_crc);
+ mach_write_to_4(start.second, mtr->m_crc);
start.second+= 4;
}
#ifdef HAVE_PMEM
else
{
- start= log_sys.append_prepare<true>(len, m_latch_ex);
if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size]))
{
- m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
+ mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{ log_sys.append(start.second, b->begin(), b->used()); return true; });
goto write_trailer;
}
- m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
+ mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{
size_t size{b->used()};
const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second);
@@ -1242,14 +1367,14 @@ mtr_t::finish_write(size_t len)
byte tail[5 + 8];
tail[0]= log_sys.get_sequence_bit(start.first + len - size);
- if (m_commit_lsn)
+ if (mtr->m_commit_lsn)
{
- mach_write_to_8(tail + 1, m_commit_lsn);
- m_crc= my_crc32c(m_crc, tail + 1, 8);
- mach_write_to_4(tail + 9, m_crc);
+ mach_write_to_8(tail + 1, mtr->m_commit_lsn);
+ mtr->m_crc= my_crc32c(mtr->m_crc, tail + 1, 8);
+ mach_write_to_4(tail + 9, mtr->m_crc);
}
else
- mach_write_to_4(tail + 1, m_crc);
+ mach_write_to_4(tail + 1, mtr->m_crc);
::memcpy(start.second, tail, size_left);
::memcpy(log_sys.buf + log_sys.START_OFFSET, tail + size_left,
@@ -1258,12 +1383,14 @@ mtr_t::finish_write(size_t len)
((size >= size_left) ? log_sys.START_OFFSET : log_sys.file_size) +
(size - size_left);
}
+#else
+ static_assert(!pmem, "");
#endif
log_sys.resize_write(start.first, start.second, len, size);
- m_commit_lsn= start.first + len;
- return {start.first, log_close(m_commit_lsn)};
+ mtr->m_commit_lsn= start.first + len;
+ return {start.first, log_close(mtr->m_commit_lsn)};
}
bool mtr_t::have_x_latch(const buf_block_t &block) const
@@ -1385,7 +1512,7 @@ void mtr_t::upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch)
ut_ad(slot.type == MTR_MEMO_BUF_FIX);
buf_block_t *block= static_cast<buf_block_t*>(slot.object);
ut_d(const auto state= block->page.state());
- ut_ad(state > buf_page_t::UNFIXED);
+ ut_ad(state > buf_page_t::FREED);
ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);
static_assert(int{MTR_MEMO_PAGE_S_FIX} == int{RW_S_LATCH}, "");
static_assert(int{MTR_MEMO_PAGE_X_FIX} == int{RW_X_LATCH}, "");
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 31bec346..3293db12 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -200,17 +200,10 @@ os_file_handle_error_cond_exit(
bool on_error_silent);
/** Does error handling when a file operation fails.
-@param[in] name name of a file or NULL
-@param[in] operation operation name that failed
-@return true if we should retry the operation */
-static
-bool
-os_file_handle_error(
- const char* name,
- const char* operation)
+@param operation name of operation that failed */
+static void os_file_handle_error(const char *operation)
{
- /* Exit in case of unknown error */
- return(os_file_handle_error_cond_exit(name, operation, true, false));
+ os_file_handle_error_cond_exit(nullptr, operation, true, false);
}
/** Does error handling when a file operation fails.
@@ -327,6 +320,12 @@ private:
ssize_t m_n;
/** Offset from where to read/write */
os_offset_t m_offset;
+
+ /** Do the read/write
+ @param request The IO context and type
+ @param n Number of bytes to read/write
+ @return the number of bytes read/written or negative value on error */
+ ssize_t execute_low(const IORequest& request, ssize_t n);
};
#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
@@ -680,28 +679,46 @@ os_file_create_subdirs_if_needed(
/** Do the read/write
@param[in] request The IO context and type
+@param[in] n Number of bytes to read/write
+@return the number of bytes read/written or negative value on error */
+ssize_t
+SyncFileIO::execute_low(const IORequest& request, ssize_t n)
+{
+ ut_ad(n > 0);
+ ut_ad(size_t(n) <= os_file_request_size_max);
+
+ if (request.is_read())
+ return IF_WIN(tpool::pread(m_fh, m_buf, n, m_offset), pread(m_fh, m_buf, n, m_offset));
+ return IF_WIN(tpool::pwrite(m_fh, m_buf, n, m_offset), pwrite(m_fh, m_buf, n, m_offset));
+}
+
+/** Do the read/write
+@param[in] request The IO context and type
@return the number of bytes read/written or negative value on error */
ssize_t
SyncFileIO::execute(const IORequest& request)
{
- ssize_t n_bytes;
+ ssize_t n_bytes= 0;
+ ut_ad(m_n > 0);
- if (request.is_read()) {
-#ifdef _WIN32
- n_bytes = tpool::pread(m_fh, m_buf, m_n, m_offset);
-#else
- n_bytes = pread(m_fh, m_buf, m_n, m_offset);
-#endif
- } else {
- ut_ad(request.is_write());
-#ifdef _WIN32
- n_bytes = tpool::pwrite(m_fh, m_buf, m_n, m_offset);
-#else
- n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
-#endif
- }
+ while (size_t(m_n) > os_file_request_size_max)
+ {
+ ssize_t n_partial_bytes= execute_low(request, os_file_request_size_max);
+ if (n_partial_bytes < 0)
+ return n_partial_bytes;
+ n_bytes+= n_partial_bytes;
+ if (n_partial_bytes != os_file_request_size_max)
+ return n_bytes;
+ advance(os_file_request_size_max);
+ }
- return(n_bytes);
+ if (ssize_t n= execute_low(request, m_n))
+ {
+ if (n < 0)
+ return n;
+ n_bytes += n;
+ }
+ return n_bytes;
}
#ifndef _WIN32
@@ -942,7 +959,7 @@ os_file_flush_func(
ib::error() << "The OS said file flush did not succeed";
- os_file_handle_error(NULL, "flush");
+ os_file_handle_error("flush");
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -965,7 +982,7 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success)
@@ -974,76 +991,52 @@ os_file_create_simple_func(
*success = false;
- int create_flag;
- const char* mode_str __attribute__((unused));
-
- ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
- ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
-
- if (create_mode == OS_FILE_OPEN) {
- mode_str = "OPEN";
-
- if (access_type == OS_FILE_READ_ONLY) {
-
- create_flag = O_RDONLY;
-
- } else if (read_only) {
-
- create_flag = O_RDONLY;
-
- } else {
- create_flag = O_RDWR;
- }
-
- } else if (read_only) {
-
- mode_str = "OPEN";
- create_flag = O_RDONLY;
+ int create_flag = O_RDONLY | O_CLOEXEC;
+ if (read_only) {
} else if (create_mode == OS_FILE_CREATE) {
-
- mode_str = "CREATE";
- create_flag = O_RDWR | O_CREAT | O_EXCL;
-
- } else if (create_mode == OS_FILE_CREATE_PATH) {
-
- mode_str = "CREATE PATH";
- /* Create subdirs along the path if needed. */
-
- *success = os_file_create_subdirs_if_needed(name);
-
- if (!*success) {
-
- ib::error()
- << "Unable to create subdirectories '"
- << name << "'";
-
- return(OS_FILE_CLOSED);
- }
-
- create_flag = O_RDWR | O_CREAT | O_EXCL;
- create_mode = OS_FILE_CREATE;
+ create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
} else {
-
- ib::error()
- << "Unknown file create mode ("
- << create_mode
- << " for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
+ ut_ad(create_mode == OS_FILE_OPEN);
+ if (access_type != OS_FILE_READ_ONLY) {
+ create_flag = O_RDWR | O_CLOEXEC;
+ }
}
bool retry;
+#ifdef O_DIRECT
+ int direct_flag = 0;
+ /* This function is always called for data files, we should disable
+ OS caching (O_DIRECT) here as we do in os_file_create_func(), so
+ we open the same file in the same mode, see man page of open(2). */
+ switch (srv_file_flush_method) {
+ case SRV_O_DSYNC:
+ case SRV_O_DIRECT:
+ case SRV_O_DIRECT_NO_FSYNC:
+ direct_flag = O_DIRECT;
+ break;
+ }
+#else
+ constexpr int direct_flag = 0;
+#endif
+
do {
- file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+ file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) {
+#ifdef O_DIRECT
+ if (direct_flag && errno == EINVAL) {
+ direct_flag = 0;
+ retry = true;
+ continue;
+ }
+#endif
*success = false;
- retry = os_file_handle_error(
+ retry = os_file_handle_error_no_exit(
name,
- create_mode == OS_FILE_OPEN
- ? "open" : "create");
+ create_mode == OS_FILE_CREATE
+ ? "create" : "open", false);
} else {
*success = true;
retry = false;
@@ -1051,24 +1044,6 @@ os_file_create_simple_func(
} while (retry);
-#ifdef HAVE_FCNTL_DIRECT
- /* This function is always called for data files, we should disable
- OS caching (O_DIRECT) here as we do in os_file_create_func(), so
- we open the same file in the same mode, see man page of open(2). */
- if (!srv_read_only_mode && *success) {
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
- os_file_set_nocache(file, name, mode_str);
- break;
- default:
- break;
- }
- }
-#endif
-
-#ifndef _WIN32
if (!read_only
&& *success
&& access_type == OS_FILE_READ_WRITE
@@ -1079,7 +1054,6 @@ os_file_create_simple_func(
close(file);
file = -1;
}
-#endif /* !_WIN32 */
return(file);
}
@@ -1113,6 +1087,61 @@ os_file_create_directory(
return(true);
}
+#ifdef O_DIRECT
+# ifdef __linux__ /* same guard as at every call site of this function */
+/** Note that the log file uses buffered I/O.
+Clears log_sys.log_maybe_unbuffered, sets log_sys.log_buffered and
+resets the log block size to 512 bytes. */
+static ATTRIBUTE_COLD void os_file_log_buffered()
+{
+  log_sys.log_maybe_unbuffered= false;
+  log_sys.log_buffered= true;
+  log_sys.set_block_size(512);
+}
+# endif
+
+/** Determine whether the log file may work with unbuffered I/O.
+On Linux, this additionally sets the log block size
+(log_sys.set_block_size()) to the physical block size that sysfs
+reports for the device st.st_dev; a size that is not a power of 2
+between 64 and 4096 is rejected.
+@param st   file status of the log file
+@return whether the log file may work with unbuffered I/O */
+static ATTRIBUTE_COLD bool os_file_log_maybe_unbuffered(const struct stat &st)
+{
+  MSAN_STAT_WORKAROUND(&st);
+# ifdef __linux__
+  char b[20 + sizeof "/sys/dev/block/" ":" "/../queue/physical_block_size"];
+  if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/queue/physical_block_size",
+               major(st.st_dev), minor(st.st_dev)) >=
+      static_cast<int>(sizeof b))
+    return false;
+  /* O_CLOEXEC: never let a concurrently spawned child process inherit
+  this short-lived descriptor. */
+  int f= open(b, O_RDONLY | O_CLOEXEC);
+  if (f == -1)
+  {
+    /* Fall back to the queue directory one level up. */
+    if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/../queue/"
+                 "physical_block_size",
+                 major(st.st_dev), minor(st.st_dev)) >=
+        static_cast<int>(sizeof b))
+      return false;
+    f= open(b, O_RDONLY | O_CLOEXEC);
+  }
+  unsigned long s= 0;
+  if (f != -1)
+  {
+    ssize_t l= read(f, b, sizeof b);
+    /* Accept only a complete, newline-terminated decimal number. */
+    if (l > 0 && size_t(l) < sizeof b && b[l - 1] == '\n')
+    {
+      char *end= b;
+      s= strtoul(b, &end, 10);
+      if (b == end || *end != '\n')
+        s = 0;
+    }
+    close(f);
+  }
+  /* s == 0 (nothing usable was read) is rejected here as well. */
+  if (s > 4096 || s < 64 || !ut_is_2pow(s))
+    return false;
+  log_sys.set_block_size(uint32_t(s));
+# else
+  constexpr unsigned long s= 4096;
+# endif
+
+  /* The file size must be a multiple of the block size. */
+  return !(st.st_size & (s - 1));
+}
+#endif
+
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@@ -1133,73 +1162,83 @@ Opens an existing file or creates a new.
pfs_os_file_t
os_file_create_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint purpose,
ulint type,
bool read_only,
bool* success)
{
- bool on_error_no_exit;
- bool on_error_silent;
-
*success = false;
DBUG_EXECUTE_IF(
"ib_create_table_fail_disk_full",
- *success = false;
errno = ENOSPC;
return(OS_FILE_CLOSED);
);
- int create_flag;
- const char* mode_str __attribute__((unused));
-
- on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
- ? true : false;
- on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
- ? true : false;
+ int create_flag;
- create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
- | OS_FILE_ON_ERROR_SILENT));
-
- if (create_mode == OS_FILE_OPEN
- || create_mode == OS_FILE_OPEN_RAW
- || create_mode == OS_FILE_OPEN_RETRY) {
-
- mode_str = "OPEN";
-
- create_flag = read_only ? O_RDONLY : O_RDWR;
-
- } else if (read_only) {
-
- mode_str = "OPEN";
-
- create_flag = O_RDONLY;
-
- } else if (create_mode == OS_FILE_CREATE) {
-
- mode_str = "CREATE";
- create_flag = O_RDWR | O_CREAT | O_EXCL;
+ if (read_only) {
+ create_flag = O_RDONLY | O_CLOEXEC;
+ } else if (create_mode == OS_FILE_CREATE
+ || create_mode == OS_FILE_CREATE_SILENT) {
+ create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
+ } else {
+ ut_ad(create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_SILENT
+ || create_mode == OS_FILE_OPEN_RETRY
+ || create_mode == OS_FILE_OPEN_RETRY_SILENT
+ || create_mode == OS_FILE_OPEN_RAW);
+ create_flag = O_RDWR | O_CLOEXEC;
+ }
- } else if (create_mode == OS_FILE_OVERWRITE) {
+#ifdef O_DIRECT
+ struct stat st;
+ ut_a(type == OS_LOG_FILE
+ || type == OS_DATA_FILE || type == OS_DATA_FILE_NO_O_DIRECT);
+ int direct_flag = 0;
- mode_str = "OVERWRITE";
- create_flag = O_RDWR | O_CREAT | O_TRUNC;
+ if (type == OS_DATA_FILE) {
+ switch (srv_file_flush_method) {
+ case SRV_O_DSYNC:
+ case SRV_O_DIRECT:
+ case SRV_O_DIRECT_NO_FSYNC:
+ direct_flag = O_DIRECT;
+ break;
+ default:
+ break;
+ }
+# ifdef __linux__
+ } else if (type != OS_LOG_FILE) {
+ } else if (log_sys.log_buffered) {
+ skip_o_direct:
+ os_file_log_buffered();
+ } else if (create_mode != OS_FILE_CREATE
+ && create_mode != OS_FILE_CREATE_SILENT
+ && !log_sys.is_opened()) {
+ if (stat(name, &st)) {
+ if (errno == ENOENT) {
+ if (create_mode & OS_FILE_ON_ERROR_SILENT) {
+ goto not_found;
+ }
+ sql_print_error(
+ "InnoDB: File %s was not found", name);
+ goto not_found;
+ }
+ goto skip_o_direct;
+ }
- } else {
- ib::error()
- << "Unknown file create mode (" << create_mode << ")"
- << " for file '" << name << "'";
+ if (!os_file_log_maybe_unbuffered(st)) {
+ goto skip_o_direct;
+ }
- return(OS_FILE_CLOSED);
+ direct_flag = O_DIRECT;
+ log_sys.log_maybe_unbuffered= true;
+# endif
}
-
-#ifdef HAVE_FCNTL_DIRECT
- ut_a(type == OS_LOG_FILE
- || type == OS_DATA_FILE
- || type == OS_DATA_FILE_NO_O_DIRECT);
#else
ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
+ constexpr int direct_flag = 0;
#endif
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
@@ -1216,115 +1255,66 @@ os_file_create_func(
}
os_file_t file;
- bool retry;
- do {
- file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+ for (;;) {
+ file = open(name, create_flag | direct_flag, os_innodb_umask);
if (file == -1) {
- const char* operation;
-
- operation = (create_mode == OS_FILE_CREATE
- && !read_only) ? "create" : "open";
-
- *success = false;
-
- if (on_error_no_exit) {
- retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
- } else {
- retry = os_file_handle_error(name, operation);
+#ifdef O_DIRECT
+ if (direct_flag && errno == EINVAL) {
+ direct_flag = 0;
+# ifdef __linux__
+ if (type == OS_LOG_FILE) {
+ os_file_log_buffered();
+ }
+# endif
+ if (create_mode == OS_FILE_CREATE
+ || create_mode == OS_FILE_CREATE_SILENT) {
+ /* Linux may create the file
+ before rejecting the O_DIRECT. */
+ unlink(name);
+ }
+ continue;
+ }
+#endif
+ if (!os_file_handle_error_no_exit(
+ name, (create_flag & O_CREAT)
+ ? "create" : "open",
+ create_mode & OS_FILE_ON_ERROR_SILENT)) {
+ break;
}
} else {
*success = true;
- retry = false;
+ break;
}
-
- } while (retry);
+ }
if (!*success) {
- return file;
+#ifdef __linux__
+not_found:
+#endif
+ return OS_FILE_CLOSED;
}
-#ifdef HAVE_FCNTL_DIRECT
- if (type == OS_DATA_FILE) {
- switch (srv_file_flush_method) {
- case SRV_O_DSYNC:
- case SRV_O_DIRECT:
- case SRV_O_DIRECT_NO_FSYNC:
-# ifdef __linux__
-use_o_direct:
-# endif
- os_file_set_nocache(file, name, mode_str);
- break;
- default:
- break;
- }
- }
-# ifdef __linux__
- else if (type == OS_LOG_FILE && !log_sys.is_opened()) {
- struct stat st;
- char b[20 + sizeof "/sys/dev/block/" ":"
- "/../queue/physical_block_size"];
- int f;
- if (fstat(file, &st)) {
- goto skip_o_direct;
- }
- MSAN_STAT_WORKAROUND(&st);
- if (snprintf(b, sizeof b,
- "/sys/dev/block/%u:%u/queue/physical_block_size",
- major(st.st_dev), minor(st.st_dev))
- >= static_cast<int>(sizeof b)) {
- goto skip_o_direct;
- }
- if ((f = open(b, O_RDONLY)) == -1) {
- if (snprintf(b, sizeof b,
- "/sys/dev/block/%u:%u/../queue/"
- "physical_block_size",
- major(st.st_dev), minor(st.st_dev))
- >= static_cast<int>(sizeof b)) {
- goto skip_o_direct;
- }
- f = open(b, O_RDONLY);
- }
- if (f != -1) {
- ssize_t l = read(f, b, sizeof b);
- unsigned long s = 0;
-
- if (l > 0 && static_cast<size_t>(l) < sizeof b
- && b[l - 1] == '\n') {
- char* end = b;
- s = strtoul(b, &end, 10);
- if (b == end || *end != '\n') {
- s = 0;
- }
- }
- close(f);
- if (s > 4096 || s < 64 || !ut_is_2pow(s)) {
- goto skip_o_direct;
- }
- log_sys.log_maybe_unbuffered= true;
- log_sys.set_block_size(uint32_t(s));
- if (!log_sys.log_buffered && !(st.st_size & (s - 1))) {
- goto use_o_direct;
- }
+#ifdef __linux__
+ if ((create_flag & O_CREAT) && type == OS_LOG_FILE) {
+ if (fstat(file, &st) || !os_file_log_maybe_unbuffered(st)) {
+ os_file_log_buffered();
} else {
-skip_o_direct:
- log_sys.log_maybe_unbuffered= false;
- log_sys.log_buffered= true;
- log_sys.set_block_size(512);
+ close(file);
+ return os_file_create_func(name, OS_FILE_OPEN, purpose,
+ type, false, success);
}
}
-# endif
#endif
-#ifndef _WIN32
if (!read_only
&& create_mode != OS_FILE_OPEN_RAW
&& !my_disable_locking
&& os_file_lock(file, name)) {
- if (create_mode == OS_FILE_OPEN_RETRY) {
+ if (create_mode == OS_FILE_OPEN_RETRY
+ || create_mode == OS_FILE_OPEN_RETRY_SILENT) {
ib::info()
<< "Retrying to lock the first data file";
@@ -1346,7 +1336,6 @@ skip_o_direct:
close(file);
file = -1;
}
-#endif /* !_WIN32 */
return(file);
}
@@ -1356,7 +1345,7 @@ os_file_create_simple_no_error_handling(), not directly this function!
A simple function to open or create a file.
@param[in] name name of the file or path as a null-terminated
string
-@param[in] create_mode create mode
+@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN
@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option
is used by a backup program reading the file
@@ -1367,59 +1356,33 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_no_error_handling_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success)
{
os_file_t file;
- int create_flag;
-
- ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
- ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+ int create_flag = O_RDONLY | O_CLOEXEC;
*success = false;
- if (create_mode == OS_FILE_OPEN) {
-
- if (access_type == OS_FILE_READ_ONLY) {
-
- create_flag = O_RDONLY;
-
- } else if (read_only) {
-
- create_flag = O_RDONLY;
-
- } else {
-
+ if (read_only) {
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
+ } else {
+ ut_ad(create_mode == OS_FILE_OPEN);
+ if (access_type != OS_FILE_READ_ONLY) {
ut_a(access_type == OS_FILE_READ_WRITE
|| access_type == OS_FILE_READ_ALLOW_DELETE);
- create_flag = O_RDWR;
+ /* open() below no longer ORs in O_CLOEXEC by itself, so every
+ assignment to create_flag has to carry the flag. */
+ create_flag = O_RDWR | O_CLOEXEC;
}
-
- } else if (read_only) {
-
- create_flag = O_RDONLY;
-
- } else if (create_mode == OS_FILE_CREATE) {
-
- create_flag = O_RDWR | O_CREAT | O_EXCL;
-
- } else {
-
- ib::error()
- << "Unknown file create mode "
- << create_mode << " for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
}
- file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+ file = open(name, create_flag, os_innodb_umask);
*success = (file != -1);
-#ifndef _WIN32
if (!read_only
&& *success
&& access_type == OS_FILE_READ_WRITE
@@ -1431,7 +1394,6 @@ os_file_create_simple_no_error_handling_func(
file = -1;
}
-#endif /* !_WIN32 */
return(file);
}
@@ -1537,7 +1499,7 @@ bool os_file_close_func(os_file_t file)
if (!ret)
return true;
- os_file_handle_error(NULL, "close");
+ os_file_handle_error("close");
return false;
}
@@ -1810,7 +1772,7 @@ bool os_file_flush_func(os_file_t file)
if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION)
return true;
- os_file_handle_error(nullptr, "flush");
+ os_file_handle_error("flush");
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -1924,7 +1886,7 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success)
@@ -1933,82 +1895,31 @@ os_file_create_simple_func(
*success = false;
- DWORD access;
+ DWORD access = GENERIC_READ;
DWORD create_flag;
DWORD attributes = 0;
- ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
- ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
ut_ad(srv_operation == SRV_OPERATION_NORMAL);
- if (create_mode == OS_FILE_OPEN) {
-
- create_flag = OPEN_EXISTING;
-
- } else if (read_only) {
-
+ if (read_only || create_mode == OS_FILE_OPEN) {
create_flag = OPEN_EXISTING;
-
- } else if (create_mode == OS_FILE_CREATE) {
-
- create_flag = CREATE_NEW;
-
- } else if (create_mode == OS_FILE_CREATE_PATH) {
-
- /* Create subdirs along the path if needed. */
- *success = os_file_create_subdirs_if_needed(name);
-
- if (!*success) {
-
- ib::error()
- << "Unable to create subdirectories '"
- << name << "'";
-
- return(OS_FILE_CLOSED);
- }
-
- create_flag = CREATE_NEW;
- create_mode = OS_FILE_CREATE;
-
} else {
-
- ib::error()
- << "Unknown file create mode ("
- << create_mode << ") for file '"
- << name << "'";
-
- return(OS_FILE_CLOSED);
+ ut_ad(create_mode == OS_FILE_CREATE);
+ create_flag = CREATE_NEW;
}
if (access_type == OS_FILE_READ_ONLY) {
-
- access = GENERIC_READ;
-
} else if (read_only) {
-
ib::info()
<< "Read only mode set. Unable to"
" open file '" << name << "' in RW mode, "
<< "trying RO mode";
-
- access = GENERIC_READ;
-
- } else if (access_type == OS_FILE_READ_WRITE) {
-
- access = GENERIC_READ | GENERIC_WRITE;
-
} else {
-
- ib::error()
- << "Unknown file access type (" << access_type << ") "
- "for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
+ ut_ad(access_type == OS_FILE_READ_WRITE);
+ access = GENERIC_READ | GENERIC_WRITE;
}
- bool retry;
-
- do {
+ for (;;) {
/* Use default security attributes and no template file. */
file = CreateFile(
@@ -2016,22 +1927,18 @@ os_file_create_simple_func(
FILE_SHARE_READ | FILE_SHARE_DELETE,
my_win_file_secattr(), create_flag, attributes, NULL);
- if (file == INVALID_HANDLE_VALUE) {
-
- *success = false;
-
- retry = os_file_handle_error(
- name, create_mode == OS_FILE_OPEN ?
- "open" : "create");
-
- } else {
-
- retry = false;
-
+ if (file != INVALID_HANDLE_VALUE) {
*success = true;
+ break;
}
- } while (retry);
+ if (!os_file_handle_error_no_exit(name,
+ create_flag == CREATE_NEW
+ ? "create" : "open",
+ false)) {
+ break;
+ }
+ }
return(file);
}
@@ -2100,16 +2007,13 @@ Opens an existing file or creates a new.
pfs_os_file_t
os_file_create_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint purpose,
ulint type,
bool read_only,
bool* success)
{
os_file_t file;
- bool retry;
- bool on_error_no_exit;
- bool on_error_silent;
*success = false;
@@ -2120,54 +2024,30 @@ os_file_create_func(
return(OS_FILE_CLOSED);
);
- DWORD create_flag;
+ DWORD create_flag = OPEN_EXISTING;
DWORD share_mode = read_only
? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
: FILE_SHARE_READ | FILE_SHARE_DELETE;
- on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
- ? true : false;
-
- on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
- ? true : false;
-
- create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
-
- if (create_mode == OS_FILE_OPEN_RAW) {
-
+ switch (create_mode) {
+ case OS_FILE_OPEN_RAW:
ut_a(!read_only);
-
/* On Windows Physical devices require admin privileges and
have to have the write-share mode set. See the remarks
section for the CreateFile() function documentation in MSDN. */
share_mode |= FILE_SHARE_WRITE;
-
- create_flag = OPEN_EXISTING;
-
- } else if (create_mode == OS_FILE_OPEN
- || create_mode == OS_FILE_OPEN_RETRY) {
-
- create_flag = OPEN_EXISTING;
-
- } else if (read_only) {
-
- create_flag = OPEN_EXISTING;
-
- } else if (create_mode == OS_FILE_CREATE) {
-
+ break;
+ case OS_FILE_CREATE_SILENT:
+ case OS_FILE_CREATE:
create_flag = CREATE_NEW;
-
- } else if (create_mode == OS_FILE_OVERWRITE) {
-
- create_flag = CREATE_ALWAYS;
-
- } else {
- ib::error()
- << "Unknown file create mode (" << create_mode << ") "
- << " for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
+ break;
+ default:
+ ut_ad(create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_SILENT
+ || create_mode == OS_FILE_OPEN_RETRY_SILENT
+ || create_mode == OS_FILE_OPEN_RETRY);
+ break;
}
DWORD attributes = (purpose == OS_FILE_AIO && srv_use_native_aio)
@@ -2225,18 +2105,11 @@ os_file_create_func(
break;
}
- operation = (create_mode == OS_FILE_CREATE && !read_only) ?
- "create" : "open";
+ operation = create_flag == CREATE_NEW ? "create" : "open";
- if (on_error_no_exit) {
- retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
- }
- else {
- retry = os_file_handle_error(name, operation);
- }
-
- if (!retry) {
+ if (!os_file_handle_error_no_exit(name, operation,
+ create_mode
+ & OS_FILE_ON_ERROR_SILENT)) {
break;
}
}
@@ -2263,79 +2136,42 @@ A simple function to open or create a file.
pfs_os_file_t
os_file_create_simple_no_error_handling_func(
const char* name,
- ulint create_mode,
+ os_file_create_t create_mode,
ulint access_type,
bool read_only,
bool* success)
{
os_file_t file;
- *success = false;
-
- DWORD access;
- DWORD create_flag;
+ DWORD access = GENERIC_READ;
+ DWORD create_flag = OPEN_EXISTING;
DWORD attributes = 0;
- DWORD share_mode = read_only
- ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
- : FILE_SHARE_READ | FILE_SHARE_DELETE;
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_DELETE;
ut_a(name);
- ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
- ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
-
- if (create_mode == OS_FILE_OPEN) {
-
- create_flag = OPEN_EXISTING;
-
- } else if (read_only) {
-
- create_flag = OPEN_EXISTING;
-
- } else if (create_mode == OS_FILE_CREATE) {
-
- create_flag = CREATE_NEW;
-
+ if (read_only) {
+ share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE
+ | FILE_SHARE_DELETE;
} else {
+ if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else {
+ ut_ad(create_mode == OS_FILE_OPEN);
+ }
- ib::error()
- << "Unknown file create mode (" << create_mode << ") "
- << " for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
- }
-
- if (access_type == OS_FILE_READ_ONLY) {
-
- access = GENERIC_READ;
-
- } else if (read_only) {
-
- access = GENERIC_READ;
-
- } else if (access_type == OS_FILE_READ_WRITE) {
-
- access = GENERIC_READ | GENERIC_WRITE;
-
- } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
-
- ut_a(!read_only);
-
- access = GENERIC_READ;
-
- /*!< A backup program has to give mysqld the maximum
- freedom to do what it likes with the file */
-
- share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
- | FILE_SHARE_READ;
-
- } else {
-
- ib::error()
- << "Unknown file access type (" << access_type << ") "
- << "for file '" << name << "'";
-
- return(OS_FILE_CLOSED);
+ switch (access_type) {
+ case OS_FILE_READ_ONLY: break;
+ case OS_FILE_READ_WRITE:
+ access = GENERIC_READ | GENERIC_WRITE;
+ break;
+ default:
+ ut_ad(access_type == OS_FILE_READ_ALLOW_DELETE);
+ /* A backup program has to give mariadbd the maximum
+ freedom to do what it likes with the file */
+ share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
+ | FILE_SHARE_READ;
+ }
}
file = CreateFile((LPCTSTR) name,
@@ -2503,7 +2339,7 @@ bool os_file_close_func(os_file_t file)
ut_ad(file);
if (!CloseHandle(file))
{
- os_file_handle_error(NULL, "close");
+ os_file_handle_error("close");
return false;
}
@@ -2941,8 +2777,8 @@ os_file_read_func(
if (ulint(n_bytes) == n || err != DB_SUCCESS)
return err;
- os_file_handle_error_cond_exit(type.node ? type.node->name : nullptr, "read",
- false, false);
+ os_file_handle_error_no_exit(type.node ? type.node->name : nullptr, "read",
+ false);
sql_print_error("InnoDB: Tried to read %zu bytes at offset %llu"
" of file %s, but was only able to read %zd",
n, offset, type.node ? type.node->name : "(unknown)",
@@ -3045,36 +2881,6 @@ os_file_handle_error_cond_exit(
return(false);
}
-#ifdef HAVE_FCNTL_DIRECT
-/** Tries to disable OS caching on an opened file descriptor.
-@param[in] fd file descriptor to alter
-@param[in] file_name file name, used in the diagnostic message
-@param[in] name "open" or "create"; used in the diagnostic
- message */
-void
-os_file_set_nocache(int fd, const char *file_name, const char *operation_name)
-{
- if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
- int errno_save = errno;
- static bool warning_message_printed = false;
- if (errno_save == EINVAL) {
- if (!warning_message_printed) {
- warning_message_printed = true;
- ib::info()
- << "Setting O_DIRECT on file "
- << file_name << " failed";
- }
- } else {
- ib::warn()
- << "Failed to set O_DIRECT on file "
- << file_name << "; " << operation_name
- << " : " << strerror(errno_save)
- << ", continuing anyway.";
- }
- }
-}
-#endif /* HAVE_FCNTL_DIRECT */
-
/** Check if the file system supports sparse files.
@param fh file handle
@return true if the file system supports sparse files */
@@ -3859,8 +3665,9 @@ func_exit:
if (srv_thread_pool->submit_io(cb)) {
slots->release(cb);
- os_file_handle_error(type.node->name, type.is_read()
- ? "aio read" : "aio write");
+ os_file_handle_error_no_exit(type.node->name, type.is_read()
+ ? "aio read" : "aio write",
+ false);
err = DB_IO_ERROR;
type.node->space->release();
}
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index 89e6d149..e375fbfb 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -3269,7 +3269,6 @@ page_zip_validate_low(
ibool sloppy) /*!< in: FALSE=strict,
TRUE=ignore the MIN_REC_FLAG */
{
- page_zip_des_t temp_page_zip;
ibool valid;
if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
@@ -3310,7 +3309,7 @@ page_zip_validate_low(
MEM_CHECK_DEFINED(page, srv_page_size);
MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
- temp_page_zip = *page_zip;
+ page_zip_des_t temp_page_zip(*page_zip);
valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
if (!valid) {
fputs("page_zip_validate(): failed to decompress\n", stderr);
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
index a862edd7..47be5e44 100644
--- a/storage/innobase/rem/rem0rec.cc
+++ b/storage/innobase/rem/rem0rec.cc
@@ -427,7 +427,7 @@ start:
}
if (!field->fixed_len
- || (format == REC_LEAF_TEMP
+ || (format <= REC_LEAF_TEMP_INSTANT
&& !dict_col_get_fixed_size(col, true))) {
/* Variable-length field: read the length */
len = *lens--;
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index 2516e24e..6194e9c3 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -45,9 +45,11 @@ Created 2012-02-08 by Sunny Bains.
#include "lzo/lzo1x.h"
#include "snappy-c.h"
#include "log.h"
+#include "table.h"
+#include "ha_innodb.h"
#include "scope.h"
-
+#include "dict0crea.h"
#include <vector>
#ifdef HAVE_MY_AES_H
@@ -192,6 +194,60 @@ struct row_import {
dberr_t match_flags(THD *thd) const ;
+  /** @return the position of the first entry in m_indexes that is named
+  FTS_DOC_ID_INDEX_NAME
+  @retval ULINT_UNDEFINED if there is no such entry */
+  ulint find_fts_idx_offset() const
+  {
+    ulint pos= 0;
+    while (pos < m_n_indexes &&
+           strcmp(reinterpret_cast<const char*>(m_indexes[pos].m_name),
+                  FTS_DOC_ID_INDEX_NAME))
+      pos++;
+    return pos < m_n_indexes ? pos : ULINT_UNDEFINED;
+  }
+
+  /** Look up an entry of m_indexes by name.
+  @param name   index name to search for
+  @return the first entry of m_indexes whose m_name equals name
+  @retval nullptr if there is no such entry */
+  const row_index_t *find_index_by_name(const char *name) const
+  {
+    for (ulint n= 0; n != m_n_indexes; ++n)
+    {
+      const row_index_t &candidate= m_indexes[n];
+      if (strcmp(reinterpret_cast<const char*>(candidate.m_name), name) == 0)
+        return &candidate;
+    }
+    return nullptr;
+  }
+
+  /** @return whether the cfg file has both FTS_DOC_ID and
+  FTS_DOC_ID_INDEX: m_missing is not set, the column named
+  FTS_DOC_ID_COL_NAME is a DATA_INT of sizeof(doc_id_t) bytes carrying
+  no prtype flags other than the ones permitted below, and an index
+  named FTS_DOC_ID_INDEX_NAME exists */
+  bool has_hidden_fts() const
+  {
+    if (m_missing)
+      return false;
+
+    const ulint fts_col_pos= find_col(FTS_DOC_ID_COL_NAME);
+    if (fts_col_pos == ULINT_UNDEFINED)
+      return false;
+
+    const dict_col_t &fts_col= m_cols[fts_col_pos];
+    /* Any prtype flag outside of this set disqualifies the column. */
+    const bool is_doc_id_col= fts_col.mtype == DATA_INT
+      && !(fts_col.prtype & ~(DATA_NOT_NULL
+                              | DATA_UNSIGNED | DATA_BINARY_TYPE
+                              | DATA_FTS_DOC_ID))
+      && fts_col.len == sizeof(doc_id_t);
+
+    return is_doc_id_col
+      && find_index_by_name(FTS_DOC_ID_INDEX_NAME) != nullptr;
+  }
+
+  /** Check whether the system generated FTS_DOC_ID column and
+  FTS_DOC_ID_INDEX have to be added to the table.
+  @param table   table to be imported
+  @return whether the table has to add the system generated
+  fts column and fts index */
+  bool need_hidden_fts(dict_table_t *table) const
+  {
+    if (!has_hidden_fts() || table->fts_doc_id_index)
+      return false;
+    /* The metadata must describe exactly one more column and
+    one more index than the table currently has. */
+    return m_n_cols == static_cast<ulint>(table->n_cols + 1) &&
+      m_n_indexes == UT_LIST_GET_LEN(table->indexes) + 1;
+  }
dict_table_t* m_table; /*!< Table instance */
@@ -547,7 +603,7 @@ protected:
if (m_xdes != 0) {
const xdes_t* xdesc = xdes(page_no, m_xdes);
- ulint pos = page_no % FSP_EXTENT_SIZE;
+ uint32_t pos = page_no % FSP_EXTENT_SIZE;
return xdes_is_free(xdesc, pos);
}
@@ -1079,7 +1135,6 @@ row_import::find_col(
return(i);
}
}
-
return(ULINT_UNDEFINED);
}
@@ -1800,14 +1855,39 @@ PageConverter::update_records(
bool clust_index = m_index->m_srv_index == m_cluster_index;
/* This will also position the cursor on the first user record. */
+ rec_t* rec = m_rec_iter.open(block, m_index->m_srv_index);
- if (!m_rec_iter.open(block, m_index->m_srv_index)) {
+ if (!rec) {
return DB_CORRUPTION;
}
+ ulint deleted;
+
+ if (!page_has_prev(block->page.frame)
+ && m_index->m_srv_index->is_instant()) {
+ /* Expect to find the hidden metadata record */
+ if (page_rec_is_supremum(rec)) {
+ return DB_CORRUPTION;
+ }
+
+ const ulint info_bits = rec_get_info_bits(rec, comp);
+
+ if (!(info_bits & REC_INFO_MIN_REC_FLAG)) {
+ return DB_CORRUPTION;
+ }
+
+ if (!(info_bits & REC_INFO_DELETED_FLAG)
+ != !m_index->m_srv_index->table->instant) {
+ return DB_CORRUPTION;
+ }
+
+ deleted = 0;
+ goto first;
+ }
+
while (!m_rec_iter.end()) {
- rec_t* rec = m_rec_iter.current();
- ibool deleted = rec_get_deleted_flag(rec, comp);
+ rec = m_rec_iter.current();
+ deleted = rec_get_deleted_flag(rec, comp);
/* For the clustered index we have to adjust the BLOB
reference and the system fields irrespective of the
@@ -1815,6 +1895,7 @@ PageConverter::update_records(
cluster records is required for purge to work later. */
if (deleted || clust_index) {
+first:
m_offsets = rec_get_offsets(
rec, m_index->m_srv_index, m_offsets,
m_index->m_srv_index->n_core_fields,
@@ -2101,14 +2182,30 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
return DB_SUCCESS;
}
-/*****************************************************************//**
-Clean up after import tablespace. */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Reload prebuilt->table and attach the tablespace of the given table.
+@param prebuilt   prebuilt from the handler; prebuilt->table is released,
+                  removed from dict_sys and reopened by its table id
+@param table      table whose space is assigned to the reopened table;
+                  must differ from prebuilt->table */
+static void reload_fts_table(row_prebuilt_t *prebuilt,
+                             dict_table_t* table)
+{
+  dict_table_t *stale= prebuilt->table;
+  ut_ad(stale != table);
+  /* Save the id first: it is needed after the object has been removed. */
+  const table_id_t id= stale->id;
+  stale->release();
+  dict_sys.remove(stale);
+  /* Reload the table in case of hidden fts column */
+  dict_table_t *reloaded=
+    dict_table_open_on_id(id, true, DICT_TABLE_OP_NORMAL);
+  prebuilt->table= reloaded;
+  reloaded->space= table->space;
+}
+
+/** Clean up after import tablespace.
+@param prebuilt prebuilt from handler
+@param err error code
+@param fts_table constructed table which has system generated
+ fulltext document id
+@return error code or DB_SUCCESS */
+static
dberr_t
-row_import_cleanup(
-/*===============*/
- row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
- dberr_t err) /*!< in: error code */
+row_import_cleanup(row_prebuilt_t* prebuilt,
+ dberr_t err,
+ dict_table_t* fts_table = nullptr)
{
if (err != DB_SUCCESS) {
dict_table_t* table = prebuilt->table;
@@ -2128,11 +2225,44 @@ row_import_cleanup(
index = UT_LIST_GET_NEXT(indexes, index)) {
index->page = FIL_NULL;
}
+
+ prebuilt->trx->rollback();
+ }
+ else {
+ DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+ prebuilt->trx->commit();
}
- DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+ if (fts_table && fts_table != prebuilt->table) {
- prebuilt->trx->commit();
+ if (err == DB_SUCCESS) {
+ reload_fts_table(prebuilt, fts_table);
+ ib::warn() << "Added system generated FTS_DOC_ID "
+ "and FTS_DOC_ID_INDEX while importing "
+ "the tablespace " << prebuilt->table->name;
+ } else if (fts_table->space) {
+ fil_close_tablespace(fts_table->space_id);
+ fts_table->space = NULL;
+ }
+
+ if (!prebuilt->trx->dict_operation_lock_mode) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+
+ dict_index_t* index = UT_LIST_GET_FIRST(
+ fts_table->indexes);
+ while (index) {
+ dict_index_t* next_index =
+ UT_LIST_GET_NEXT(indexes, index);
+ dict_index_remove_from_cache(fts_table, index);
+ index = next_index;
+ }
+ dict_mem_table_free(fts_table);
+
+ if (!prebuilt->trx->dict_operation_lock_mode) {
+ dict_sys.unlock();
+ }
+ }
if (prebuilt->trx->dict_operation_lock_mode) {
row_mysql_unlock_data_dictionary(prebuilt->trx);
@@ -2145,14 +2275,17 @@ row_import_cleanup(
return(err);
}
-/*****************************************************************//**
-Report error during tablespace import. */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Report error during tablespace import.
+@param prebuilt prebuilt from the handler
+@param err error code
+@param fts_table table definition containing hidden FTS_DOC_ID column
+@return error code or DB_SUCCESS */
+static
dberr_t
row_import_error(
-/*=============*/
- row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
- dberr_t err) /*!< in: error code */
+ row_prebuilt_t* prebuilt,
+ dberr_t err,
+ dict_table_t* fts_table=nullptr)
{
if (!trx_is_interrupted(prebuilt->trx)) {
char table_name[MAX_FULL_NAME_LEN + 1];
@@ -2167,7 +2300,7 @@ row_import_error(
table_name, (ulong) err, ut_strerr(err));
}
- return row_import_cleanup(prebuilt, err);
+ return row_import_cleanup(prebuilt, err, fts_table);
}
/*****************************************************************//**
@@ -3059,7 +3192,139 @@ static size_t get_buf_size()
);
}
-/* find, parse instant metadata, performing variaous checks,
+/** Create the FTS_DOC_ID_INDEX definition and append it to the list of
+indexes of the table. The index is created as DICT_UNIQUE without a root
+page (FIL_NULL). Its first field is the column stored immediately before
+the system columns; the remaining fields are the unique fields of the
+first index of the table.
+@param table table definition that receives the index */
+static void add_fts_index(dict_table_t *table)
+{
+  dict_index_t *doc_id_index= dict_mem_index_create(
+    table, FTS_DOC_ID_INDEX_NAME, DICT_UNIQUE, 2);
+  doc_id_index->page= FIL_NULL;
+  doc_id_index->cached= 1;
+  doc_id_index->n_uniq= 1;
+
+  /* First field: the column located right before the system columns */
+  dict_col_t *doc_id_col=
+    &table->cols[table->n_cols - (DATA_N_SYS_COLS + 1)];
+  dict_index_add_col(doc_id_index, table, doc_id_col, 0);
+
+  /* Remaining fields: the unique fields of the first index */
+  dict_index_t *first_index= UT_LIST_GET_FIRST(table->indexes);
+  for (ulint i= 0; i < first_index->n_uniq; i++)
+  {
+    dict_field_t *key_field= &first_index->fields[i];
+    dict_index_add_col(doc_id_index, table, key_field->col,
+                       key_field->prefix_len);
+  }
+
+#ifdef BTR_CUR_HASH_ADAPT
+  doc_id_index->search_info= btr_search_info_create(doc_id_index->heap);
+  doc_id_index->search_info->ref_count= 0;
+#endif /* BTR_CUR_HASH_ADAPT */
+  UT_LIST_ADD_LAST(doc_id_index->table->indexes, doc_id_index);
+}
+
+/** Append the hidden fts column and fts doc index to the
+existing table
+@param table table to be imported
+@param thd thread
+@param cfg metadata required by import
+@return table which has fts doc id and fts doc id index */
+static dict_table_t *build_fts_hidden_table(
+ dict_table_t *table, const row_import &cfg)
+{
+ dict_table_t *new_table= dict_table_t::create(
+ {table->name.m_name, strlen(table->name.m_name)},
+ table->space, table->n_t_cols - (DATA_N_SYS_COLS - 1),
+ table->n_v_cols, table->flags,
+ table->flags2);
+
+ new_table->id= table->id;
+ new_table->space_id= table->space_id;
+ const char* col_name= &table->col_names[0];
+ /* Copy columns from old table to new fts table */
+ for (ulint new_i= 0;
+ new_i < ulint(new_table->n_cols - (DATA_N_SYS_COLS + 1));
+ new_i++)
+ {
+ dict_mem_table_add_col(new_table, new_table->heap, col_name,
+ table->cols[new_i].mtype,
+ table->cols[new_i].prtype,
+ table->cols[new_i].len);
+ col_name+= strlen(col_name) + 1;
+ }
+
+ unsigned fts_col_ind= unsigned(table->n_cols - DATA_N_SYS_COLS);
+ fts_add_doc_id_column(new_table, new_table->heap);
+ new_table->cols[fts_col_ind].ind=
+ fts_col_ind & dict_index_t::MAX_N_FIELDS;
+ new_table->cols[fts_col_ind].ord_part= 1;
+ dict_table_add_system_columns(new_table, new_table->heap);
+
+ col_name= &table->v_col_names[0];
+ for (ulint new_i= 0; new_i < new_table->n_v_cols; new_i++)
+ {
+ dict_col_t old_vcol= table->v_cols[new_i].m_col;
+ dict_mem_table_add_v_col(new_table, new_table->heap, col_name,
+ old_vcol.mtype, old_vcol.prtype,
+ old_vcol.len, old_vcol.ind + 1,
+ table->v_cols[new_i].num_base);
+ for (ulint i= 0; i < table->v_cols[new_i].num_base; i++)
+ {
+ dict_col_t *base_col= dict_table_get_nth_col(
+ new_table, table->v_cols[new_i].base_col[i]->ind);
+ new_table->v_cols[new_i].base_col[i]= base_col;
+ }
+ col_name+= strlen(col_name) + 1;
+ }
+
+ bool is_clustered= true;
+ /* Copy indexes from old table to new table */
+ for (dict_index_t *old_index= UT_LIST_GET_FIRST(table->indexes);
+ old_index; is_clustered= false)
+ {
+ dict_index_t *new_index= dict_mem_index_create(
+ new_table, old_index->name, old_index->type,
+ old_index->n_fields + is_clustered);
+
+ new_index->id= old_index->id;
+ new_index->n_uniq= old_index->n_uniq;
+ new_index->type= old_index->type;
+ new_index->cached= 1;
+ new_index->n_user_defined_cols= old_index->n_user_defined_cols;
+ new_index->n_core_null_bytes= old_index->n_core_null_bytes;
+ /* Copy all fields from old index to new index */
+ for (ulint i= 0; i < old_index->n_fields; i++)
+ {
+ dict_field_t *field= dict_index_get_nth_field(old_index, i);
+ dict_col_t *col= field->col;
+ if (col->is_virtual())
+ {
+ dict_v_col_t *v_col= reinterpret_cast<dict_v_col_t*>(col);
+ col= &new_table->v_cols[v_col->v_pos].m_col;
+ }
+ else
+ {
+ unsigned ind= field->col->ind;
+ if (ind >= fts_col_ind) ind++;
+ col= &new_table->cols[ind];
+ }
+ dict_index_add_col(new_index, new_table, col,
+ field->prefix_len);
+ if (i < old_index->n_uniq) col->ord_part= 1;
+ }
+
+ if (is_clustered)
+ {
+ /* Add fts doc id in clustered index */
+ dict_index_add_col(
+ new_index, new_table, &table->cols[fts_col_ind], 0);
+ new_index->fields[old_index->n_fields].fixed_len= sizeof(doc_id_t);
+ }
+
+ new_index->search_info= old_index->search_info;
+ UT_LIST_ADD_LAST(new_index->table->indexes, new_index);
+ old_index= UT_LIST_GET_NEXT(indexes, old_index);
+ if (UT_LIST_GET_LEN(new_table->indexes)
+ == cfg.find_fts_idx_offset())
+ add_fts_index(new_table);
+ }
+ return new_table;
+}
+
+/* find, parse instant metadata, performing various checks,
and apply it to dict_table_t
@return DB_SUCCESS or some error */
static dberr_t handle_instant_metadata(dict_table_t *table,
@@ -4228,6 +4493,107 @@ fil_tablespace_iterate(
return(err);
}
+/** Set the AUTO_INCREMENT value of a table after import.
+If a nonzero value is passed, it is written with btr_write_autoinc() and
+assigned to table->autoinc. Otherwise, if the SQL layer table has an
+auto-increment field, the value is read with
+btr_read_autoinc_with_fallback() and only assigned to table->autoinc.
+Nothing is done for tables without persistent_autoinc.
+@param table table being imported
+@param prebuilt prebuilt from the handler; its m_mysql_table is consulted
+ when autoinc is zero
+@param autoinc AUTO_INCREMENT value to set, or 0 if not specified */
+static void row_import_autoinc(dict_table_t *table, row_prebuilt_t *prebuilt,
+                               uint64_t autoinc)
+{
+  if (!table->persistent_autoinc)
+  {
+    ut_ad(!autoinc);
+    return;
+  }
+
+  if (autoinc)
+    btr_write_autoinc(dict_table_get_first_index(table), autoinc - 1);
+  else
+  {
+    TABLE *mysql_table= prebuilt->m_mysql_table;
+    if (!mysql_table)
+      return;
+    const Field *autoinc_field= mysql_table->found_next_number_field;
+    if (!autoinc_field)
+      return;
+    autoinc= 1 +
+      btr_read_autoinc_with_fallback(table, innodb_col_no(autoinc_field),
+                                     mysql_table->s->mysql_version,
+                                     innobase_get_int_col_max_value(
+                                       autoinc_field));
+  }
+
+  table->autoinc= autoinc;
+  sql_print_information("InnoDB: %`.*s.%`s autoinc value set to " UINT64PF,
+                        int(table->name.dblen()), table->name.m_name,
+                        table->name.basename(), autoinc);
+}
+
+/** Update the virtual column position in SYS_COLUMNS and SYS_VIRTUAL.
+The rows of the table whose POS equals new_pos - 1 get their POS
+incremented by one.
+@param table_id table identifier
+@param new_pos position value after the update
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t update_vcol_pos(table_id_t table_id, ulint new_pos, trx_t *trx)
+{
+  /* Run the fault injection hook before pars_info_create(), so that the
+  early return does not abandon a freshly created pars_info_t.
+  NOTE(review): this assumes que_eval_sql() is what releases info. */
+  DBUG_EXECUTE_IF("ib_import_vcol_update_fail",
+                  return DB_DUPLICATE_KEY;);
+  pars_info_t *info= pars_info_create();
+  pars_info_add_ull_literal(info, "id", table_id);
+  pars_info_add_int4_literal(info, "old_pos", new_pos - 1);
+  return que_eval_sql(info,
+                      "PROCEDURE UPDATE_VCOL () IS\n"
+                      "BEGIN\n"
+                      "UPDATE SYS_COLUMNS SET POS = POS + 1 "
+                      "WHERE TABLE_ID= :id AND POS = :old_pos;\n"
+                      "UPDATE SYS_VIRTUAL SET POS = POS + 1 "
+                      "WHERE TABLE_ID= :id AND POS = :old_pos;\n"
+                      "END\n;", trx);
+}
+
+/** Register the hidden FTS_DOC_ID column and FTS_DOC_ID_INDEX in the
+data dictionary tables:
+1) update the position of every virtual column of the table,
+2) insert the hidden fts doc id column into SYS_COLUMNS and increment
+N_COLS in SYS_TABLES,
+3) insert the fts doc id index into SYS_INDEXES and SYS_FIELDS.
+A new index id is assigned to the FTS_DOC_ID_INDEX of the table.
+@param table table to be imported
+@param fts_pos position of fts doc id column
+@param trx transaction
+@return DB_SUCCESS or error code; DB_ERROR if the table has no
+FTS_DOC_ID_INDEX */
+static
+dberr_t innodb_insert_hidden_fts_col(dict_table_t* table,
+                                     ulint fts_pos,
+                                     trx_t* trx)
+{
+  dict_index_t* doc_id_index=
+    dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+  if (doc_id_index == nullptr)
+  {
+    return DB_ERROR;
+  }
+
+  /* Move each virtual column to its new position */
+  for (ulint v= 0; v < table->n_v_cols; v++)
+  {
+    const ulint vcol_pos= dict_create_v_col_pos(
+      table->v_cols[v].v_pos,
+      table->v_cols[v].m_col.ind);
+    if (dberr_t err= update_vcol_pos(table->id, vcol_pos, trx))
+    {
+      return err;
+    }
+  }
+
+  pars_info_t *info= pars_info_create();
+  pars_info_add_ull_literal(info, "id", table->id);
+  /* The index id must be assigned before it is bound as a literal */
+  dict_hdr_get_new_id(NULL, &doc_id_index->id, NULL);
+  pars_info_add_ull_literal(info, "idx_id", doc_id_index->id);
+  pars_info_add_int4_literal(info, "pos", fts_pos);
+  pars_info_add_int4_literal(info, "space", doc_id_index->table->space_id);
+  pars_info_add_int4_literal(info, "page_no", doc_id_index->page);
+
+  return que_eval_sql(info,
+                      "PROCEDURE ADD_FTS_COL () IS\n"
+                      "BEGIN\n"
+                      "INSERT INTO SYS_COLUMNS VALUES"
+                      "(:id,:pos,'FTS_DOC_ID',6, 1795, 8, 0);\n"
+                      "UPDATE SYS_TABLES SET N_COLS = N_COLS + 1"
+                      " WHERE ID = :id;\n"
+                      "INSERT INTO SYS_INDEXES VALUES"
+                      "(:id, :idx_id, 'FTS_DOC_ID_INDEX', 1,"
+                      " 2, :space, :page_no, 50);\n"
+                      "INSERT INTO SYS_FIELDS VALUES"
+                      "(:idx_id, 1, 'FTS_DOC_ID');\n"
+                      "END;\n", trx);
+}
+
/*****************************************************************//**
Imports a tablespace. The space id in the .ibd file must match the space id
of the table in the data dictionary.
@@ -4253,9 +4619,27 @@ row_import_for_mysql(
ut_ad(trx);
ut_ad(trx->state == TRX_STATE_ACTIVE);
ut_ad(!table->is_readable());
+ ut_ad(prebuilt->table == table);
ibuf_delete_for_discarded_space(table->space_id);
+#ifdef BTR_CUR_HASH_ADAPT
+ /* On DISCARD TABLESPACE, we did not drop any adaptive hash
+ index entries. If we replaced the discarded tablespace with a
+ smaller one here, there could still be some adaptive hash
+ index entries that point to cached garbage pages in the buffer
+ pool, because PageConverter::operator() only evicted those
+ pages that were replaced by the imported pages. We must
+ detach any remaining adaptive hash index entries, because the
+ adaptive hash index must be a subset of the table contents;
+ false positives are not tolerated. */
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ index = index->clone_if_needed();
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+ UT_LIST_GET_FIRST(table->indexes)->clear_instant_alter();
+
/* Assign an undo segment for the transaction, so that the
transaction will be recovered after a crash. */
@@ -4281,7 +4665,6 @@ row_import_for_mysql(
row_import cfg;
THD* thd = trx->mysql_thd;
-
err = row_import_read_cfg(table, thd, cfg);
/* Check if the table column definitions match the contents
@@ -4289,8 +4672,16 @@ row_import_for_mysql(
if (err == DB_SUCCESS) {
- if (dberr_t err = handle_instant_metadata(table, cfg)) {
- return row_import_error(prebuilt, err);
+ if (cfg.need_hidden_fts(table)) {
+ cfg.m_table = table = build_fts_hidden_table(
+ table, cfg);
+ }
+
+ err = handle_instant_metadata(table, cfg);
+ if (err != DB_SUCCESS) {
+import_error:
+ return row_import_error(
+ prebuilt, err, table);
}
/* We have a schema file, try and match it with our
@@ -4326,7 +4717,7 @@ row_import_for_mysql(
"table %s when .cfg file is missing.",
table->name.m_name);
err = DB_ERROR;
- return row_import_error(prebuilt, err);
+ goto import_error;
}
FetchIndexRootPages fetchIndexRootPages(table, trx);
@@ -4355,7 +4746,7 @@ row_import_for_mysql(
}
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
}
trx->op_info = "importing tablespace";
@@ -4375,21 +4766,6 @@ row_import_for_mysql(
DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure",
err = DB_TOO_MANY_CONCURRENT_TRXS;);
-#ifdef BTR_CUR_HASH_ADAPT
- /* On DISCARD TABLESPACE, we did not drop any adaptive hash
- index entries. If we replaced the discarded tablespace with a
- smaller one here, there could still be some adaptive hash
- index entries that point to cached garbage pages in the buffer
- pool, because PageConverter::operator() only evicted those
- pages that were replaced by the imported pages. We must
- detach any remaining adaptive hash index entries, because the
- adaptive hash index must be a subset of the table contents;
- false positives are not tolerated. */
- for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); index;
- index = UT_LIST_GET_NEXT(indexes, index)) {
- index = index->clone_if_needed();
- }
-#endif /* BTR_CUR_HASH_ADAPT */
if (err != DB_SUCCESS) {
char table_name[MAX_FULL_NAME_LEN + 1];
@@ -4406,7 +4782,7 @@ row_import_for_mysql(
table_name, ut_strerr(err));
}
- return row_import_cleanup(prebuilt, err);
+ goto import_error;
}
/* If the table is stored in a remote tablespace, we need to
@@ -4469,7 +4845,8 @@ row_import_for_mysql(
dict_index_t* index = dict_table_get_first_index(table);
if (!dict_index_is_clust(index)) {
- return row_import_error(prebuilt, DB_CORRUPTION);
+ err = DB_CORRUPTION;
+ goto import_error;
}
/* Update the Btree segment headers for index node and
@@ -4481,7 +4858,7 @@ row_import_for_mysql(
err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
} else if (cfg.requires_purge(index->name)) {
/* Purge any delete-marked records that couldn't be
@@ -4500,7 +4877,7 @@ row_import_for_mysql(
DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
}
/* For secondary indexes, purge any records that couldn't be purged
@@ -4513,7 +4890,7 @@ row_import_for_mysql(
err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
}
/* Ensure that the next available DB_ROW_ID is not smaller than
@@ -4528,9 +4905,9 @@ row_import_for_mysql(
/* Ensure that all pages dirtied during the IMPORT make it to disk.
The only dirty pages generated should be from the pessimistic purge
of delete marked records that couldn't be purged in Phase I. */
- while (buf_flush_list_space(prebuilt->table->space));
+ while (buf_flush_list_space(table->space));
- for (ulint count = 0; prebuilt->table->space->referenced(); count++) {
+ for (ulint count = 0; table->space->referenced(); count++) {
/* Issue a warning every 10.24 seconds, starting after
2.56 seconds */
if ((count & 511) == 128) {
@@ -4541,38 +4918,48 @@ row_import_for_mysql(
}
ib::info() << "Phase IV - Flush complete";
- prebuilt->table->space->set_imported();
+ /* Set tablespace purpose as FIL_TYPE_TABLESPACE,
+ so that rollback can go ahead smoothly */
+ table->space->set_imported();
+ err = lock_sys_tables(trx);
+ if (err != DB_SUCCESS) {
+ goto import_error;
+ }
/* The dictionary latches will be released in in row_import_cleanup()
after the transaction commit, for both success and error. */
row_mysql_lock_data_dictionary(trx);
+ if (prebuilt->table != table) {
+ /* Add fts_doc_id and fts_doc_idx in data dictionary */
+ err = innodb_insert_hidden_fts_col(
+ table, cfg.find_col(FTS_DOC_ID_COL_NAME), trx);
+ DBUG_EXECUTE_IF("ib_import_fts_error",
+ err= DB_DUPLICATE_KEY;);
+ if (err != DB_SUCCESS) {
+ goto import_error;
+ }
+ }
/* Update the root pages of the table's indexes. */
err = row_import_update_index_root(trx, table, false);
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
}
err = row_import_update_discarded_flag(trx, table->id, false);
if (err != DB_SUCCESS) {
- return row_import_error(prebuilt, err);
+ goto import_error;
}
table->file_unreadable = false;
table->flags2 &= ~DICT_TF2_DISCARDED & ((1U << DICT_TF2_BITS) - 1);
/* Set autoinc value read from .cfg file, if one was specified.
- Otherwise, keep the PAGE_ROOT_AUTO_INC as is. */
- if (autoinc) {
- ib::info() << table->name << " autoinc value set to "
- << autoinc;
-
- table->autoinc = autoinc--;
- btr_write_autoinc(dict_table_get_first_index(table), autoinc);
- }
+ Otherwise, read the PAGE_ROOT_AUTO_INC and set it to table autoinc. */
+ row_import_autoinc(table, prebuilt, autoinc);
- return row_import_cleanup(prebuilt, err);
+ return row_import_cleanup(prebuilt, err, table);
}
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index 9c3c5d22..1f319aae 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -2000,7 +2000,7 @@ row_ins_dupl_error_with_rec(
/* In a unique secondary index we allow equal key values if they
contain SQL NULLs */
- if (!dict_index_is_clust(index) && !index->nulls_equal) {
+ if (!dict_index_is_clust(index)) {
for (i = 0; i < n_unique; i++) {
if (dfield_is_null(dtuple_get_nth_field(entry, i))) {
@@ -2102,16 +2102,8 @@ row_ins_scan_sec_index_for_duplicate(
/* If the secondary index is unique, but one of the fields in the
n_unique first fields is NULL, a unique key violation cannot occur,
since we define NULL != NULL in this case */
-
- if (!index->nulls_equal) {
- for (ulint i = 0; i < n_unique; i++) {
- if (UNIV_SQL_NULL == dfield_get_len(
- dtuple_get_nth_field(entry, i))) {
-
- DBUG_RETURN(DB_SUCCESS);
- }
- }
- }
+ if (index->n_nullable && dtuple_contains_null(entry, n_unique))
+ DBUG_RETURN(DB_SUCCESS);
/* Store old value on n_fields_cmp */
@@ -2569,12 +2561,6 @@ row_ins_index_entry_big_rec(
return(error);
}
-#ifdef HAVE_REPLICATION /* Working around MDEV-24622 */
-extern "C" int thd_is_slave(const MYSQL_THD thd);
-#else
-# define thd_is_slave(thd) 0
-#endif
-
#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock().
We would only need this for row_ins_clust_index_entry_low(),
@@ -2728,7 +2714,8 @@ err_exit:
&& !index->table->is_active_ddl()
&& !index->table->has_spatial_index()
&& !index->table->versioned()
- && !thd_is_slave(trx->mysql_thd) /* FIXME: MDEV-24622 */) {
+ && (!dict_table_is_partition(index->table)
+ || thd_sql_command(trx->mysql_thd) == SQLCOM_INSERT)) {
DEBUG_SYNC_C("empty_root_page_insert");
trx->bulk_insert = true;
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 188d8ba5..6fb530f0 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -526,8 +526,6 @@ static ulint row_merge_bulk_buf_add(row_merge_buf_t* buf,
@param[in,out] row table row
@param[in] ext cache of externally stored
column prefixes, or NULL
-@param[in] history_fts row is historical in a system-versioned table
- on which a FTS_DOC_ID_INDEX(FTS_DOC_ID) exists
@param[in,out] doc_id Doc ID if we are creating
FTS index
@param[in,out] conv_heap memory heap where to allocate data when
@@ -550,7 +548,6 @@ row_merge_buf_add(
fts_psort_t* psort_info,
dtuple_t* row,
const row_ext_t* ext,
- const bool history_fts,
doc_id_t* doc_id,
mem_heap_t* conv_heap,
dberr_t* err,
@@ -615,7 +612,7 @@ error:
: NULL;
/* Process the Doc ID column */
- if (!v_col && (history_fts || *doc_id)
+ if (!v_col && *doc_id
&& col->ind == index->table->fts->doc_col) {
fts_write_doc_id((byte*) &write_doc_id, *doc_id);
@@ -676,7 +673,7 @@ error:
}
/* Tokenize and process data for FTS */
- if (!history_fts && (index->type & DICT_FTS)) {
+ if (index->type & DICT_FTS) {
fts_doc_item_t* doc_item;
byte* value;
void* ptr;
@@ -1895,6 +1892,7 @@ row_merge_read_clustered_index(
DBUG_ENTER("row_merge_read_clustered_index");
ut_ad((old_table == new_table) == !col_map);
+ ut_ad(old_table->fts || !new_table->fts || !new_table->versioned());
ut_ad(!defaults || col_map);
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
ut_ad(trx->id);
@@ -2120,7 +2118,6 @@ corrupted_metadata:
dtuple_t* row;
row_ext_t* ext;
page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
- bool history_row, history_fts = false;
stage->n_pk_recs_inc();
@@ -2382,11 +2379,6 @@ end_of_index:
row_heap);
ut_ad(row);
- history_row = new_table->versioned()
- && dtuple_get_nth_field(row, new_table->vers_end)
- ->vers_history_row();
- history_fts = history_row && new_table->fts;
-
for (ulint i = 0; i < n_nonnull; i++) {
dfield_t* field = &row->fields[nonnull[i]];
@@ -2415,7 +2407,7 @@ end_of_index:
}
/* Get the next Doc ID */
- if (add_doc_id && !history_fts) {
+ if (add_doc_id) {
doc_id++;
} else {
doc_id = 0;
@@ -2455,7 +2447,9 @@ end_of_index:
add_autoinc);
if (new_table->versioned()) {
- if (history_row) {
+ if (dtuple_get_nth_field(row,
+ new_table->vers_end)
+ ->vers_history_row()) {
if (dfield_get_type(dfield)->prtype & DATA_NOT_NULL) {
err = DB_UNSUPPORTED;
my_error(ER_UNSUPPORTED_EXTENSION, MYF(0),
@@ -2571,7 +2565,7 @@ write_buffers:
if (UNIV_LIKELY
(row && (rows_added = row_merge_buf_add(
buf, fts_index, old_table, new_table,
- psort_info, row, ext, history_fts,
+ psort_info, row, ext,
&doc_id, conv_heap, &err,
&v_heap, eval_table, trx,
col_collate)))) {
@@ -2904,7 +2898,7 @@ write_buffers:
(!(rows_added = row_merge_buf_add(
buf, fts_index, old_table,
new_table, psort_info,
- row, ext, history_fts, &doc_id,
+ row, ext, &doc_id,
conv_heap, &err, &v_heap,
eval_table, trx, col_collate)))) {
/* An empty buffer should have enough
@@ -4355,9 +4349,7 @@ void row_merge_drop_temp_indexes()
UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
@param[in] path location for creating temporary merge files, or NULL
@return File descriptor */
-pfs_os_file_t
-row_merge_file_create_low(
- const char* path)
+static pfs_os_file_t row_merge_file_create_mode(const char *path, int mode)
{
if (!path) {
path = mysql_tmpdir;
@@ -4398,6 +4390,13 @@ row_merge_file_create_low(
return(fd);
}
+/** Create a temporary file at the specified path.
+This is a thin wrapper that forwards to row_merge_file_create_mode()
+with the mode flags O_BINARY | O_SEQUENTIAL.
+@param path location for creating temporary merge files, or nullptr
+@return File descriptor */
+pfs_os_file_t row_merge_file_create_low(const char *path)
+{
+ return row_merge_file_create_mode(path, O_BINARY | O_SEQUENTIAL);
+}
/** Create a merge file in the given location.
@param[out] merge_file merge file structure
@@ -4408,17 +4407,16 @@ row_merge_file_create(
merge_file_t* merge_file,
const char* path)
{
- merge_file->fd = row_merge_file_create_low(path);
merge_file->offset = 0;
merge_file->n_rec = 0;
-#ifdef HAVE_FCNTL_DIRECT
- if (merge_file->fd != OS_FILE_CLOSED) {
- if (srv_disable_sort_file_cache) {
- os_file_set_nocache(merge_file->fd,
- "row0merge.cc", "sort");
- }
- }
+ merge_file->fd =
+ row_merge_file_create_mode(path,
+#if !defined _WIN32 && defined O_DIRECT
+ srv_disable_sort_file_cache
+ ? O_DIRECT | O_BINARY | O_SEQUENTIAL
+ :
#endif
+ O_BINARY | O_SEQUENTIAL);
return(merge_file->fd);
}
@@ -5353,18 +5351,8 @@ dberr_t trx_mod_table_time_t::write_bulk(dict_table_t *table, trx_t *trx)
return err;
}
-dberr_t trx_t::bulk_insert_apply_low()
+void trx_t::bulk_rollback_low()
{
- ut_ad(bulk_insert);
- ut_ad(!check_unique_secondary);
- ut_ad(!check_foreigns);
- dberr_t err;
- for (auto& t : mod_tables)
- if (t.second.is_bulk_insert())
- if ((err= t.second.write_bulk(t.first, this)) != DB_SUCCESS)
- goto bulk_rollback;
- return DB_SUCCESS;
-bulk_rollback:
undo_no_t low_limit= UINT64_MAX;
for (auto& t : mod_tables)
{
@@ -5374,9 +5362,37 @@ bulk_rollback:
low_limit= t.second.get_first();
delete t.second.bulk_store;
t.second.bulk_store= nullptr;
+ t.second.end_bulk_insert();
}
}
trx_savept_t bulk_save{low_limit};
rollback(&bulk_save);
- return err;
+}
+
+/** Write the buffered bulk insert of one table, if the transaction has an
+entry for that table in mod_tables. On failure, bulk_rollback_low() is
+invoked; on success, the bulk insert of the table is ended.
+@param table table whose buffered bulk insert is to be applied
+@return DB_SUCCESS or the error reported by write_bulk() */
+dberr_t trx_t::bulk_insert_apply_for_table(dict_table_t *table)
+{
+  auto entry= mod_tables.find(table);
+  if (entry == mod_tables.end())
+  {
+    /* The transaction did not modify this table */
+    return DB_SUCCESS;
+  }
+
+  if (dberr_t err= entry->second.write_bulk(table, this))
+  {
+    bulk_rollback_low();
+    return err;
+  }
+
+  entry->second.end_bulk_insert();
+  return DB_SUCCESS;
+}
+
+/** Write the buffered bulk insert of every table in mod_tables that is
+in bulk insert mode. The first failure triggers bulk_rollback_low() and
+stops the processing.
+@return DB_SUCCESS or the error reported by write_bulk() */
+dberr_t trx_t::bulk_insert_apply_low()
+{
+  ut_ad(bulk_insert);
+  for (auto &entry : mod_tables)
+  {
+    if (!entry.second.is_bulk_insert())
+    {
+      continue;
+    }
+    if (dberr_t err= entry.second.write_bulk(entry.first, this))
+    {
+      bulk_rollback_low();
+      return err;
+    }
+  }
+  return DB_SUCCESS;
}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index c5ee3be7..6a71cf3a 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -695,6 +695,7 @@ handle_new_error:
DBUG_RETURN(true);
case DB_DEADLOCK:
+ case DB_RECORD_CHANGED:
case DB_LOCK_TABLE_FULL:
rollback:
/* Roll back the whole transaction; this resolution was added
@@ -1585,7 +1586,8 @@ init_fts_doc_id_for_ref(
for (dict_foreign_t* foreign : table->referenced_set) {
ut_ad(foreign->foreign_table);
- if (foreign->foreign_table->fts) {
+ if (foreign->foreign_table->space
+ && foreign->foreign_table->fts) {
fts_init_doc_id(foreign->foreign_table);
}
@@ -2383,7 +2385,6 @@ row_discard_tablespace(
dict_table_change_id_in_cache(table, new_id);
dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
- if (index) index->clear_instant_alter();
/* Reset the root page numbers. */
for (; index; index = UT_LIST_GET_NEXT(indexes, index)) {
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
index e927096f..057b20c7 100644
--- a/storage/innobase/row/row0quiesce.cc
+++ b/storage/innobase/row/row0quiesce.cc
@@ -431,6 +431,10 @@ row_quiesce_write_header(
/*********************************************************************//**
Write the table meta data after quiesce.
@return DB_SUCCESS or error code */
+
+/* Stack size 20904 with clang */
+PRAGMA_DISABLE_CHECK_STACK_FRAME
+
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_quiesce_write_cfg(
@@ -488,6 +492,7 @@ row_quiesce_write_cfg(
return(err);
}
+PRAGMA_REENABLE_CHECK_STACK_FRAME
/*********************************************************************//**
Check whether a table has an FTS index defined on it.
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 33f4d81f..944f7358 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -864,6 +864,11 @@ row_sel_build_committed_vers_for_mysql(
column version if any */
mtr_t* mtr) /*!< in: mtr */
{
+ if (prebuilt->trx->snapshot_isolation) {
+ *old_vers = rec;
+ return;
+ }
+
if (prebuilt->old_vers_heap) {
mem_heap_empty(prebuilt->old_vers_heap);
} else {
@@ -1184,11 +1189,11 @@ sel_set_rtr_rec_lock(
ut_ad(page_align(first_rec) == cur_block->page.frame);
ut_ad(match->valid);
- match->block.page.lock.x_lock();
+ match->block->page.lock.x_lock();
retry:
cur_block = btr_pcur_get_block(pcur);
- ut_ad(match->block.page.lock.have_x()
- || match->block.page.lock.have_s());
+ ut_ad(match->block->page.lock.have_x()
+ || match->block->page.lock.have_s());
ut_ad(page_is_leaf(cur_block->page.frame));
err = lock_sec_rec_read_check_and_lock(
@@ -1288,7 +1293,7 @@ re_scan:
ULINT_UNDEFINED, &heap);
err = lock_sec_rec_read_check_and_lock(
- 0, &match->block, rtr_rec->r_rec, index,
+ 0, match->block, rtr_rec->r_rec, index,
my_offsets, static_cast<lock_mode>(mode),
type, thr);
@@ -1304,7 +1309,7 @@ re_scan:
match->locked = true;
func_end:
- match->block.page.lock.x_unlock();
+ match->block->page.lock.x_unlock();
if (heap != NULL) {
mem_heap_free(heap);
}
@@ -3401,7 +3406,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
if (dict_index_is_spatial(sec_index)
&& btr_cur->rtr_info->matches
&& (page_align(rec)
- == btr_cur->rtr_info->matches->block.page.frame
+ == btr_cur->rtr_info->matches->block->page.frame
|| rec != btr_pcur_get_rec(prebuilt->pcur))) {
#ifdef UNIV_DEBUG
rtr_info_t* rtr_info = btr_cur->rtr_info;
@@ -4456,13 +4461,11 @@ early_not_found:
DBUG_RETURN(DB_RECORD_NOT_FOUND);
}
+#if SIZEOF_SIZE_T < 8
+ if (UNIV_LIKELY(~prebuilt->n_rows_fetched))
+#endif
prebuilt->n_rows_fetched++;
- if (prebuilt->n_rows_fetched > 1000000000) {
- /* Prevent wrap-over */
- prebuilt->n_rows_fetched = 500000000;
- }
-
mode = pcur->search_mode;
}
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index a01eaea5..52f54443 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -190,7 +190,7 @@ row_undo_mod_clust_low(
@param[in] rec clustered index record
@param[in] index clustered index
@return the byte offset of DB_TRX_ID, from the start of rec */
-static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index)
+ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index)
{
ut_ad(index->n_uniq <= MAX_REF_PARTS);
ulint trx_id_offset = index->trx_id_offset;
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 75798241..62229842 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -364,11 +364,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_LOOPS},
- {"buffer_LRU_get_free_waits", "buffer",
- "Total sleep waits in LRU get free.",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_WAITS},
-
{"buffer_flush_avg_page_rate", "buffer",
"Average number of pages at which flushing is happening",
MONITOR_NONE,
@@ -472,11 +467,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE},
- {"buffer_LRU_single_flush_failure_count", "Buffer",
- "Number of times attempt to flush a single page from LRU failed",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT},
-
{"buffer_LRU_get_free_search", "Buffer",
"Number of searches performed for a clean page",
MONITOR_NONE,
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 738e0a7e..fc557673 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -91,10 +91,6 @@ Created 2/16/1996 Heikki Tuuri
#include "zlib.h"
#include "log.h"
-/** We are prepared for a situation that we have this many threads waiting for
-a transactional lock inside InnoDB. srv_start() sets the value. */
-ulint srv_max_n_threads;
-
/** Log sequence number at shutdown */
lsn_t srv_shutdown_lsn;
@@ -201,7 +197,7 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn)
bool ret;
os_file_t file{
os_file_create_func(logfile0.c_str(),
- OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_CREATE,
OS_FILE_NORMAL, OS_LOG_FILE, false, &ret)
};
@@ -632,15 +628,15 @@ static uint32_t srv_undo_tablespace_open(bool create, const char* name,
}
}
- pfs_os_file_t fh= os_file_create(innodb_data_file_key, name, OS_FILE_OPEN |
- OS_FILE_ON_ERROR_NO_EXIT |
- OS_FILE_ON_ERROR_SILENT,
+ pfs_os_file_t fh= os_file_create(innodb_data_file_key, name,
+ OS_FILE_OPEN_SILENT,
OS_FILE_AIO, OS_DATA_FILE,
srv_read_only_mode, &success);
if (!success)
return 0;
+ ulint n_retries = 5;
os_offset_t size= os_file_get_size(fh);
ut_a(size != os_offset_t(-1));
@@ -648,15 +644,25 @@ static uint32_t srv_undo_tablespace_open(bool create, const char* name,
{
page_t *page= static_cast<byte*>(aligned_malloc(srv_page_size,
srv_page_size));
+undo_retry:
if (os_file_read(IORequestRead, fh, page, 0, srv_page_size, nullptr) !=
DB_SUCCESS)
{
err_exit:
+ if (n_retries && srv_operation == SRV_OPERATION_BACKUP)
+ {
+ sql_print_information("InnoDB: Retrying to read undo "
+ "tablespace %s", name);
+ n_retries--;
+ goto undo_retry;
+ }
ib::error() << "Unable to read first page of file " << name;
aligned_free(page);
return ~0U;
}
+ DBUG_EXECUTE_IF("undo_space_read_fail", goto err_exit;);
+
uint32_t id= mach_read_from_4(FIL_PAGE_SPACE_ID + page);
if (id == 0 || id >= SRV_SPACE_ID_UPPER_BOUND ||
memcmp_aligned<2>(FIL_PAGE_SPACE_ID + page,
@@ -731,9 +737,7 @@ srv_check_undo_redo_logs_exists()
fh = os_file_create_func(
name,
- OS_FILE_OPEN_RETRY
- | OS_FILE_ON_ERROR_NO_EXIT
- | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_OPEN_RETRY_SILENT,
OS_FILE_NORMAL,
OS_DATA_FILE,
srv_read_only_mode,
@@ -755,8 +759,7 @@ srv_check_undo_redo_logs_exists()
auto logfilename = get_log_file_path();
fh = os_file_create_func(logfilename.c_str(),
- OS_FILE_OPEN_RETRY | OS_FILE_ON_ERROR_NO_EXIT
- | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_OPEN_RETRY_SILENT,
OS_FILE_NORMAL, OS_LOG_FILE,
srv_read_only_mode, &ret);
@@ -1179,12 +1182,6 @@ dberr_t srv_start(bool create_new_db)
mysql_stage_register("innodb", srv_stages,
static_cast<int>(UT_ARR_SIZE(srv_stages)));
- srv_max_n_threads =
- 1 /* dict_stats_thread */
- + 1 /* fts_optimize_thread */
- + 128 /* safety margin */
- + max_connections;
-
srv_boot();
ib::info() << my_crc32c_implementation();
@@ -1523,6 +1520,71 @@ dberr_t srv_start(bool create_new_db)
fil_system.space_id_reuse_warned = false;
+ if (srv_operation > SRV_OPERATION_EXPORT_RESTORED) {
+ ut_ad(srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_operation == SRV_OPERATION_RESTORE);
+ return(err);
+ }
+
+ /* Upgrade or resize or rebuild the redo logs before
+ generating any dirty pages, so that the old redo log
+ file will not be written to. */
+
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
+ /* Completely ignore the redo log. */
+ } else if (srv_read_only_mode) {
+ /* Leave the redo log alone. */
+ } else if (log_sys.file_size == srv_log_file_size
+ && log_sys.format
+ == (srv_encrypt_log
+ ? log_t::FORMAT_ENC_10_8
+ : log_t::FORMAT_10_8)) {
+ /* No need to add or remove encryption,
+ upgrade, or resize. */
+ delete_log_files();
+ } else {
+ /* Prepare to delete the old redo log file */
+ const lsn_t lsn{srv_prepare_to_delete_redo_log_file()};
+
+ DBUG_EXECUTE_IF("innodb_log_abort_1",
+ return(srv_init_abort(DB_ERROR)););
+ /* Prohibit redo log writes from any other
+ threads until creating a log checkpoint at the
+ end of create_log_file(). */
+ ut_d(recv_no_log_write = true);
+ ut_ad(!os_aio_pending_reads());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
+ /* os_aio_pending_writes() may hold here if
+ some write_io_callback() did not release the
+ slot yet. However, the page write itself must
+ have completed, because the buf_pool.flush_list
+ is empty. In debug builds, we wait for this to
+ happen, hoping to get a hung process if this
+ assumption does not hold. */
+ ut_d(os_aio_wait_until_no_pending_writes(false));
+
+ /* Close the redo log file, so that we can replace it */
+ log_sys.close_file();
+
+ DBUG_EXECUTE_IF("innodb_log_abort_5",
+ return(srv_init_abort(DB_ERROR)););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_5"));
+
+ err = create_log_file(false, lsn);
+
+ if (err == DB_SUCCESS && log_sys.resize_rename()) {
+ err = DB_ERROR;
+ }
+
+ if (err != DB_SUCCESS) {
+ return(srv_init_abort(err));
+ }
+ }
+
+ recv_sys.debug_free();
+
if (!srv_read_only_mode) {
const uint32_t flags = FSP_FLAGS_PAGE_SSIZE();
for (uint32_t id = srv_undo_space_id_start;
@@ -1607,71 +1669,6 @@ dberr_t srv_start(bool create_new_db)
return(srv_init_abort(DB_ERROR));
}
}
-
- if (srv_operation > SRV_OPERATION_EXPORT_RESTORED) {
- ut_ad(srv_operation == SRV_OPERATION_RESTORE_EXPORT
- || srv_operation == SRV_OPERATION_RESTORE);
- return(err);
- }
-
- /* Upgrade or resize or rebuild the redo logs before
- generating any dirty pages, so that the old redo log
- file will not be written to. */
-
- if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
- /* Completely ignore the redo log. */
- } else if (srv_read_only_mode) {
- /* Leave the redo log alone. */
- } else if (log_sys.file_size == srv_log_file_size
- && log_sys.format
- == (srv_encrypt_log
- ? log_t::FORMAT_ENC_10_8
- : log_t::FORMAT_10_8)) {
- /* No need to add or remove encryption,
- upgrade, or resize. */
- delete_log_files();
- } else {
- /* Prepare to delete the old redo log file */
- const lsn_t lsn{srv_prepare_to_delete_redo_log_file()};
-
- DBUG_EXECUTE_IF("innodb_log_abort_1",
- return(srv_init_abort(DB_ERROR)););
- /* Prohibit redo log writes from any other
- threads until creating a log checkpoint at the
- end of create_log_file(). */
- ut_d(recv_no_log_write = true);
- ut_ad(!os_aio_pending_reads());
- ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
- ut_ad(!buf_pool.get_oldest_modification(0));
- ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
- /* os_aio_pending_writes() may hold here if
- some write_io_callback() did not release the
- slot yet. However, the page write itself must
- have completed, because the buf_pool.flush_list
- is empty. In debug builds, we wait for this to
- happen, hoping to get a hung process if this
- assumption does not hold. */
- ut_d(os_aio_wait_until_no_pending_writes(false));
-
- /* Close the redo log file, so that we can replace it */
- log_sys.close_file();
-
- DBUG_EXECUTE_IF("innodb_log_abort_5",
- return(srv_init_abort(DB_ERROR)););
- DBUG_PRINT("ib_log", ("After innodb_log_abort_5"));
-
- err = create_log_file(false, lsn);
-
- if (err == DB_SUCCESS && log_sys.resize_rename()) {
- err = DB_ERROR;
- }
-
- if (err != DB_SUCCESS) {
- return(srv_init_abort(err));
- }
- }
-
- recv_sys.debug_free();
}
ut_ad(err == DB_SUCCESS);
diff --git a/storage/innobase/sync/cache.cc b/storage/innobase/sync/cache.cc
new file mode 100644
index 00000000..43d642d0
--- /dev/null
+++ b/storage/innobase/sync/cache.cc
@@ -0,0 +1,160 @@
+/*****************************************************************************
+
+Copyright (c) 2024, MariaDB plc
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/* This is based on the implementation of pmem_persist() in
+https://github.com/pmem/pmdk/, Copyright 2014-2020, Intel Corporation,
+last revised in libpmem-1.12.0. */
+
+#include "my_global.h"
+#include "cache.h"
+#include <cstdint>
+
+#if defined __x86_64__ || defined __aarch64__ || defined __powerpc64__
+# ifdef __x86_64__
+static void pmem_clflush(const void *buf, size_t size)
+{ /* Flush every cache line overlapping the size bytes at buf with CLFLUSH. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ __asm__ __volatile__("clflush %0" ::
+ "m"(*reinterpret_cast<const char*>(u)) : "memory");
+}
+
+static void pmem_clflushopt(const void *buf, size_t size)
+{ /* Flush every cache line overlapping the size bytes at buf, then SFENCE. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ __asm__ __volatile__(".byte 0x66; clflush %0" /* clflushopt */ ::
+ "m"(*reinterpret_cast<const char*>(u)) : "memory");
+ __asm__ __volatile__("sfence" ::: "memory");
+}
+
+static void pmem_clwb(const void *buf, size_t size)
+{ /* Write back every cache line overlapping size bytes at buf, then SFENCE. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ __asm__ __volatile__(".byte 0x66; xsaveopt %0" /* clwb */ ::
+ "m"(*reinterpret_cast<const char*>(u)) : "memory");
+ __asm__ __volatile__("sfence" ::: "memory");
+}
+
+# include <cpuid.h>
+static decltype(pmem_control::persist) pmem_persist_init()
+{ /* @return the best flush routine advertised in CPUID leaf 7 (EBX) */
+ uint32_t eax= 0, ebx= 0, ecx= 0, edx= 0;
+ if (__get_cpuid_max(0, nullptr) >= 7) /* else ebx stays 0: plain CLFLUSH */
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ if (ebx & 1U<<24 /* CLWB */)
+ return pmem_clwb;
+ if (ebx & 1U<<23 /* CLFLUSHOPT */)
+ return pmem_clflushopt;
+ return pmem_clflush;
+}
+# elif defined __aarch64__
+static void pmem_cvac(const void* buf, size_t size)
+{ /* DC CVAC on every cache line overlapping size bytes at buf, then DMB. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ __asm__ __volatile__("dc cvac, %0" :: "r"(u) : "memory");
+ __asm__ __volatile__("dmb ishst" ::: "memory");
+}
+
+static void pmem_cvap(const void* buf, size_t size)
+{ /* DC CVAP on every cache line overlapping size bytes at buf, then DMB. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ __asm__ __volatile__(".arch armv8.2-a\n dc cvap, %0" :: "r"(u) : "memory");
+ __asm__ __volatile__("dmb ishst" ::: "memory");
+}
+
+# include <sys/auxv.h>
+# include <asm/hwcap.h>
+# ifndef HWCAP_DCPOP
+# define HWCAP_DCPOP (1 << 16)
+# endif
+
+static decltype(pmem_control::persist) pmem_persist_init()
+{ /* @return pmem_cvap if AT_HWCAP reports HWCAP_DCPOP, else pmem_cvac */
+ return (getauxval(AT_HWCAP) & HWCAP_DCPOP) ? pmem_cvap : pmem_cvac;
+}
+# elif defined __powerpc64__
+static void pmem_phwsync(const void* buf, size_t size)
+{ /* dcbstps on every cache line overlapping size bytes at buf, then phwsync. */
+ for (uintptr_t u= uintptr_t(buf) & ~uintptr_t(CPU_LEVEL1_DCACHE_LINESIZE - 1),
+ end= uintptr_t(buf) + size; /* u is line-aligned, so no line is skipped */
+ u < end; u+= CPU_LEVEL1_DCACHE_LINESIZE)
+ {
+ /* GCC is just passing the inline asm snippets to the assembler,
+ and it does not even define these mnemonics by itself. Clang does,
+ and it includes a built-in assembler.
+
+ Let us hope that having a recent enough GCC is an adequate proxy
+ for having a recent enough assembler. */
+# if __GNUC__ >= 11 || (defined __clang_major__ && __clang_major__ >= 12)
+ __asm__ __volatile__("dcbstps 0,%0" :: "r"(u) : "memory");
+# else
+ __asm__ __volatile__(".long (0x7cc000AC | %0 << 11)" :: "r"(u) : "memory");
+# endif
+ }
+
+# if __GNUC__ >= 11 || (defined __clang_major__ && __clang_major__ >= 18)
+ __asm__ __volatile__("phwsync" ::: "memory");
+# else
+ __asm__ __volatile__(".long 0x7c80040a" ::: "memory");
+# endif
+}
+
+# include <atomic>
+static void pmem_fence(const void*, size_t)
+{ /* Ignores both arguments; only issues a sequentially consistent fence. */
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+}
+
+# include <sys/auxv.h>
+# ifndef AT_HWCAP2
+# define AT_HWCAP2 26
+# endif
+# ifndef PPC_FEATURE2_ARCH_3_1
+# define PPC_FEATURE2_ARCH_3_1 4
+# endif
+
+static decltype(pmem_control::persist) pmem_persist_init()
+{ /* @return pmem_phwsync if AT_HWCAP2 has PPC_FEATURE2_ARCH_3_1, else a fence */
+ return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_1)
+ ? pmem_phwsync : pmem_fence;
+}
+# endif
+
+pmem_control::pmem_control() : persist(pmem_persist_init()) {} /* pick once */
+const pmem_control pmem; /* its constructor selects the persist routine */
+#else
+void pmem_persist(const void *buf, size_t size)
+{ /* Only issues a barrier instruction; buf and size are not used here. */
+# if defined __riscv && __riscv_xlen == 64
+ __asm__ __volatile__("fence w,w" ::: "memory");
+# elif defined __loongarch64
+ __asm__ __volatile__("dbar 0" ::: "memory");
+# else
+# error "Missing implementation; recompile with cmake -DWITH_INNODB_PMEM=OFF"
+# endif
+}
+#endif
diff --git a/storage/innobase/sync/srw_lock.cc b/storage/innobase/sync/srw_lock.cc
index 5afb79f2..19db1245 100644
--- a/storage/innobase/sync/srw_lock.cc
+++ b/storage/innobase/sync/srw_lock.cc
@@ -548,3 +548,124 @@ template void ssux_lock_impl<false>::rd_unlock();
template void ssux_lock_impl<false>::u_unlock();
template void ssux_lock_impl<false>::wr_unlock();
#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_DEBUG
+void srw_lock_debug::SRW_LOCK_INIT(mysql_pfs_key_t key)
+{ /* Initialize the underlying srw_lock (with key) and readers_lock. */
+ srw_lock::SRW_LOCK_INIT(key);
+ readers_lock.init();
+ ut_ad(!readers.load(std::memory_order_relaxed)); /* no readers set yet */
+ ut_ad(!have_any()); /* the caller must not be recorded as a holder */
+}
+
+void srw_lock_debug::destroy()
+{ /* Free the readers set, if one was allocated, and destroy the srw_lock. */
+ ut_ad(!writer);
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ readers.store(0, std::memory_order_relaxed);
+ ut_ad(r->empty()); /* every registered reader must have been removed */
+ delete r;
+ }
+ srw_lock::destroy();
+}
+
+bool srw_lock_debug::wr_lock_try()
+{ /* @return whether srw_lock::wr_lock_try() succeeded; records the writer */
+ ut_ad(!have_any()); /* the caller must not already be a recorded holder */
+ if (!srw_lock::wr_lock_try())
+ return false;
+ ut_ad(!writer);
+ writer.store(pthread_self(), std::memory_order_relaxed);
+ return true;
+}
+
+void srw_lock_debug::wr_lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+{ /* Acquire via srw_lock::wr_lock(), then record the caller as the writer. */
+ ut_ad(!have_any()); /* the caller must not already be a recorded holder */
+ srw_lock::wr_lock(SRW_LOCK_ARGS(file, line));
+ ut_ad(!writer);
+ writer.store(pthread_self(), std::memory_order_relaxed);
+}
+
+void srw_lock_debug::wr_unlock()
+{ /* Release the write lock; asserts that the caller is the recorded writer. */
+ ut_ad(have_wr());
+ writer.store(0, std::memory_order_relaxed); /* cleared before releasing */
+ srw_lock::wr_unlock();
+}
+
+void srw_lock_debug::readers_register()
+{ /* Add the calling thread to the readers multiset, under readers_lock. */
+ readers_lock.wr_lock();
+ auto r= readers.load(std::memory_order_relaxed);
+ if (!r) /* the multiset is allocated on first use */
+ {
+ r= new std::unordered_multiset<pthread_t>();
+ readers.store(r, std::memory_order_relaxed);
+ }
+ r->emplace(pthread_self());
+ readers_lock.wr_unlock();
+}
+
+bool srw_lock_debug::rd_lock_try()
+{ /* @return whether srw_lock::rd_lock_try() succeeded; registers the reader */
+ ut_ad(!have_any()); /* the caller must not already be a recorded holder */
+ if (!srw_lock::rd_lock_try())
+ return false;
+ readers_register();
+ return true;
+}
+
+void srw_lock_debug::rd_lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+{ /* Acquire via srw_lock::rd_lock(), then register the caller as a reader. */
+ ut_ad(!have_any()); /* the caller must not already be a recorded holder */
+ srw_lock::rd_lock(SRW_LOCK_ARGS(file, line));
+ readers_register();
+}
+
+void srw_lock_debug::rd_unlock()
+{ /* Remove one entry for the calling thread, then release the read lock. */
+ const pthread_t self= pthread_self();
+ ut_ad(writer != self);
+ readers_lock.wr_lock();
+ auto r= readers.load(std::memory_order_relaxed);
+ ut_ad(r);
+ auto i= r->find(self);
+ ut_ad(i != r->end()); /* the caller must have been registered */
+ r->erase(i); /* erasing by iterator removes a single entry */
+ readers_lock.wr_unlock();
+
+ srw_lock::rd_unlock();
+}
+
+bool srw_lock_debug::have_rd() const noexcept
+{ /* @return whether the calling thread is registered in the readers set */
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ readers_lock.wr_lock();
+ bool found= r->find(pthread_self()) != r->end();
+ readers_lock.wr_unlock();
+# ifndef SUX_LOCK_GENERIC
+ ut_ad(!found || is_locked());
+# endif
+ return found;
+ }
+ return false; /* no readers set has been allocated */
+}
+
+bool srw_lock_debug::have_wr() const noexcept
+{ /* @return whether the calling thread is the recorded writer */
+ if (writer != pthread_self())
+ return false;
+# ifndef SUX_LOCK_GENERIC
+ ut_ad(is_write_locked());
+# endif
+ return true;
+}
+
+bool srw_lock_debug::have_any() const noexcept
+{ /* @return whether the calling thread is recorded as writer or as reader */
+ return have_wr() || have_rd();
+}
+#endif
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index cff16d9c..f32f4de5 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -56,84 +56,6 @@ purge_sys_t purge_sys;
my_bool srv_purge_view_update_only_debug;
#endif /* UNIV_DEBUG */
-/** Sentinel value */
-static const TrxUndoRsegs NullElement;
-
-/** Default constructor */
-TrxUndoRsegsIterator::TrxUndoRsegsIterator()
- : m_rsegs(NullElement), m_iter(m_rsegs.begin())
-{
-}
-
-/** Sets the next rseg to purge in purge_sys.
-Executed in the purge coordinator thread.
-@retval false when nothing is to be purged
-@retval true when purge_sys.rseg->latch was locked */
-inline bool TrxUndoRsegsIterator::set_next()
-{
- ut_ad(!purge_sys.next_stored);
- mysql_mutex_lock(&purge_sys.pq_mutex);
-
- /* Only purge consumes events from the priority queue, user
- threads only produce the events. */
-
- /* Check if there are more rsegs to process in the
- current element. */
- if (m_iter != m_rsegs.end()) {
- /* We are still processing rollback segment from
- the same transaction and so expected transaction
- number shouldn't increase. Undo the increment of
- expected commit done by caller assuming rollback
- segments from given transaction are done. */
- purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
- } else if (!purge_sys.purge_queue.empty()) {
- m_rsegs = purge_sys.purge_queue.top();
- purge_sys.purge_queue.pop();
- ut_ad(purge_sys.purge_queue.empty()
- || purge_sys.purge_queue.top() != m_rsegs);
- m_iter = m_rsegs.begin();
- } else {
- /* Queue is empty, reset iterator. */
- purge_sys.rseg = NULL;
- mysql_mutex_unlock(&purge_sys.pq_mutex);
- m_rsegs = NullElement;
- m_iter = m_rsegs.begin();
- return false;
- }
-
- purge_sys.rseg = *m_iter++;
- mysql_mutex_unlock(&purge_sys.pq_mutex);
-
- /* We assume in purge of externally stored fields that space
- id is in the range of UNDO tablespace space ids */
- ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
- || srv_is_undo_tablespace(purge_sys.rseg->space->id));
-
- purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
- trx_id_t last_trx_no = purge_sys.rseg->last_trx_no();
- purge_sys.hdr_offset = purge_sys.rseg->last_offset();
- purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
-
- /* Only the purge_coordinator_task will access this object
- purge_sys.rseg_iter, or any of purge_sys.hdr_page_no,
- purge_sys.tail.
- The field purge_sys.head and purge_sys.view are modified by
- purge_sys_t::clone_end_view()
- in the purge_coordinator_task
- while holding exclusive purge_sys.latch.
- The purge_sys.view may also be modified by
- purge_sys_t::wake_if_not_active() while holding exclusive
- purge_sys.latch.
- The purge_sys.head may be read by
- purge_truncation_callback(). */
- ut_ad(last_trx_no == m_rsegs.trx_no);
- ut_a(purge_sys.hdr_page_no != FIL_NULL);
- ut_a(purge_sys.tail.trx_no <= last_trx_no);
- purge_sys.tail.trx_no = last_trx_no;
-
- return(true);
-}
-
/** Build a purge 'query' graph. The actual purge is performed by executing
this query graph.
@return own: the query graph */
@@ -345,7 +267,8 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
that is known to be corrupted. */
ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
uint16_t(page_offset(undo_header) +
- TRX_UNDO_HISTORY_NODE), mtr) == DB_SUCCESS);
+ TRX_UNDO_HISTORY_NODE), rseg->space->free_limit,
+ mtr) == DB_SUCCESS);
mtr->write<2>(*undo_page, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE +
undo_page->page.frame, undo_state);
@@ -396,9 +319,7 @@ static void trx_purge_free_segment(buf_block_t *rseg_hdr, buf_block_t *block,
void purge_sys_t::rseg_enable(trx_rseg_t &rseg)
{
ut_ad(this == &purge_sys);
-#ifndef SUX_LOCK_GENERIC
- ut_ad(rseg.latch.is_write_locked());
-#endif
+ ut_ad(rseg.latch.have_wr());
uint8_t skipped= skipped_rseg;
ut_ad(skipped < TRX_SYS_N_RSEGS);
if (&rseg == &trx_sys.rseg_array[skipped])
@@ -437,6 +358,19 @@ inline dberr_t purge_sys_t::iterator::free_history_rseg(trx_rseg_t &rseg) const
mtr_t mtr;
bool freed= false;
uint32_t rseg_ref= 0;
+ const auto last_boffset= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE;
+ /* Technically, rseg.space->free_limit is not protected by
+ rseg.latch, which we are holding, but rseg.space->latch. The value
+ that we are reading may become stale (too small) if other pages are
+ being allocated in this tablespace, for other rollback
+ segments. Nothing can be added to this rseg without holding
+ rseg.latch, and hence we can validate the entire file-based list
+ against the limit that we are reading here.
+
+ Note: The read here may look like a data race. On none of our target
+ architectures this should be an actual problem, because the uint32_t
+ value should always fit in a register and be correctly aligned. */
+ const auto last_page= rseg.space->free_limit;
mtr.start();
@@ -452,13 +386,23 @@ func_exit:
}
hdr_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY + rseg_hdr->page.frame);
- hdr_addr.boffset= static_cast<uint16_t>(hdr_addr.boffset -
- TRX_UNDO_HISTORY_NODE);
-loop:
if (hdr_addr.page == FIL_NULL)
goto func_exit;
+ if (hdr_addr.page >= last_page ||
+ hdr_addr.boffset < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE ||
+ hdr_addr.boffset >= last_boffset)
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ hdr_addr.boffset= static_cast<uint16_t>(hdr_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+
+loop:
buf_block_t *b=
buf_page_get_gen(page_id_t(rseg.space->id, hdr_addr.page),
0, RW_X_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
@@ -507,11 +451,18 @@ loop:
fil_addr_t prev_hdr_addr=
flst_get_prev_addr(b->page.frame + hdr_addr.boffset +
TRX_UNDO_HISTORY_NODE);
+ if (prev_hdr_addr.page == FIL_NULL);
+ else if (prev_hdr_addr.page >= last_page ||
+ prev_hdr_addr.boffset < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE ||
+ prev_hdr_addr.boffset >= last_boffset)
+ goto corrupted;
+
prev_hdr_addr.boffset= static_cast<uint16_t>(prev_hdr_addr.boffset -
TRX_UNDO_HISTORY_NODE);
err= flst_remove(rseg_hdr, TRX_RSEG + TRX_RSEG_HISTORY, b,
- uint16_t(hdr_addr.boffset + TRX_UNDO_HISTORY_NODE), &mtr);
+ uint16_t(hdr_addr.boffset + TRX_UNDO_HISTORY_NODE),
+ last_page, &mtr);
if (UNIV_UNLIKELY(err != DB_SUCCESS))
goto func_exit;
@@ -571,45 +522,21 @@ loop:
ut_ad(rseg_hdr->page.id() == rseg.page_id());
mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX);
+ if (hdr_addr.page == FIL_NULL)
+ goto func_exit;
+
goto loop;
}
-/** Cleanse purge queue to remove the rseg that reside in undo-tablespace
-marked for truncate.
-@param[in] space undo tablespace being truncated */
-static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+void purge_sys_t::cleanse_purge_queue(const fil_space_t &space)
{
- typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
- purge_elem_list_t purge_elem_list;
-
- mysql_mutex_lock(&purge_sys.pq_mutex);
-
- /* Remove rseg instances that are in the purge queue before we start
- truncate of corresponding UNDO truncate. */
- while (!purge_sys.purge_queue.empty()) {
- purge_elem_list.push_back(purge_sys.purge_queue.top());
- purge_sys.purge_queue.pop();
- }
-
- for (purge_elem_list_t::iterator it = purge_elem_list.begin();
- it != purge_elem_list.end();
- ++it) {
-
- for (TrxUndoRsegs::iterator it2 = it->begin();
- it2 != it->end();
- ++it2) {
- if ((*it2)->space == &space) {
- it->erase(it2);
- break;
- }
- }
-
- if (!it->empty()) {
- purge_sys.purge_queue.push(*it);
- }
- }
-
- mysql_mutex_unlock(&purge_sys.pq_mutex);
+ mysql_mutex_lock(&pq_mutex);
+ auto purge_elem_list= clone_queue_container();
+ purge_queue.clear();
+ for (auto elem : purge_elem_list)
+ if (purge_queue::rseg(elem)->space != &space)
+ purge_queue.push_trx_no_rseg(elem);
+ mysql_mutex_unlock(&pq_mutex);
}
dberr_t purge_sys_t::iterator::free_history() const
@@ -672,7 +599,9 @@ fil_space_t *purge_sys_t::truncating_tablespace()
if (space || srv_undo_tablespaces_active < 2 || !srv_undo_log_truncate)
return space;
- const uint32_t size= uint32_t(srv_max_undo_log_size >> srv_page_size_shift);
+ const uint32_t size=
+ uint32_t(std::min(ulonglong{std::numeric_limits<uint32_t>::max()},
+ srv_max_undo_log_size >> srv_page_size_shift));
for (uint32_t i= truncate_undo_space.last, j= i;; )
{
if (fil_space_t *s= undo_truncate_try(srv_undo_space_id_start + i, size))
@@ -751,7 +680,7 @@ not_free:
const char *file_name= UT_LIST_GET_FIRST(space->chain)->name;
sql_print_information("InnoDB: Truncating %s", file_name);
- trx_purge_cleanse_purge_queue(*space);
+ purge_sys.cleanse_purge_queue(*space);
/* Lock all modified pages of the tablespace.
@@ -870,13 +799,11 @@ buf_block_t *purge_sys_t::get_page(page_id_t id)
return nullptr;
}
-void purge_sys_t::rseg_get_next_history_log()
+bool purge_sys_t::rseg_get_next_history_log()
{
fil_addr_t prev_log_addr;
-#ifndef SUX_LOCK_GENERIC
- ut_ad(rseg->latch.is_write_locked());
-#endif
+ ut_ad(rseg->latch.have_wr());
ut_a(rseg->last_page_no != FIL_NULL);
tail.trx_no= rseg->last_trx_no() + 1;
@@ -888,21 +815,24 @@ void purge_sys_t::rseg_get_next_history_log()
{
const byte *log_hdr= undo_page->page.frame + rseg->last_offset();
prev_log_addr= flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ if (prev_log_addr.boffset < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE ||
+ prev_log_addr.boffset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE)
+ goto corrupted;
prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset -
TRX_UNDO_HISTORY_NODE);
}
else
- prev_log_addr.page= FIL_NULL;
+ goto corrupted;
- if (prev_log_addr.page == FIL_NULL)
+ if (prev_log_addr.page >= rseg->space->free_limit)
+ corrupted:
rseg->last_page_no= FIL_NULL;
else
{
/* Read the previous log header. */
trx_id_t trx_no= 0;
if (const buf_block_t* undo_page=
- get_page(page_id_t(rseg->space->id,
- prev_log_addr.page)))
+ get_page(page_id_t(rseg->space->id, prev_log_addr.page)))
{
const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset;
trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
@@ -920,12 +850,13 @@ void purge_sys_t::rseg_get_next_history_log()
can never produce events from an empty rollback segment. */
mysql_mutex_lock(&pq_mutex);
- purge_queue.push(*rseg);
+ enqueue(*rseg);
mysql_mutex_unlock(&pq_mutex);
}
}
rseg->latch.wr_unlock();
+ return choose_next_log();
}
/** Position the purge sys "iterator" on the undo record to use for purging.
@@ -933,11 +864,37 @@ void purge_sys_t::rseg_get_next_history_log()
@retval true when purge_sys.rseg->latch was locked */
bool purge_sys_t::choose_next_log()
{
- if (!rseg_iter.set_next())
- return false;
+ ut_ad(!next_stored);
- hdr_offset= rseg->last_offset();
- hdr_page_no= rseg->last_page_no;
+ mysql_mutex_lock(&pq_mutex);
+ if (purge_queue.empty()) {
+ rseg = nullptr;
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ return false;
+ }
+ rseg= purge_queue.pop();
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+
+ /* We assume in purge of externally stored fields that space
+ id is in the range of UNDO tablespace space ids */
+ ut_ad(rseg->space == fil_system.sys_space ||
+ srv_is_undo_tablespace(rseg->space->id));
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ trx_id_t last_trx_no = rseg->last_trx_no();
+ hdr_offset = rseg->last_offset();
+ hdr_page_no = rseg->last_page_no;
+
+ /* Only the purge_coordinator_task will access any of
+ purge_sys.hdr_page_no, purge_sys.tail. The field purge_sys.head and
+ purge_sys.view are modified by clone_end_view() in the
+ purge_coordinator_task while holding exclusive purge_sys.latch. The
+ purge_sys.view may also be modified by wake_if_not_active() while holding
+ exclusive purge_sys.latch. The purge_sys.head may be read by
+ purge_truncation_callback(). */
+ ut_a(hdr_page_no != FIL_NULL);
+ ut_a(tail.trx_no <= last_trx_no);
+ tail.trx_no = last_trx_no;
if (!rseg->needs_purge)
{
@@ -968,7 +925,7 @@ bool purge_sys_t::choose_next_log()
if (!b)
goto purge_nothing;
undo_rec=
- trx_undo_page_get_first_rec(b, page_no, hdr_offset);
+ trx_undo_page_get_first_rec(b, hdr_page_no, hdr_offset);
if (!undo_rec)
goto purge_nothing;
}
@@ -992,18 +949,13 @@ inline trx_purge_rec_t purge_sys_t::get_next_rec(roll_ptr_t roll_ptr)
{
ut_ad(next_stored);
ut_ad(tail.trx_no < low_limit_no());
-#ifndef SUX_LOCK_GENERIC
- ut_ad(rseg->latch.is_write_locked());
-#endif
+ ut_ad(rseg->latch.have_wr());
if (!offset)
{
- /* It is the dummy undo log record, which means that there is no
- need to purge this undo log */
- rseg_get_next_history_log();
-
- /* Look for the next undo log and record to purge */
- if (choose_next_log())
+ /* It is the dummy undo log record, which means that there is no need to
+ purge this undo log. Look for the next undo log and record to purge */
+ if (rseg_get_next_history_log())
rseg->latch.wr_unlock();
return {nullptr, 1};
}
@@ -1051,9 +1003,8 @@ inline trx_purge_rec_t purge_sys_t::get_next_rec(roll_ptr_t roll_ptr)
else
{
got_no_rec:
- rseg_get_next_history_log();
/* Look for the next undo log and record to purge */
- locked= choose_next_log();
+ locked= rseg_get_next_history_log();
}
if (locked)
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 87a2ac7b..964dca94 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -201,7 +201,7 @@ bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ rseg_header->page.frame, XIDDATASIZE);
- return true;
+ return wsrep_is_wsrep_xid(&xid);
}
/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
@@ -210,6 +210,11 @@ bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
@return whether the WSREP XID is present */
static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
{
+ if (memcmp(TRX_SYS + TRX_SYS_WSREP_XID_INFO + page,
+ field_ref_zero, TRX_SYS_WSREP_XID_LEN) == 0) {
+ return false;
+ }
+
if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_MAGIC_N_FLD
+ page)
@@ -232,7 +237,8 @@ static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
memcpy(xid.data,
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
- return true;
+
+ return wsrep_is_wsrep_xid(&xid);
}
/** Recover the latest WSREP checkpoint XID.
@@ -448,7 +454,14 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
{
if (!rseg->space)
return DB_TABLESPACE_NOT_FOUND;
+
+ /* Access the tablespace header page to recover rseg->space->free_limit */
+ page_id_t page_id{rseg->space->id, 0};
dberr_t err;
+ if (!buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err))
+ return err;
+ mtr->release_last_page();
+ page_id.set_page_no(rseg->page_no);
const buf_block_t *rseg_hdr=
buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr,
&err);
@@ -493,10 +506,17 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
trx_sys.recovered_binlog_offset= binlog_offset;
trx_sys.recovered_binlog_is_legacy_pos= false;
}
+ }
#ifdef WITH_WSREP
- trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
+ XID tmp_xid;
+ tmp_xid.null();
+ /* Update recovered wsrep xid only if we found wsrep xid from
+ rseg header page and read xid seqno is larger than currently
+ recovered xid seqno. */
+ if (trx_rseg_read_wsrep_checkpoint(rseg_hdr, tmp_xid) &&
+ wsrep_xid_seqno(&tmp_xid) > wsrep_xid_seqno(&trx_sys.recovered_wsrep_xid))
+ trx_sys.recovered_wsrep_xid.set(&tmp_xid);
#endif
- }
}
if (srv_operation == SRV_OPERATION_RESTORE)
@@ -518,6 +538,11 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
fil_addr_t node_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY +
rseg_hdr->page.frame);
+ if (node_addr.page >= rseg->space->free_limit ||
+ node_addr.boffset < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE ||
+ node_addr.boffset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE)
+ return DB_CORRUPTION;
+
node_addr.boffset= static_cast<uint16_t>(node_addr.boffset -
TRX_UNDO_HISTORY_NODE);
rseg->last_page_no= node_addr.page;
@@ -544,7 +569,7 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
if (rseg->last_page_no != FIL_NULL)
/* There is no need to cover this operation by the purge
mutex because we are still bootstrapping. */
- purge_sys.purge_queue.push(*rseg);
+ purge_sys.enqueue(*rseg);
}
trx_sys.set_undo_non_empty(rseg->history_size > 0);
@@ -567,10 +592,6 @@ static void trx_rseg_init_binlog_info(const page_t* page)
+ TRX_SYS + page);
trx_sys.recovered_binlog_is_legacy_pos= true;
}
-
-#ifdef WITH_WSREP
- trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
-#endif
}
/** Initialize or recover the rollback segments at startup. */
@@ -589,7 +610,17 @@ dberr_t trx_rseg_array_init()
#endif
mtr_t mtr;
dberr_t err = DB_SUCCESS;
-
+ /* mariabackup --prepare only deals with the redo log and the data
+ files, not with transactions or the data dictionary, that's why
+ trx_lists_init_at_db_start() does not invoke purge_sys.create() and
+ purge queue mutex stays uninitialized, and trx_rseg_mem_restore() quits
+ before initializing undo log lists. */
+ if (srv_operation != SRV_OPERATION_RESTORE)
+ /* Acquiring purge queue mutex here should be fine from the
+ deadlock prevention point of view, because executing that
+ function is a prerequisite for starting the purge subsystem or
+ any transactions. */
+ purge_sys.queue_lock();
for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
mtr.start();
if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
@@ -602,7 +633,11 @@ dberr_t trx_rseg_array_init()
+ sys->page.frame);
trx_rseg_init_binlog_info(sys->page.frame);
#ifdef WITH_WSREP
- wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+ if (trx_rseg_init_wsrep_xid(
+ sys->page.frame, trx_sys.recovered_wsrep_xid)) {
+ wsrep_sys_xid.set(
+ &trx_sys.recovered_wsrep_xid);
+ }
#endif
}
@@ -655,7 +690,8 @@ dberr_t trx_rseg_array_init()
mtr.commit();
}
-
+ if (srv_operation != SRV_OPERATION_RESTORE)
+ purge_sys.queue_unlock();
if (err != DB_SUCCESS) {
for (auto& rseg : trx_sys.rseg_array) {
while (auto u = UT_LIST_GET_FIRST(rseg.undo_list)) {
@@ -667,7 +703,7 @@ dberr_t trx_rseg_array_init()
}
#ifdef WITH_WSREP
- if (!wsrep_sys_xid.is_null()) {
+ if (srv_operation == SRV_OPERATION_NORMAL && !wsrep_sys_xid.is_null()) {
/* Upgrade from a version prior to 10.3.5,
where WSREP XID was stored in TRX_SYS page.
If no rollback segment has a WSREP XID set,
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 942b8bd4..1d22b853 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -412,12 +412,12 @@ void trx_t::free()
#endif
read_view.mem_noaccess();
MEM_NOACCESS(&lock, sizeof lock);
- MEM_NOACCESS(&op_info, sizeof op_info);
- MEM_NOACCESS(&isolation_level, sizeof isolation_level);
- MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&op_info, sizeof op_info +
+ sizeof(unsigned) /* isolation_level, snapshot_isolation,
+ check_foreigns, check_unique_secondary,
+ bulk_insert */);
MEM_NOACCESS(&is_registered, sizeof is_registered);
MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
- MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
MEM_NOACCESS(&duplicates, sizeof duplicates);
MEM_NOACCESS(&dict_operation, sizeof dict_operation);
@@ -1142,15 +1142,23 @@ inline void trx_t::write_serialisation_history(mtr_t *mtr)
}
else if (rseg->last_page_no == FIL_NULL)
{
- mysql_mutex_lock(&purge_sys.pq_mutex);
+ /* trx_sys.assign_new_trx_no() and purge_sys.enqueue() must be invoked
+ in the same critical section, protected by the purge queue mutex, so that
+ an rseg with a greater last commit number cannot be pushed to the purge
+ queue before an rseg with a lesser last commit number. In other words,
+ pushing to the purge queue must be serialized along with assigning trx_no.
+ Otherwise the purge coordinator thread could fetch undo log records from
+ the rseg with the greater last commit number before the rseg with the
+ lesser one. */
+ purge_sys.queue_lock();
trx_sys.assign_new_trx_no(this);
const trx_id_t end{rw_trx_hash_element->no};
+ rseg->last_page_no= undo->hdr_page_no;
/* end cannot be less than anything in rseg. User threads only
produce events when a rollback segment is empty. */
- purge_sys.purge_queue.push(TrxUndoRsegs{end, *rseg});
- mysql_mutex_unlock(&purge_sys.pq_mutex);
- rseg->last_page_no= undo->hdr_page_no;
rseg->set_last_commit(undo->hdr_offset, end);
+ purge_sys.enqueue(end, *rseg);
+ purge_sys.queue_unlock();
}
else
trx_sys.assign_new_trx_no(this);
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index ccc68dfe..c0f5b1fb 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -134,8 +134,9 @@ trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
uint16_t offset)
{
uint16_t start= trx_undo_page_get_start(block, page_no, offset);
- return start == trx_undo_page_get_end(block, page_no, offset)
- ? nullptr : block->page.frame + start;
+ uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+ ut_ad(start <= end);
+ return start >= end ? nullptr : block->page.frame + start;
}
/** Get the last undo log record on a page.
@@ -149,8 +150,10 @@ trx_undo_rec_t*
trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
uint16_t offset)
{
+ uint16_t start= trx_undo_page_get_start(block, page_no, offset);
uint16_t end= trx_undo_page_get_end(block, page_no, offset);
- return trx_undo_page_get_start(block, page_no, offset) == end
+ ut_ad(start <= end);
+ return start >= end
? nullptr
: block->page.frame + mach_read_from_2(block->page.frame + end - 2);
}
@@ -510,7 +513,7 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
*err = flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
- mtr);
+ space->free_limit, mtr);
*id = slot_no;
mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
@@ -693,7 +696,8 @@ buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
mtr->undo_create(*new_block);
trx_undo_page_init(*new_block);
*err= flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
- new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ rseg->space->free_limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
new_block= nullptr;
else
@@ -744,9 +748,11 @@ trx_undo_free_page(
buf_page_make_young_if_needed(&header_block->page);
+ const uint32_t limit = rseg->space->free_limit;
+
*err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
- mtr);
+ limit, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return FIL_NULL;
@@ -755,7 +761,13 @@ trx_undo_free_page(
const fil_addr_t last_addr = flst_get_last(
TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ header_block->page.frame);
- if (UNIV_UNLIKELY(last_addr.page == page_no)) {
+ if (UNIV_UNLIKELY(last_addr.page == page_no)
+ || UNIV_UNLIKELY(last_addr.page != FIL_NULL
+ && last_addr.page >= limit)
+ || UNIV_UNLIKELY(last_addr.boffset < TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE)
+ || UNIV_UNLIKELY(last_addr.boffset >= srv_page_size
+ - TRX_UNDO_LOG_OLD_HDR_SIZE)) {
*err = DB_CORRUPTION;
return FIL_NULL;
}
@@ -972,8 +984,8 @@ trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no)
ut_ad(id < TRX_RSEG_N_SLOTS);
mtr.start();
- const buf_block_t* block = buf_page_get(
- page_id_t(rseg->space->id, page_no), 0, RW_X_LATCH, &mtr);
+ const page_id_t page_id{rseg->space->id, page_no};
+ const buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
if (UNIV_UNLIKELY(!block)) {
corrupted:
mtr.commit();
@@ -1075,6 +1087,15 @@ corrupted_type:
fil_addr_t last_addr = flst_get_last(
TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
+ if (last_addr.page >= rseg->space->free_limit
+ || last_addr.boffset < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
+ || last_addr.boffset >= srv_page_size
+ - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+ corrupted_undo:
+ ut_free(undo);
+ goto corrupted;
+ }
+
undo->last_page_no = last_addr.page;
undo->top_page_no = last_addr.page;
@@ -1083,8 +1104,7 @@ corrupted_type:
RW_X_LATCH, &mtr);
if (UNIV_UNLIKELY(!last)) {
- ut_free(undo);
- goto corrupted;
+ goto corrupted_undo;
}
if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
diff --git a/storage/innobase/unittest/CMakeLists.txt b/storage/innobase/unittest/CMakeLists.txt
index 7dd7c111..9330d231 100644
--- a/storage/innobase/unittest/CMakeLists.txt
+++ b/storage/innobase/unittest/CMakeLists.txt
@@ -17,6 +17,10 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include
${CMAKE_SOURCE_DIR}/unittest/mytap
${CMAKE_SOURCE_DIR}/storage/innobase/include
${CMAKE_SOURCE_DIR}/tpool)
+ADD_EXECUTABLE(innodb_rbt-t innodb_rbt-t.cc ../ut/ut0rbt.cc)
+TARGET_LINK_LIBRARIES(innodb_rbt-t mysys mytap)
+ADD_DEPENDENCIES(innodb_rbt-t GenError)
+MY_ADD_TEST(innodb_rbt)
ADD_EXECUTABLE(innodb_fts-t innodb_fts-t.cc)
TARGET_LINK_LIBRARIES(innodb_fts-t mysys mytap)
ADD_DEPENDENCIES(innodb_fts-t GenError)
diff --git a/storage/innobase/unittest/innodb_rbt-t.cc b/storage/innobase/unittest/innodb_rbt-t.cc
new file mode 100644
index 00000000..38b980da
--- /dev/null
+++ b/storage/innobase/unittest/innodb_rbt-t.cc
@@ -0,0 +1,83 @@
+#include "tap.h"
+#include "ut0rbt.h"
+#include "ut0new.h"
+
+/* Stand-ins for symbols that the included InnoDB headers and ut0rbt.cc
+refer to; presumably needed because this test is built from ut0rbt.cc
+alone. Every function stub aborts if it is ever reached. */
+const size_t alloc_max_retries= 0;
+
+void os_thread_sleep(ulint)
+{
+  abort();
+}
+
+void ut_dbg_assertion_failed(const char *, const char *, unsigned)
+{
+  abort();
+}
+
+namespace ib
+{
+fatal_or_error::~fatal_or_error()
+{
+  abort();
+}
+}
+
+#ifdef UNIV_PFS_MEMORY
+PSI_memory_key mem_key_other, mem_key_std;
+
+PSI_memory_key ut_new_get_key_by_file(uint32_t)
+{
+  return mem_key_std;
+}
+#endif
+
+/* Keys used by every test below, listed in ascending order. Many of them are
+larger than 2^31 - 1, so the difference between two elements need not fit in
+a 32-bit int. */
+static const uint64_t doc_ids[]=
+{
+ 103571, 104018, 106821, 108647, 109352, 109379,
+ 110325, 122868, 210682130, 231275441, 234172769, 366236849,
+ 526467159, 1675241735, 1675243405, 1947751899, 1949940363, 2033691953,
+ 2148227299, 2256289791, 2294223591, 2367501260, 2792700091, 2792701220,
+ 2817121627, 2820680352, 2821165664, 3253312130, 3404918378, 3532599429,
+ 3538712078, 3539373037, 3546479309, 3566641838, 3580209634, 3580871267,
+ 3693930556, 3693932734, 3693932983, 3781949558, 3839877411, 3930968983
+};
+
+/** Three-way comparison of two 64-bit document identifiers.
+@param p1  pointer to the first uint64_t
+@param p2  pointer to the second uint64_t
+@return -1, 0 or 1 if *p1 is less than, equal to or greater than *p2 */
+static int fts_doc_id_cmp(const void *p1, const void *p2)
+{
+  const uint64_t lhs= *static_cast<const uint64_t*>(p1);
+  const uint64_t rhs= *static_cast<const uint64_t*>(p2);
+  if (lhs < rhs)
+    return -1;
+  return lhs > rhs ? 1 : 0;
+}
+
+
+/** Deliberately defective comparator: the 64-bit difference is converted
+to int, so the sign of the result is wrong whenever the difference does not
+fit in int. main() passes it with buggy=true to rbt_test().
+@param p1  pointer to the first uint64_t
+@param p2  pointer to the second uint64_t
+@return *p1 - *p2 converted to int */
+static int fts_doc_id_buggy_cmp(const void *p1, const void *p2)
+{
+  const uint64_t lhs= *static_cast<const uint64_t*>(p1);
+  const uint64_t rhs= *static_cast<const uint64_t*>(p2);
+  const uint64_t diff= lhs - rhs;
+  return int(diff);
+}
+
+typedef int (*comparator) (const void*, const void*);
+
+/** Fill the tree with every element of doc_ids[]: look each key up with
+rbt_search() and, when the search returns nonzero, add it with
+rbt_add_node() using the bound that the search filled in.
+@param rbt  tree to fill */
+static void rbt_populate(ib_rbt_t *rbt)
+{
+  ib_rbt_bound_t bound;
+  const size_t n_ids= sizeof doc_ids / sizeof *doc_ids;
+
+  for (size_t i= 0; i < n_ids; i++)
+  {
+    const uint64_t *id= &doc_ids[i];
+    if (rbt_search(rbt, &bound, id) == 0)
+      continue;
+    rbt_add_node(rbt, &bound, id);
+  }
+}
+
+/** Fill the tree with every element of doc_ids[] through rbt_insert(),
+passing the element both as the key and as the value.
+@param rbt  tree to fill */
+static void rbt_populate2(ib_rbt_t *rbt)
+{
+  const uint64_t *const end= doc_ids + sizeof doc_ids / sizeof *doc_ids;
+
+  for (const uint64_t *id= doc_ids; id != end; id++)
+  {
+    rbt_insert(rbt, id, id);
+  }
+}
+
+/** Look up every element of doc_ids[] in the tree.
+@param rbt  tree to search
+@return false as soon as rbt_search() returns nonzero for some element,
+true if it returned zero for all of them */
+static bool rbt_search_all(ib_rbt_t *rbt)
+{
+  ib_rbt_bound_t bound;
+  const size_t n_ids= sizeof doc_ids / sizeof *doc_ids;
+
+  for (size_t i= 0; i < n_ids; i++)
+  {
+    if (rbt_search(rbt, &bound, &doc_ids[i]) != 0)
+      return false;
+  }
+
+  return true;
+}
+
+/** Build a tree with each of the two populate routines and report, via ok(),
+whether searching for all keys gives the expected outcome.
+@param cmp    comparator to create the trees with
+@param buggy  whether cmp is expected to make the search for all keys fail */
+static void rbt_test(comparator cmp, bool buggy)
+{
+  /* First pass: rbt_search() + rbt_add_node() */
+  ib_rbt_t *tree= rbt_create(sizeof(uint64_t), cmp);
+  rbt_populate(tree);
+  const bool found_after_populate= rbt_search_all(tree);
+  ok(found_after_populate != buggy, "search after populate");
+  rbt_free(tree);
+
+  /* Second pass: rbt_insert() */
+  tree= rbt_create(sizeof(uint64_t), cmp);
+  rbt_populate2(tree);
+  const bool found_after_populate2= rbt_search_all(tree);
+  ok(found_after_populate2 != buggy, "search after populate2");
+  rbt_free(tree);
+}
+
+/** Run the scenario with the defective comparator and with the correct one.
+@return the value of exit_status(), so that a failed ok() or a mismatch with
+the announced plan is visible in the process exit code */
+int main ()
+{
+  /* rbt_test() reports two ok() results per call, and it is called twice */
+  plan(4);
+  rbt_test(fts_doc_id_buggy_cmp, true);
+  rbt_test(fts_doc_id_cmp, false);
+  /* Falling off the end of main() would return 0 even after a failed ok() */
+  return exit_status();
+}
diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc
index 7b69042c..5b3bc185 100644
--- a/storage/innobase/ut/ut0ut.cc
+++ b/storage/innobase/ut/ut0ut.cc
@@ -258,47 +258,6 @@ ut_print_name(
}
}
-/** Format a table name, quoted as an SQL identifier.
-If the name contains a slash '/', the result will contain two
-identifiers separated by a period (.), as in SQL
-database_name.table_name.
-@see table_name_t
-@param[in] name table or index name
-@param[out] formatted formatted result, will be NUL-terminated
-@param[in] formatted_size size of the buffer in bytes
-@return pointer to 'formatted' */
-char*
-ut_format_name(
- const char* name,
- char* formatted,
- ulint formatted_size)
-{
- switch (formatted_size) {
- case 1:
- formatted[0] = '\0';
- /* FALL-THROUGH */
- case 0:
- return(formatted);
- }
-
- char* end;
-
- end = innobase_convert_name(formatted, formatted_size,
- name, strlen(name), NULL);
-
- /* If the space in 'formatted' was completely used, then sacrifice
- the last character in order to write '\0' at the end. */
- if ((ulint) (end - formatted) == formatted_size) {
- end--;
- }
-
- ut_a((ulint) (end - formatted) < formatted_size);
-
- *end = '\0';
-
- return(formatted);
-}
-
/**********************************************************************//**
Catenate files. */
void
@@ -353,14 +312,16 @@ ut_strerr(
return("Lock wait");
case DB_DEADLOCK:
return("Deadlock");
+ case DB_RECORD_CHANGED:
+ return("Record changed");
+#ifdef WITH_WSREP
case DB_ROLLBACK:
return("Rollback");
+#endif
case DB_DUPLICATE_KEY:
return("Duplicate key");
case DB_MISSING_HISTORY:
return("Required history data has been deleted");
- case DB_CLUSTER_NOT_FOUND:
- return("Cluster not found");
case DB_TABLE_NOT_FOUND:
return("Table not found");
case DB_TOO_BIG_RECORD: