diff options
Diffstat (limited to 'storage/innobase/include')
44 files changed, 581 insertions, 775 deletions
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index b42c543c..83bdaa97 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -189,13 +189,16 @@ btr_read_autoinc(dict_index_t* index) /** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC, or fall back to MAX(auto_increment_column). -@param[in] table table containing an AUTO_INCREMENT column -@param[in] col_no index of the AUTO_INCREMENT column -@return the AUTO_INCREMENT value -@retval 0 on error or if no AUTO_INCREMENT value was used yet */ -ib_uint64_t -btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no) - MY_ATTRIBUTE((nonnull, warn_unused_result)); +@param table table containing an AUTO_INCREMENT column +@param col_no index of the AUTO_INCREMENT column +@param mysql_version TABLE_SHARE::mysql_version +@param max the maximum value of the AUTO_INCREMENT column +@return the AUTO_INCREMENT value +@retval 0 on error or if no AUTO_INCREMENT value was used yet */ +uint64_t btr_read_autoinc_with_fallback(const dict_table_t *table, + unsigned col_no, ulong mysql_version, + uint64_t max) + MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC. @param[in,out] index clustered index diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h index c66a3bfa..5f84328d 100644 --- a/storage/innobase/include/btr0pcur.h +++ b/storage/innobase/include/btr0pcur.h @@ -28,7 +28,6 @@ Created 2/23/1996 Heikki Tuuri #include "dict0dict.h" #include "btr0cur.h" -#include "buf0block_hint.h" #include "btr0btr.h" #include "gis0rtree.h" @@ -332,8 +331,8 @@ struct btr_pcur_t /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the old_rec record */ btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0); - /** buffer block when the position was stored */ - buf::Block_hint block_when_stored; + /** the page identifier of old_rec */ + page_id_t old_page_id{0,0}; /** the modify clock value of the buffer block when the cursor position was stored */ ib_uint64_t modify_clock= 0; @@ -432,7 +431,8 @@ btr_pcur_open( } /** Open a cursor on the first user record satisfying the search condition; -in case of no match, after the last index record. */ +in case of no match, after the last index record. +@return DB_SUCCESS or error code */ MY_ATTRIBUTE((nonnull, warn_unused_result)) inline dberr_t diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h deleted file mode 100644 index d4fee7c1..00000000 --- a/storage/innobase/include/buf0block_hint.h +++ /dev/null @@ -1,76 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2020, MariaDB Corporation. -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License, version 2.0, as published by the -Free Software Foundation. - -This program is also distributed with certain software (including but not -limited to OpenSSL) that is licensed under separate terms, as designated in a -particular file or component or in included license documentation. The authors -of MySQL hereby grant you an additional permission to link the program and -your derivative works with the separately licensed software that they have -included with MySQL. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, -for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*****************************************************************************/ -#pragma once -#include "buf0buf.h" - -namespace buf { -class Block_hint { -public: - /** Stores the pointer to the block, which is currently buffer-fixed. - @param block a pointer to a buffer-fixed block to be stored */ - inline void store(buf_block_t *block) - { - ut_ad(block->page.buf_fix_count()); - m_block= block; - m_page_id= block->page.id(); - } - - /** Clears currently stored pointer. */ - inline void clear() { m_block= nullptr; } - - /** Invoke f on m_block(which may be null) - @param f The function to be executed. It will be passed the pointer. - If you wish to use the block pointer subsequently, - you need to ensure you buffer-fix it before returning from f. - @return the return value of f - */ - template <typename F> - bool run_with_hint(const F &f) - { - buffer_fix_block_if_still_valid(); - /* m_block could be changed during f() call, so we use local - variable to remember which block we need to unfix */ - buf_block_t *block= m_block; - bool res= f(block); - if (block) - block->page.unfix(); - return res; - } - - buf_block_t *block() const { return m_block; } - - private: - /** The block pointer stored by store(). */ - buf_block_t *m_block= nullptr; - /** If m_block is non-null, the m_block->page.id at time it was stored. */ - page_id_t m_page_id{0, 0}; - - /** A helper function which checks if m_block is not a dangling pointer and - still points to block with page with m_page_id and if so, buffer-fixes it, - otherwise clear()s it */ - void buffer_fix_block_if_still_valid(); -}; -} // namespace buf diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index cd7cc294..c291615c 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -158,14 +158,25 @@ buf_block_free( #define buf_page_get(ID, SIZE, LA, MTR) \ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR) -/** Try to acquire a page latch. -@param rw_latch RW_S_LATCH or RW_X_LATCH +/** Try to buffer-fix a page. @param block guessed block +@param id expected block->page.id() +@return block if it was buffer-fixed +@retval nullptr if the block no longer is valid */ +buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Try to acquire a page latch after buf_page_optimistic_fix(). +@param block buffer-fixed block +@param rw_latch RW_S_LATCH or RW_X_LATCH @param modify_clock expected value of block->modify_clock @param mtr mini-transaction -@return whether the latch was acquired (the page is an allocated file page) */ -bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, - uint64_t modify_clock, mtr_t *mtr); +@return block if the latch was acquired +@retval nullptr if block->unfix() was called because it no longer is valid */ +buf_block_t *buf_page_optimistic_get(buf_block_t *block, + rw_lock_type_t rw_latch, + uint64_t modify_clock, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Try to S-latch a page. Suitable for using when holding the lock_sys latches (as it avoids deadlock). @@ -292,15 +303,6 @@ void buf_block_modify_clock_inc( /*=======================*/ buf_block_t* block); /*!< in: block */ -/********************************************************************//** -Returns the value of the modify clock. The caller must have an s-lock -or x-lock on the block. -@return value */ -UNIV_INLINE -ib_uint64_t -buf_block_get_modify_clock( -/*=======================*/ - buf_block_t* block); /*!< in: block */ #endif /* !UNIV_INNOCHECKSUM */ /** Check if a buffer is all zeroes. @@ -771,17 +773,16 @@ public: @retval DB_FAIL if the page contains the wrong ID */ dberr_t read_complete(const fil_node_t &node); - /** Note that a block is no longer dirty, while not removing - it from buf_pool.flush_list - @param temporary whether the page belongs to the temporary tablespace - @param error whether an error may have occurred while writing */ - inline void write_complete(bool temporary, bool error); + /** Release a write fix after a page write was completed. + @param persistent whether the page belongs to a persistent tablespace + @param error whether an error may have occurred while writing + @param state recently read state() value with the correct io-fix */ + void write_complete(bool persistent, bool error, uint32_t state); /** Write a flushable page to a file or free a freeable block. - @param evict whether to evict the page on write completion @param space tablespace @return whether a page write was initiated and buf_pool.mutex released */ - bool flush(bool evict, fil_space_t *space); + bool flush(fil_space_t *space); /** Notify that a page in a temporary tablespace has been modified. */ void set_temp_modified() @@ -1756,10 +1757,6 @@ public: /** Decrement the number of pending LRU flush */ inline void n_flush_dec(); - /** Decrement the number of pending LRU flush - while holding flush_list_mutex */ - inline void n_flush_dec_holding_mutex(); - /** @return whether flush_list flushing is active */ bool flush_list_active() const { @@ -1912,6 +1909,9 @@ public: /** Free a page whose underlying file page has been freed. */ ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept; + /** Issue a warning that we could not free up buffer pool pages. */ + ATTRIBUTE_COLD void LRU_warn(); + private: /** Temporary memory for page_compressed and encrypted I/O */ struct io_buf_t diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl index b3158cf1..050c8493 100644 --- a/storage/innobase/include/buf0buf.inl +++ b/storage/innobase/include/buf0buf.inl @@ -116,17 +116,3 @@ buf_block_modify_clock_inc( block->modify_clock++; } - -/********************************************************************//** -Returns the value of the modify clock. The caller must have an s-lock -or x-lock on the block. -@return value */ -UNIV_INLINE -ib_uint64_t -buf_block_get_modify_clock( -/*=======================*/ - buf_block_t* block) /*!< in: block */ -{ - ut_ad(block->page.lock.have_any()); - return(block->modify_clock); -} diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 0cce514b..cc32a38a 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -85,16 +85,6 @@ buf_flush_init_for_writing( bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) MY_ATTRIBUTE((warn_unused_result)); -/** Write out dirty blocks from buf_pool.LRU, -and move clean blocks to buf_pool.free. -The caller must invoke buf_dblwr.flush_buffered_writes() -after releasing buf_pool.mutex. -@param max_n wished maximum mumber of blocks flushed -@param evict whether to evict pages after flushing -@return evict ? number of processed pages : number of pages written -@retval 0 if a buf_pool.LRU batch is already running */ -ulint buf_flush_LRU(ulint max_n, bool evict); - /** Wait until a LRU flush batch ends. */ void buf_flush_wait_LRU_batch_end(); /** Wait until all persistent pages are flushed up to a limit. diff --git a/storage/innobase/include/cache.h b/storage/innobase/include/cache.h new file mode 100644 index 00000000..0647cbe6 --- /dev/null +++ b/storage/innobase/include/cache.h @@ -0,0 +1,33 @@ +/***************************************************************************** + +Copyright (c) 2024, MariaDB plc + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#pragma once +#include <cstddef> + +#if defined __x86_64__ || defined __aarch64__ || defined __powerpc64__ +struct pmem_control +{ + void (*persist)(const void *, size_t); +public: + pmem_control(); +}; +extern const pmem_control pmem; +# define pmem_persist(buf, size) pmem.persist(buf, size) +#else +void pmem_persist(const void *buf, size_t size); +#endif diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h index a5356e0d..fcb543eb 100644 --- a/storage/innobase/include/data0data.h +++ b/storage/innobase/include/data0data.h @@ -339,15 +339,12 @@ dtuple_set_types_binary( dtuple_t* tuple, /*!< in: data tuple */ ulint n) /*!< in: number of fields to set */ MY_ATTRIBUTE((nonnull)); -/**********************************************************************//** -Checks if a dtuple contains an SQL null value. -@return TRUE if some field is SQL null */ +/** Checks if a dtuple contains an SQL null value. +@param tuple tuple +@param fields_number number of fields in the tuple to check +@return true if some field is SQL null */ UNIV_INLINE -ibool -dtuple_contains_null( -/*=================*/ - const dtuple_t* tuple) /*!< in: dtuple */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); +bool dtuple_contains_null(const dtuple_t *tuple, ulint fields_number = 0); /**********************************************************//** Checks that a data field is typed. Asserts an error if not. @return TRUE if ok */ diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl index 2d1bf5a2..b6c6ace8 100644 --- a/storage/innobase/include/data0data.inl +++ b/storage/innobase/include/data0data.inl @@ -596,28 +596,18 @@ data_write_sql_null( memset(data, 0, len); } -/**********************************************************************//** -Checks if a dtuple contains an SQL null value. -@return TRUE if some field is SQL null */ +/** Checks if a dtuple contains an SQL null value. +@param tuple tuple +@param fields_number number of fields in the tuple to check +@return true if some field is SQL null */ UNIV_INLINE -ibool -dtuple_contains_null( -/*=================*/ - const dtuple_t* tuple) /*!< in: dtuple */ +bool dtuple_contains_null(const dtuple_t *tuple, ulint fields_number) { - ulint n; - ulint i; - - n = dtuple_get_n_fields(tuple); - - for (i = 0; i < n; i++) { - if (dfield_is_null(dtuple_get_nth_field(tuple, i))) { - - return(TRUE); - } - } - - return(FALSE); + ulint n= fields_number ? fields_number : dtuple_get_n_fields(tuple); + for (ulint i= 0; i < n; i++) + if (dfield_is_null(dtuple_get_nth_field(tuple, i))) + return true; + return false; } /**************************************************************//** diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h index 64182aab..960ec390 100644 --- a/storage/innobase/include/db0err.h +++ b/storage/innobase/include/db0err.h @@ -32,23 +32,25 @@ Created 5/24/1996 Heikki Tuuri enum dberr_t { DB_SUCCESS, - DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new + DB_SUCCESS_LOCKED_REC= 9, /*!< like DB_SUCCESS, but a new explicit record lock was created */ /* The following are error codes */ - DB_ERROR = 11, + DB_RECORD_CHANGED, + DB_ERROR, DB_INTERRUPTED, DB_OUT_OF_MEMORY, DB_OUT_OF_FILE_SPACE, DB_LOCK_WAIT, DB_DEADLOCK, - DB_ROLLBACK, DB_DUPLICATE_KEY, DB_MISSING_HISTORY, /*!< required history data has been deleted due to lack of space in rollback segment */ - DB_CLUSTER_NOT_FOUND = 30, - DB_TABLE_NOT_FOUND, +#ifdef WITH_WSREP + DB_ROLLBACK, +#endif + DB_TABLE_NOT_FOUND= 31, DB_TOO_BIG_RECORD, /*!< a record in an index would not fit on a compressed page, or it would become bigger than 1/2 free space in diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 5fafb2c5..3baac658 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -35,6 +35,7 @@ Created 1/8/1996 Heikki Tuuri #include <my_sys.h> #include <deque> +class MDL_context; class MDL_ticket; /** the first table or index ID for other than hard-coded system tables */ @@ -139,6 +140,21 @@ dict_acquire_mdl_shared(dict_table_t *table, MDL_ticket **mdl, dict_table_op_t table_op= DICT_TABLE_OP_NORMAL); +/** Acquire MDL shared for the table name. +@tparam trylock whether to use non-blocking operation +@param[in,out] table table object +@param[in,out] mdl_context MDL context +@param[out] mdl MDL ticket +@param[in] table_op operation to perform when opening +@return table object after locking MDL shared +@retval nullptr if the table is not readable, or if trylock && MDL blocked */ +template<bool trylock> +__attribute__((nonnull, warn_unused_result)) +dict_table_t* +dict_acquire_mdl_shared(dict_table_t *table, + MDL_context *mdl_context, MDL_ticket **mdl, + dict_table_op_t table_op); + /** Look up a table by numeric identifier. @param[in] table_id table identifier @param[in] dict_locked data dictionary locked @@ -1314,13 +1330,7 @@ class dict_sys_t std::atomic<ulonglong> latch_ex_wait_start; /** the rw-latch protecting the data dictionary cache */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch; -#ifdef UNIV_DEBUG - /** whether latch is being held in exclusive mode (by any thread) */ - Atomic_relaxed<pthread_t> latch_ex; - /** number of S-latch holders */ - Atomic_counter<uint32_t> latch_readers; -#endif + alignas(CPU_LEVEL1_DCACHE_LINESIZE) IF_DBUG(srw_lock_debug,srw_lock) latch; public: /** Indexes of SYS_TABLE[] */ enum @@ -1488,15 +1498,12 @@ public: } #ifdef UNIV_DEBUG - /** @return whether any thread (not necessarily the current thread) - is holding the latch; that is, this check may return false - positives */ - bool frozen() const { return latch_readers || latch_ex; } - /** @return whether any thread (not necessarily the current thread) - is holding a shared latch */ - bool frozen_not_locked() const { return latch_readers; } + /** @return whether the current thread is holding the latch */ + bool frozen() const { return latch.have_any(); } + /** @return whether the current thread is holding a shared latch */ + bool frozen_not_locked() const { return latch.have_rd(); } /** @return whether the current thread holds the exclusive latch */ - bool locked() const { return latch_ex == pthread_self(); } + bool locked() const { return latch.have_wr(); } #endif private: /** Acquire the exclusive latch */ @@ -1511,13 +1518,7 @@ public: /** Exclusively lock the dictionary cache. */ void lock(SRW_LOCK_ARGS(const char *file, unsigned line)) { - if (latch.wr_lock_try()) - { - ut_ad(!latch_readers); - ut_ad(!latch_ex); - ut_d(latch_ex= pthread_self()); - } - else + if (!latch.wr_lock_try()) lock_wait(SRW_LOCK_ARGS(file, line)); } @@ -1530,27 +1531,11 @@ public: ATTRIBUTE_NOINLINE void unfreeze(); #else /** Unlock the data dictionary cache. */ - void unlock() - { - ut_ad(latch_ex == pthread_self()); - ut_ad(!latch_readers); - ut_d(latch_ex= 0); - latch.wr_unlock(); - } + void unlock() { latch.wr_unlock(); } /** Acquire a shared lock on the dictionary cache. */ - void freeze() - { - latch.rd_lock(); - ut_ad(!latch_ex); - ut_d(latch_readers++); - } + void freeze() { latch.rd_lock(); } /** Release a shared lock on the dictionary cache. */ - void unfreeze() - { - ut_ad(!latch_ex); - ut_ad(latch_readers--); - latch.rd_unlock(); - } + void unfreeze() { latch.rd_unlock(); } #endif /** Estimate the used memory occupied by the data dictionary diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index fde2a714..0268a280 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -1010,8 +1010,6 @@ struct dict_index_t { /*!< number of columns the user defined to be in the index: in the internal representation we add more columns */ - unsigned nulls_equal:1; - /*!< if true, SQL NULL == SQL NULL */ unsigned n_uniq:10;/*!< number of fields from the beginning which are enough to determine an index entry uniquely */ @@ -2448,6 +2446,9 @@ public: /** @return number of unique columns in FTS_DOC_ID index */ unsigned fts_n_uniq() const { return versioned() ? 2 : 1; } + /** @return the index for that starts with a specific column */ + dict_index_t *get_index(const dict_col_t &col) const; + /** Create metadata. @param name table name @param space tablespace diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl index d60ee5d9..edb7cf92 100644 --- a/storage/innobase/include/dict0mem.inl +++ b/storage/innobase/include/dict0mem.inl @@ -63,6 +63,5 @@ dict_mem_fill_index_struct( & index->MAX_N_FIELDS; /* The '1 +' above prevents allocation of an empty mem block */ - index->nulls_equal = false; ut_d(index->magic_n = DICT_INDEX_MAGIC_N); } diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h index 06af4dcc..7a4e6760 100644 --- a/storage/innobase/include/dyn0buf.h +++ b/storage/innobase/include/dyn0buf.h @@ -57,11 +57,7 @@ public: /** Gets the number of used bytes in a block. @return number of bytes used */ - ulint used() const - MY_ATTRIBUTE((warn_unused_result)) - { - return(static_cast<ulint>(m_used & ~DYN_BLOCK_FULL_FLAG)); - } + uint32_t used() const { return m_used; } /** Gets pointer to the start of data. @@ -153,8 +149,7 @@ public: /** Storage */ byte m_data[MAX_DATA_SIZE]; - /** number of data bytes used in this block; - DYN_BLOCK_FULL_FLAG is set when the block becomes full */ + /** number of data bytes used in this block */ uint32_t m_used; friend class mtr_buf_t; diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h index 83d0b0d6..af7f663d 100644 --- a/storage/innobase/include/dyn0types.h +++ b/storage/innobase/include/dyn0types.h @@ -33,7 +33,4 @@ Created 2013-03-16 Sunny Bains /** This is the initial 'payload' size of a dynamic array */ #define DYN_ARRAY_DATA_SIZE 512 -/** Flag for dyn_block_t::used that indicates a full block */ -#define DYN_BLOCK_FULL_FLAG 0x1000000UL - #endif /* dyn0types_h */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index cdc32515..dfda1178 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -63,7 +63,7 @@ enum srv_flush_t SRV_LITTLESYNC, /** do not flush after writing */ SRV_NOSYNC, - /** invoke os_file_set_nocache() on data files. This implies using + /** Open or create files with O_DIRECT. This implies using unbuffered I/O but still fdatasync(), because some filesystems might not flush meta-data on write completion */ SRV_O_DIRECT, @@ -347,7 +347,6 @@ struct fil_space_t final ~fil_space_t() { ut_ad(!latch_owner); - ut_ad(!latch_count); latch.destroy(); } @@ -411,9 +410,9 @@ private: /** The reference count */ static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC); /** latch protecting all page allocation bitmap pages */ - srw_lock latch; + IF_DBUG(srw_lock_debug, srw_lock) latch; + /** the thread that holds the exclusive latch, or 0 */ pthread_t latch_owner; - ut_d(Atomic_relaxed<uint32_t> latch_count;) public: /** MariaDB encryption data */ fil_space_crypt_t *crypt_data; @@ -1004,40 +1003,32 @@ public: bool recheck, bool encrypt); #ifdef UNIV_DEBUG - bool is_latched() const { return latch_count != 0; } + bool is_latched() const { return latch.have_any(); } #endif - bool is_owner() const { return latch_owner == pthread_self(); } + bool is_owner() const + { + const bool owner{latch_owner == pthread_self()}; + ut_ad(owner == latch.have_wr()); + return owner; + } /** Acquire the allocation latch in exclusive mode */ void x_lock() { latch.wr_lock(SRW_LOCK_CALL); ut_ad(!latch_owner); latch_owner= pthread_self(); - ut_ad(!latch_count.fetch_add(1)); } /** Release the allocation latch from exclusive mode */ void x_unlock() { - ut_ad(latch_count.fetch_sub(1) == 1); ut_ad(latch_owner == pthread_self()); latch_owner= 0; latch.wr_unlock(); } /** Acquire the allocation latch in shared mode */ - void s_lock() - { - ut_ad(!is_owner()); - latch.rd_lock(SRW_LOCK_CALL); - ut_ad(!latch_owner); - ut_d(latch_count.fetch_add(1)); - } + void s_lock() { latch.rd_lock(SRW_LOCK_CALL); } /** Release the allocation latch from shared mode */ - void s_unlock() - { - ut_ad(latch_count.fetch_sub(1)); - ut_ad(!latch_owner); - latch.rd_unlock(); - } + void s_unlock() { latch.rd_unlock(); } typedef span<const char> name_type; @@ -1637,17 +1628,34 @@ void fil_close_tablespace(uint32_t id); /*******************************************************************//** Allocates and builds a file name from a path, a table or tablespace name and a suffix. The string must be freed by caller with ut_free(). -@param[in] path NULL or the directory path or the full path and filename. +@param[in] path nullptr or the directory path or the full path and filename @param[in] name {} if path is full, or Table/Tablespace name -@param[in] ext the file extension to use -@param[in] trim_name true if the last name on the path should be trimmed. +@param[in] extension the file extension to use +@param[in] trim_name true if the last name on the path should be trimmed @return own: file name */ -char* fil_make_filepath(const char *path, const fil_space_t::name_type &name, - ib_extention ext, bool trim_name); +char* fil_make_filepath_low(const char *path, + const fil_space_t::name_type &name, + ib_extention extension, bool trim_name); char *fil_make_filepath(const char* path, const table_name_t name, ib_extention suffix, bool strip_name); +/** Wrapper function over fil_make_filepath_low to build file name. +@param path nullptr or the directory path or the full path and filename +@param name {} if path is full, or Table/Tablespace name +@param extension the file extension to use +@param trim_name true if the last name on the path should be trimmed +@return own: file name */ +static inline char* +fil_make_filepath(const char* path, const fil_space_t::name_type &name, + ib_extention extension, bool trim_name) +{ + /* If we are going to strip a name off the path, there better be a + path and a new name to put back on. */ + ut_ad(!trim_name || (path && name.data())); + return fil_make_filepath_low(path, name, extension, trim_name); +} + /** Create a tablespace file. @param[in] space_id Tablespace ID @param[in] name Tablespace name in dbname/tablename format. diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 26261554..99459bcb 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -209,24 +209,6 @@ typedef byte fseg_inode_t; static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2}; -#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved - pages in a segment is less than - reserved pages / FSEG_FILLFACTOR, - and there are - at least FSEG_FRAG_LIMIT used pages, - then we allow a new empty extent to - be added to the segment in - fseg_alloc_free_page_general(). - Otherwise, we - use unused pages of the segment. */ - -#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS - /* If the segment has >= this many - used pages, it may be expanded by - allocating extents to the segment; - until that only individual fragment - pages are allocated from the space */ - #define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment is at least this many extents, we allow extents to be put to the free @@ -294,7 +276,7 @@ Determine if a page is marked free. @param[in] descr extent descriptor @param[in] offset page offset within extent @return whether the page is free */ -inline bool xdes_is_free(const xdes_t *descr, ulint offset) +inline bool xdes_is_free(const xdes_t *descr, uint32_t offset) { ut_ad(offset < FSP_EXTENT_SIZE); ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset; diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index c0151b44..1d2b409b 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -163,6 +163,9 @@ struct fts_token_t; struct fts_doc_ids_t; struct fts_index_cache_t; +/** Compare two DOC_ID. */ +int fts_doc_id_cmp(const void *p1, const void *p2) + __attribute__((nonnull, warn_unused_result)); /** Initialize the "fts_table" for internal query into FTS auxiliary tables */ @@ -412,6 +415,9 @@ inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids) mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg)); } +/** Sort an array of doc_id */ +void fts_doc_ids_sort(ib_vector_t *doc_ids); + /******************************************************************//** Notify the FTS system about an operation on an FTS-indexed table. */ void diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index ae0bb036..04faceb9 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -271,27 +271,6 @@ fts_index_fetch_nodes( word, /*!< in: the word to fetch */ fts_fetch_t* fetch) /*!< in: fetch callback.*/ MY_ATTRIBUTE((nonnull)); -/******************************************************************//** -Compare two fts_trx_table_t instances, we actually compare the -table id's here. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_cmp( -/*==============*/ - const void* v1, /*!< in: id1 */ - const void* v2) /*!< in: id2 */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); -/******************************************************************//** -Compare a table id with a trx_table_t table id. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_id_cmp( -/*=================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); #define fts_sql_commit(trx) trx_commit_for_mysql(trx) #define fts_sql_rollback(trx) (trx)->rollback() /******************************************************************//** diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl index 3cb09c92..3d937bb3 100644 --- a/storage/innobase/include/fts0priv.inl +++ b/storage/innobase/include/fts0priv.inl @@ -52,47 +52,3 @@ fts_read_object_id( if the id is HEX or DEC and do the right thing with it. */ return(sscanf(str, UINT64PFx, id) == 1); } - -/******************************************************************//** -Compare two fts_trx_table_t instances. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_cmp( -/*==============*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const dict_table_t* table1 - = (*static_cast<const fts_trx_table_t* const*>(p1))->table; - - const dict_table_t* table2 - = (*static_cast<const fts_trx_table_t* const*>(p2))->table; - - return((table1->id > table2->id) - ? 1 - : (table1->id == table2->id) - ? 0 - : -1); -} - -/******************************************************************//** -Compare a table id with a fts_trx_table_t table id. -@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_table_id_cmp( -/*=================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const uintmax_t* table_id = static_cast<const uintmax_t*>(p1); - const dict_table_t* table2 - = (*static_cast<const fts_trx_table_t* const*>(p2))->table; - - return((*table_id > table2->id) - ? 1 - : (*table_id == table2->id) - ? 0 - : -1); -} diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h index fb278d54..7b95348b 100644 --- a/storage/innobase/include/fts0types.h +++ b/storage/innobase/include/fts0types.h @@ -278,44 +278,6 @@ struct fts_token_t { extern const fts_index_selector_t fts_index_selector[]; /******************************************************************//** -Compare two fts_trx_row_t instances doc_ids. */ -UNIV_INLINE -int -fts_trx_row_doc_id_cmp( -/*===================*/ - /*!< out: - < 0 if n1 < n2, - 0 if n1 == n2, - > 0 if n1 > n2 */ - const void* p1, /*!< in: id1 */ - const void* p2); /*!< in: id2 */ - -/******************************************************************//** -Compare two fts_ranking_t instances doc_ids. */ -UNIV_INLINE -int -fts_ranking_doc_id_cmp( -/*===================*/ - /*!< out: - < 0 if n1 < n2, - 0 if n1 == n2, - > 0 if n1 > n2 */ - const void* p1, /*!< in: id1 */ - const void* p2); /*!< in: id2 */ - -/******************************************************************//** -Compare two doc_ids. */ -UNIV_INLINE -int fts_doc_id_cmp( -/*==================*/ - /*!< out: - < 0 if n1 < n2, - 0 if n1 == n2, - > 0 if n1 > n2 */ - const void* p1, /*!< in: id1 */ - const void* p2); /*!< in: id2 */ - -/******************************************************************//** Duplicate a string. */ UNIV_INLINE void diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl index facc1e5c..5b57cad7 100644 --- a/storage/innobase/include/fts0types.inl +++ b/storage/innobase/include/fts0types.inl @@ -47,53 +47,6 @@ fts_string_dup( } /******************************************************************//** -Compare two fts_trx_row_t doc_ids. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_trx_row_doc_id_cmp( -/*===================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; - const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; - - return((int)(tr1->doc_id - tr2->doc_id)); -} - -/******************************************************************//** -Compare two fts_ranking_t doc_ids. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_ranking_doc_id_cmp( -/*===================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; - const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; - - return((int)(rk1->doc_id - rk2->doc_id)); -} - -/******************************************************************//** -Compare two doc_ids. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int fts_doc_id_cmp( -/*==================*/ - const void* p1, /*!< in: id1 */ - const void* p2) /*!< in: id2 */ -{ - const doc_id_t* up1 = static_cast<const doc_id_t*>(p1); - const doc_id_t* up2 = static_cast<const doc_id_t*>(p2); - - return static_cast<int>(*up1 - *up2); -} - -/******************************************************************//** Get the first character's code position for FTS index partition */ extern ulint diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h index 746dab80..1adec365 100644 --- a/storage/innobase/include/fut0lst.h +++ b/storage/innobase/include/fut0lst.h @@ -78,34 +78,40 @@ void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr) MY_ATTRIBUTE((nonnull)); /** Append a file list node to a list. -@param[in,out] base base node block -@param[in] boffset byte offset of the base node -@param[in,out] add block to be added -@param[in] aoffset byte offset of the node to be added -@param[in,out] mtr mini-transaction +@param base base node block +@param boffset byte offset of the base node +@param add block to be added +@param aoffset byte offset of the node to be added +@param limit fil_space_t::free_limit +@param mtr mini-transaction @return error code */ dberr_t flst_add_last(buf_block_t *base, uint16_t boffset, - buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + buf_block_t *add, uint16_t aoffset, + uint32_t limit, mtr_t *mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Prepend a file list node to a list. -@param[in,out] base base node block -@param[in] boffset byte offset of the base node -@param[in,out] add block to be added -@param[in] aoffset byte offset of the node to be added -@param[in,out] mtr mini-transaction +@param base base node block +@param boffset byte offset of the base node +@param add block to be added +@param aoffset byte offset of the node to be added +@param limit fil_space_t::free_limit +@param mtr mini-transaction @return error code */ dberr_t flst_add_first(buf_block_t *base, uint16_t boffset, - buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + buf_block_t *add, uint16_t aoffset, + uint32_t limit, mtr_t *mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** Remove a file list node. -@param[in,out] base base node block -@param[in] boffset byte offset of the base node -@param[in,out] cur block to be removed -@param[in] coffset byte offset of the current record to be removed -@param[in,out] mtr mini-transaction +@param base base node block +@param boffset byte offset of the base node +@param cur block to be removed +@param coffset byte offset of the current record to be removed +@param limit fil_space_t::free_limit +@param mtr mini-transaction @return error code */ dberr_t flst_remove(buf_block_t *base, uint16_t boffset, - buf_block_t *cur, uint16_t coffset, mtr_t *mtr) + buf_block_t *cur, uint16_t coffset, + uint32_t limit, mtr_t *mtr) MY_ATTRIBUTE((nonnull, warn_unused_result)); /** @return the length of a list */ @@ -117,11 +123,9 @@ inline uint32_t flst_get_len(const flst_base_node_t *base) /** @return a file address */ inline fil_addr_t flst_read_addr(const byte *faddr) { - fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE), - mach_read_from_2(faddr + FIL_ADDR_BYTE) }; - ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); - ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); - return addr; + ut_ad(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + return fil_addr_t{mach_read_from_4(faddr + FIL_ADDR_PAGE), + mach_read_from_2(faddr + FIL_ADDR_BYTE)}; } /** @return list first node address */ diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h index d6a4ef67..2dc25a89 100644 --- a/storage/innobase/include/gis0type.h +++ b/storage/innobase/include/gis0type.h @@ -66,10 +66,7 @@ typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector; /* Structure for matched records on the leaf page */ typedef struct matched_rec { - byte* bufp; /*!< aligned buffer point */ - byte rec_buf[UNIV_PAGE_SIZE_MAX * 2]; - /*!< buffer used to copy matching rec */ - buf_block_t block; /*!< the shadow buffer block */ + buf_block_t* block; /*!< the shadow buffer block */ ulint used; /*!< memory used */ rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */ mysql_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs @@ -107,7 +104,6 @@ typedef struct rtr_info{ /*!< mutex protect the "path" vector */ rtr_mbr_t mbr; /*!< the search MBR */ que_thr_t* thr; /*!< the search thread */ - mem_heap_t* heap; /*!< memory heap */ btr_cur_t* cursor; /*!< cursor used for search */ dict_index_t* index; /*!< index it is searching */ bool need_prdt_lock; diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 59ee7f55..08b9f4bc 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -438,6 +438,13 @@ dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode, bool no_wait= false) MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Lock the child tables of a table. +@param table parent table +@param trx transaction +@return error code */ +dberr_t lock_table_children(dict_table_t *table, trx_t *trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + /** Exclusively lock the data dictionary tables. @param trx dictionary transaction @return error code @@ -724,13 +731,8 @@ private: bool m_initialised; /** mutex proteting the locks */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch; -#ifdef UNIV_DEBUG - /** The owner of exclusive latch (0 if none); protected by latch */ - std::atomic<pthread_t> writer{0}; - /** Number of shared latches */ - std::atomic<ulint> readers{0}; -#endif + alignas(CPU_LEVEL1_DCACHE_LINESIZE) + IF_DBUG(srw_lock_debug,srw_spin_lock) latch; #ifdef SUX_LOCK_GENERIC protected: /** mutex for hash_latch::wait() */ @@ -789,71 +791,35 @@ public: void wr_lock() { mysql_mutex_assert_not_owner(&wait_mutex); - ut_ad(!is_writer()); latch.wr_lock(); - ut_ad(!writer.exchange(pthread_self(), - std::memory_order_relaxed)); } /** Release exclusive lock_sys.latch */ - void wr_unlock() - { - ut_ad(writer.exchange(0, std::memory_order_relaxed) == - pthread_self()); - latch.wr_unlock(); - } + void wr_unlock() { latch.wr_unlock(); } /** Acquire shared lock_sys.latch */ void rd_lock() { mysql_mutex_assert_not_owner(&wait_mutex); - ut_ad(!is_writer()); latch.rd_lock(); - ut_ad(!writer.load(std::memory_order_relaxed)); - ut_d(readers.fetch_add(1, std::memory_order_relaxed)); } /** Release shared lock_sys.latch */ - void rd_unlock() - { - ut_ad(!is_writer()); - ut_ad(readers.fetch_sub(1, std::memory_order_relaxed)); - latch.rd_unlock(); - } + void rd_unlock() { latch.rd_unlock(); } #endif /** Try to acquire exclusive lock_sys.latch @return whether the latch was acquired */ - bool wr_lock_try() - { - ut_ad(!is_writer()); - if (!latch.wr_lock_try()) return false; - ut_ad(!writer.exchange(pthread_self(), - std::memory_order_relaxed)); - return true; - } + bool wr_lock_try() { return latch.wr_lock_try(); } /** Try to acquire shared lock_sys.latch @return whether the latch was acquired */ - bool rd_lock_try() - { - ut_ad(!is_writer()); - if (!latch.rd_lock_try()) return false; - ut_ad(!writer.load(std::memory_order_relaxed)); - ut_d(readers.fetch_add(1, std::memory_order_relaxed)); - return true; - } + bool rd_lock_try() { return latch.rd_lock_try(); } /** Assert that wr_lock() has been invoked by this thread */ - void assert_locked() const { ut_ad(is_writer()); } + void assert_locked() const { ut_ad(latch.have_wr()); } /** Assert that wr_lock() has not been invoked by this thread */ - void assert_unlocked() const { ut_ad(!is_writer()); } + void assert_unlocked() const { ut_ad(!latch.have_wr()); } #ifdef UNIV_DEBUG /** @return whether the current thread is the lock_sys.latch writer */ - bool is_writer() const - { -# ifdef SUX_LOCK_GENERIC - return writer.load(std::memory_order_relaxed) == pthread_self(); -# else - return writer.load(std::memory_order_relaxed) == pthread_self() || - (xtest() && !latch.is_locked_or_waiting()); -# endif - } + bool is_writer() const { return latch.have_wr(); } + /** @return whether the current thread is holding lock_sys.latch */ + bool is_holder() const { return latch.have_any(); } /** Assert that a lock shard is exclusively latched (by some thread) */ void assert_locked(const lock_t &lock) const; /** Assert that a table lock shard is exclusively latched by this thread */ @@ -965,14 +931,14 @@ extern lock_sys_t lock_sys; /** @return the index of an array element */ inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const { - ut_ad(lock_sys.is_writer() || lock_sys.readers); + ut_ad(lock_sys.is_holder()); return calc_hash(fold, n_cells); } /** Get a hash table cell. */ inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const { - ut_ad(lock_sys.is_writer() || lock_sys.readers); + ut_ad(lock_sys.is_holder()); return &array[calc_hash(fold)]; } diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h index 22c0c963..2500ac05 100644 --- a/storage/innobase/include/log0crypt.h +++ b/storage/innobase/include/log0crypt.h @@ -28,6 +28,9 @@ MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation. #include "log0log.h" +/** innodb_encrypt_log: whether to encrypt the redo log */ +extern my_bool srv_encrypt_log; + /** Initialize the redo log encryption key and random parameters when creating a new redo log. The random parameters will be persisted in the log header. diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 54851ca0..cef0dcae 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -132,6 +132,9 @@ public: /** Redo log buffer */ struct log_t { + /** The maximum buf_size */ + static constexpr unsigned buf_size_max= os_file_request_size_max; + /** The original (not version-tagged) InnoDB redo log format */ static constexpr uint32_t FORMAT_3_23= 0; /** The MySQL 5.7.9/MariaDB 10.2.2 log format */ @@ -165,60 +168,92 @@ struct log_t static constexpr lsn_t FIRST_LSN= START_OFFSET; private: - /** The log sequence number of the last change of durable InnoDB files */ + /** the lock bit in buf_free */ + static constexpr size_t buf_free_LOCK= ~(~size_t{0} >> 1); alignas(CPU_LEVEL1_DCACHE_LINESIZE) + /** first free offset within buf used; + the most significant bit is set by lock_lsn() to protect this field + as well as write_to_buf, waits */ + std::atomic<size_t> buf_free; +public: + /** number of write requests (to buf); protected by lock_lsn() or lsn_lock */ + size_t write_to_buf; + /** log record buffer, written to by mtr_t::commit() */ + byte *buf; +private: + /** The log sequence number of the last change of durable InnoDB files; + protected by lock_lsn() or lsn_lock or latch.wr_lock() */ std::atomic<lsn_t> lsn; /** the first guaranteed-durable log sequence number */ std::atomic<lsn_t> flushed_to_disk_lsn; - /** log sequence number when log resizing was initiated, or 0 */ - std::atomic<lsn_t> resize_lsn; - /** set when there may be need to initiate a log checkpoint. - This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ - std::atomic<bool> need_checkpoint; +public: + /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ + size_t waits; + /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */ + unsigned buf_size; + /** log file size in bytes, including the header */ + lsn_t file_size; + +#ifdef LOG_LATCH_DEBUG + typedef srw_lock_debug log_rwlock; + typedef srw_mutex log_lsn_lock; -#if defined(__aarch64__) - /* On ARM, we do more spinning */ + bool latch_have_wr() const { return latch.have_wr(); } + bool latch_have_rd() const { return latch.have_rd(); } + bool latch_have_any() const { return latch.have_any(); } +#else +# ifndef UNIV_DEBUG +# elif defined SUX_LOCK_GENERIC + bool latch_have_wr() const { return true; } + bool latch_have_rd() const { return true; } + bool latch_have_any() const { return true; } +# else + bool latch_have_wr() const { return latch.is_write_locked(); } + bool latch_have_rd() const { return latch.is_locked(); } + bool latch_have_any() const { return latch.is_locked(); } +# endif +# ifdef __aarch64__ + /* On ARM, we spin more */ typedef srw_spin_lock log_rwlock; typedef pthread_mutex_wrapper<true> log_lsn_lock; -#else +# else typedef srw_lock log_rwlock; typedef srw_mutex log_lsn_lock; +# endif #endif - -public: - /** rw-lock protecting writes to buf; normal mtr_t::commit() - outside any log checkpoint is covered by a shared latch */ + /** exclusive latch for checkpoint, shared for mtr_t::commit() to buf */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch; -private: - /** mutex protecting buf_free et al, together with latch */ - log_lsn_lock lsn_lock; -public: - /** first free offset within buf use; protected by lsn_lock */ - Atomic_relaxed<size_t> buf_free; - /** number of write requests (to buf); protected by lsn_lock */ - size_t write_to_buf; - /** number of append_prepare_wait(); protected by lsn_lock */ - size_t waits; -private: + + /** number of std::swap(buf, flush_buf) and writes from buf to log; + protected by latch.wr_lock() */ + ulint write_to_log; + /** Last written LSN */ lsn_t write_lsn; -public: - /** log record buffer, written to by mtr_t::commit() */ - byte *buf; + /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() In write_buf(), buf and flush_buf are swapped */ byte *flush_buf; - /** number of std::swap(buf, flush_buf) and writes from buf to log; - protected by latch.wr_lock() */ - ulint write_to_log; - + /** set when there may be need to initiate a log checkpoint. + This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */ + std::atomic<bool> need_checkpoint; + /** whether a checkpoint is pending; protected by latch.wr_lock() */ + Atomic_relaxed<bool> checkpoint_pending; + /** next checkpoint number (protected by latch.wr_lock()) */ + byte next_checkpoint_no; + /** recommended maximum buf_free size, after which the buffer is flushed */ + unsigned max_buf_free; /** Log sequence number when a log file overwrite (broken crash recovery) was noticed. Protected by latch.wr_lock(). */ lsn_t overwrite_warned; - /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */ - size_t buf_size; + /** latest completed checkpoint (protected by latch.wr_lock()) */ + Atomic_relaxed<lsn_t> last_checkpoint_lsn; + /** next checkpoint LSN (protected by latch.wr_lock()) */ + lsn_t next_checkpoint_lsn; + /** Log file */ + log_file_t log; private: /** Log file being constructed during resizing; protected by latch */ log_file_t resize_log; @@ -229,18 +264,14 @@ private: /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; - void init_lsn_lock() {lsn_lock.init(); } - void lock_lsn() { lsn_lock.wr_lock(); } - void unlock_lsn() {lsn_lock.wr_unlock(); } - void destroy_lsn_lock() { lsn_lock.destroy(); } - -public: - /** recommended maximum size of buf, after which the buffer is flushed */ - size_t max_buf_free; + /** Special implementation of lock_lsn() for IA-32 and AMD64 */ + void lsn_lock_bts() noexcept; + /** Acquire a lock for updating buf_free and related fields. + @return the value of buf_free */ + size_t lock_lsn() noexcept; - /** log file size in bytes, including the header */ - lsn_t file_size; -private: + /** log sequence number when log resizing was initiated, or 0 */ + std::atomic<lsn_t> resize_lsn; /** the log sequence number at the start of the log file */ lsn_t first_lsn; #if defined __linux__ || defined _WIN32 @@ -250,8 +281,6 @@ private: public: /** format of the redo log: e.g., FORMAT_10_8 */ uint32_t format; - /** Log file */ - log_file_t log; #if defined __linux__ || defined _WIN32 /** whether file system caching is enabled for the log */ my_bool log_buffered; @@ -279,21 +308,29 @@ public: /*!< this is the maximum allowed value for lsn - last_checkpoint_lsn when a new query step is started */ - /** latest completed checkpoint (protected by latch.wr_lock()) */ - Atomic_relaxed<lsn_t> last_checkpoint_lsn; - /** next checkpoint LSN (protected by log_sys.latch) */ - lsn_t next_checkpoint_lsn; - /** next checkpoint number (protected by latch.wr_lock()) */ - ulint next_checkpoint_no; - /** whether a checkpoint is pending */ - Atomic_relaxed<bool> checkpoint_pending; /** buffer for checkpoint header */ byte *checkpoint_buf; /* @} */ +private: + /** A lock when the spin-only lock_lsn() is not being used */ + log_lsn_lock lsn_lock; +public: + bool is_initialised() const noexcept { return max_buf_free != 0; } + /** whether there is capacity in the log buffer */ + bool buf_free_ok() const noexcept + { + ut_ad(!is_pmem()); + return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < + max_buf_free; + } + + void set_buf_free(size_t f) noexcept + { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } + #ifdef HAVE_PMEM bool is_pmem() const noexcept { return !flush_buf; } #else @@ -302,7 +339,7 @@ public: bool is_opened() const noexcept { return log.is_opened(); } - /** @return target write LSN to react on buf_free >= max_buf_free */ + /** @return target write LSN to react on !buf_free_ok() */ inline lsn_t get_write_target() const; /** @return LSN at which log resizing was started and is still in progress @@ -402,9 +439,7 @@ public: void set_recovered_lsn(lsn_t lsn) noexcept { -#ifndef SUX_LOCK_GENERIC - ut_ad(latch.is_write_locked()); -#endif /* SUX_LOCK_GENERIC */ + ut_ad(latch_have_wr()); write_lsn= lsn; this->lsn.store(lsn, std::memory_order_relaxed); flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); @@ -444,17 +479,23 @@ public: private: /** Wait in append_prepare() for buffer to become available - @param lsn log sequence number to write up to - @param ex whether log_sys.latch is exclusively locked */ - ATTRIBUTE_COLD void append_prepare_wait(lsn_t lsn, bool ex) noexcept; + @tparam spin whether to use the spin-only lock_lsn() + @param b the value of buf_free + @param ex whether log_sys.latch is exclusively locked + @param lsn log sequence number to write up to + @return the new value of buf_free */ + template<bool spin> + ATTRIBUTE_COLD size_t append_prepare_wait(size_t b, bool ex, lsn_t lsn) + noexcept; public: /** Reserve space in the log buffer for appending data. + @tparam spin whether to use the spin-only lock_lsn() @tparam pmem log_sys.is_pmem() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template<bool pmem> - inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept; + template<bool spin,bool pmem> + std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. @param d destination @@ -462,9 +503,7 @@ public: @param size length of str, in bytes */ void append(byte *&d, const void *s, size_t size) noexcept { -#ifndef SUX_LOCK_GENERIC - ut_ad(latch.is_locked()); -#endif + ut_ad(latch_have_any()); ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size)); memcpy(d, s, size); d+= size; diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index c916edc9..bfa66216 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -695,14 +695,40 @@ private: /** Encrypt the log */ ATTRIBUTE_NOINLINE void encrypt(); + /** Commit the mini-transaction log. + @tparam pmem log_sys.is_pmem() + @param mtr mini-transaction + @param lsns {start_lsn,flush_ahead} */ + template<bool pmem> + static void commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns); + /** Append the redo log records to the redo log buffer. @return {start_lsn,flush_ahead} */ std::pair<lsn_t,page_flush_ahead> do_write(); /** Append the redo log records to the redo log buffer. + @tparam spin whether to use the spin-only log_sys.lock_lsn() + @tparam pmem log_sys.is_pmem() + @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead} */ - std::pair<lsn_t,page_flush_ahead> finish_write(size_t len); + template<bool spin,bool pmem> static + std::pair<lsn_t,page_flush_ahead> finish_writer(mtr_t *mtr, size_t len); + + /** The applicable variant of commit_log() */ + static void (*commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>); + /** The applicable variant of finish_writer() */ + static std::pair<lsn_t,page_flush_ahead> (*finisher)(mtr_t *, size_t); + + std::pair<lsn_t,page_flush_ahead> finish_write(size_t len) + { return finisher(this, len); } +public: + /** Poll interval in log_sys.lock_lsn(); 0 to use log_sys.lsn_lock. + Protected by LOCK_global_system_variables and log_sys.latch. */ + static unsigned spin_wait_delay; + /** Update finisher when spin_wait_delay is changing to or from 0. */ + static void finisher_update(); +private: /** Release all latches. */ void release(); diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index c8374515..7eba359f 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -46,6 +46,18 @@ Created 10/21/1995 Heikki Tuuri #include <time.h> #endif /* !_WIN32 */ +/** The maximum size of a read or write request. + +According to Linux "man 2 read" and "man 2 write" this applies to +both 32-bit and 64-bit systems. + +On FreeBSD, the limit is close to the Linux one, INT_MAX. + +On Microsoft Windows, the limit is UINT_MAX (4 GiB - 1). + +On other systems, the limit typically is up to SSIZE_T_MAX. */ +static constexpr unsigned os_file_request_size_max= 0x7ffff000; + extern bool os_has_said_disk_full; /** File offset in bytes */ @@ -109,25 +121,21 @@ struct pfs_os_file_t /** Options for os_file_create_func @{ */ enum os_file_create_t { - OS_FILE_OPEN = 51, /*!< to open an existing file (if - doesn't exist, error) */ - OS_FILE_CREATE, /*!< to create new file (if - exists, error) */ - OS_FILE_OVERWRITE, /*!< to create a new file, if exists - the overwrite old file */ - OS_FILE_OPEN_RAW, /*!< to open a raw device or disk - partition */ - OS_FILE_CREATE_PATH, /*!< to create the directories */ - OS_FILE_OPEN_RETRY, /*!< open with retry */ - - /** Flags that can be combined with the above values. Please ensure - that the above values stay below 128. */ - - OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */ - OS_FILE_ON_ERROR_SILENT = 256 /*!< don't print diagnostic messages to - the log unless it is a fatal error, - this flag is only used if - ON_ERROR_NO_EXIT is set */ + /** create a new file */ + OS_FILE_CREATE= 0, + /** open an existing file */ + OS_FILE_OPEN, + /** retry opening an existing file */ + OS_FILE_OPEN_RETRY, + /** open a raw block device */ + OS_FILE_OPEN_RAW, + + /** do not display diagnostic messages */ + OS_FILE_ON_ERROR_SILENT= 4, + + OS_FILE_CREATE_SILENT= OS_FILE_CREATE | OS_FILE_ON_ERROR_SILENT, + OS_FILE_OPEN_SILENT= OS_FILE_OPEN | OS_FILE_ON_ERROR_SILENT, + OS_FILE_OPEN_RETRY_SILENT= OS_FILE_OPEN_RETRY | OS_FILE_ON_ERROR_SILENT }; static const ulint OS_FILE_READ_ONLY = 333; @@ -144,7 +152,7 @@ static const ulint OS_FILE_NORMAL = 62; /** Types for file create @{ */ static constexpr ulint OS_DATA_FILE = 100; static constexpr ulint OS_LOG_FILE = 101; -#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +#if defined _WIN32 || defined O_DIRECT static constexpr ulint OS_DATA_FILE_NO_O_DIRECT = 103; #endif /* @} */ @@ -191,14 +199,10 @@ public: WRITE_ASYNC= WRITE_SYNC | 1, /** A doublewrite batch */ DBLWR_BATCH= WRITE_ASYNC | 8, - /** Write data; evict the block on write completion */ - WRITE_LRU= WRITE_ASYNC | 32, /** Write data and punch hole for the rest */ - PUNCH= WRITE_ASYNC | 64, - /** Write data and punch hole; evict the block on write completion */ - PUNCH_LRU= PUNCH | WRITE_LRU, + PUNCH= WRITE_ASYNC | 16, /** Zero out a range of bytes in fil_space_t::io() */ - PUNCH_RANGE= WRITE_SYNC | 128, + PUNCH_RANGE= WRITE_SYNC | 32, }; constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot, @@ -211,7 +215,6 @@ public: bool is_read() const { return (type & READ_SYNC) != 0; } bool is_write() const { return (type & WRITE_SYNC) != 0; } - bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; } bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } void write_complete(int io_error) const; @@ -349,7 +352,7 @@ A simple function to open or create a file. pfs_os_file_t os_file_create_simple_func( const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success); @@ -358,7 +361,7 @@ os_file_create_simple_func( os_file_create_simple_no_error_handling(), not directly this function! A simple function to open or create a file. @param[in] name name of the file or path as a null-terminated string -@param[in] create_mode create mode +@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file @@ -369,28 +372,12 @@ A simple function to open or create a file. pfs_os_file_t os_file_create_simple_no_error_handling_func( const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success) MY_ATTRIBUTE((warn_unused_result)); -#ifndef HAVE_FCNTL_DIRECT -#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0) -#else -/** Tries to disable OS caching on an opened file descriptor. -@param[in] fd file descriptor to alter -@param[in] file_name file name, used in the diagnostic message -@param[in] name "open" or "create"; used in the diagnostic - message */ -void -os_file_set_nocache( -/*================*/ - int fd, /*!< in: file descriptor to alter */ - const char* file_name, - const char* operation_name); -#endif - #ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */ /** Obtain an exclusive lock on a file. @param fd file descriptor @@ -419,7 +406,7 @@ Opens an existing file or creates a new. pfs_os_file_t os_file_create_func( const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint purpose, ulint type, bool read_only, @@ -617,7 +604,7 @@ pfs_os_file_t pfs_os_file_create_simple_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success, @@ -633,7 +620,7 @@ monitor file creation/open. @param[in] key Performance Schema Key @param[in] name name of the file or path as a null-terminated string -@param[in] create_mode create mode +@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file @@ -648,7 +635,7 @@ pfs_os_file_t pfs_os_file_create_simple_no_error_handling_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success, @@ -681,7 +668,7 @@ pfs_os_file_t pfs_os_file_create_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint purpose, ulint type, bool read_only, diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl index 7de31505..a7603028 100644 --- a/storage/innobase/include/os0file.inl +++ b/storage/innobase/include/os0file.inl @@ -45,7 +45,7 @@ pfs_os_file_t pfs_os_file_create_simple_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success, @@ -80,7 +80,7 @@ monitor file creation/open. @param[in] key Performance Schema Key @param[in] name name of the file or path as a null-terminated string -@param[in] create_mode create mode +@param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file @@ -95,7 +95,7 @@ pfs_os_file_t pfs_os_file_create_simple_no_error_handling_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint access_type, bool read_only, bool* success, @@ -146,7 +146,7 @@ pfs_os_file_t pfs_os_file_create_func( mysql_pfs_key_t key, const char* name, - ulint create_mode, + os_file_create_t create_mode, ulint purpose, ulint type, bool read_only, diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index 93ea650d..1c2af128 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -165,14 +165,11 @@ row_merge_drop_indexes( prepare_inplace_alter_table_dict(). */ void row_merge_drop_temp_indexes(); -/** Create temporary merge files in the given paramater path, and if -UNIV_PFS_IO defined, register the file descriptor with Performance Schema. -@param[in] path location for creating temporary merge files, or NULL +/** Create a temporary file at the specified path. +@param path location for creating temporary merge files, or nullptr @return File descriptor */ -pfs_os_file_t -row_merge_file_create_low( - const char* path) - MY_ATTRIBUTE((warn_unused_result)); +pfs_os_file_t row_merge_file_create_low(const char *path) + MY_ATTRIBUTE((warn_unused_result)); /*********************************************************************//** Destroy a merge file. And de-register the file from Performance Schema if UNIV_PFS_IO is defined. */ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h index a1350740..7056c77f 100644 --- a/storage/innobase/include/row0row.h +++ b/storage/innobase/include/row0row.h @@ -370,6 +370,12 @@ row_search_index_entry( mtr_t* mtr) /*!< in: mtr */ MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Get the byte offset of the DB_TRX_ID column +@param[in] rec clustered index record +@param[in] index clustered index +@return the byte offset of DB_TRX_ID, from the start of rec */ +ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index); + #define ROW_COPY_DATA 1 #define ROW_COPY_POINTERS 2 diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h index 8134c60f..54e4a1d2 100644 --- a/storage/innobase/include/row0sel.h +++ b/storage/innobase/include/row0sel.h @@ -115,8 +115,8 @@ row_sel_convert_mysql_key_to_innobase( ulint buf_len, /*!< in: buffer length */ dict_index_t* index, /*!< in: index of the key value */ const byte* key_ptr, /*!< in: MySQL key value */ - ulint key_len); /*!< in: MySQL key value length */ - + ulint key_len) /*!< in: MySQL key value length */ + MY_ATTRIBUTE((nonnull(1,4,5))); /** Search for rows in the database using cursor. Function is mainly used for tables that are shared across connections and diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 51f3049b..2ed26748 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -194,7 +194,6 @@ enum monitor_id_t { MONITOR_FLUSH_ADAPTIVE_AVG_PASS, MONITOR_LRU_GET_FREE_LOOPS, - MONITOR_LRU_GET_FREE_WAITS, MONITOR_FLUSH_AVG_PAGE_RATE, MONITOR_FLUSH_LSN_AVG_RATE, @@ -215,7 +214,6 @@ enum monitor_id_t { MONITOR_LRU_BATCH_SCANNED_PER_CALL, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, - MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, MONITOR_LRU_GET_FREE_SEARCH, MONITOR_LRU_SEARCH_SCANNED, MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 457d9ab5..5e6bfc33 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -121,10 +121,6 @@ struct srv_stats_t ulint_ctr_n_t n_temp_blocks_decrypted; }; -/** We are prepared for a situation that we have this many threads waiting for -a transactional lock inside InnoDB. srv_start() sets the value. */ -extern ulint srv_max_n_threads; - extern const char* srv_main_thread_op_info; /** Prefix used by MySQL to indicate pre-5.1 table name encoding */ diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h index 01067322..98c256d3 100644 --- a/storage/innobase/include/srw_lock.h +++ b/storage/innobase/include/srw_lock.h @@ -153,7 +153,7 @@ template<bool spinloop> class srw_lock_impl; /** Slim shared-update-exclusive lock with no recursion */ template<bool spinloop> -class ssux_lock_impl final +class ssux_lock_impl { #ifdef UNIV_PFS_RWLOCK friend class ssux_lock; @@ -550,3 +550,51 @@ typedef srw_lock_impl<false> srw_lock; typedef srw_lock_impl<true> srw_spin_lock; #endif + +#ifdef UNIV_DEBUG +# include <unordered_set> + +class srw_lock_debug : private srw_lock +{ + /** The owner of the exclusive lock (0 if none) */ + std::atomic<pthread_t> writer; + /** Protects readers */ + mutable srw_mutex readers_lock; + /** Threads that hold the lock in shared mode */ + std::atomic<std::unordered_multiset<pthread_t>*> readers; + + /** Register a read lock. */ + void readers_register(); + +public: + void SRW_LOCK_INIT(mysql_pfs_key_t key); + void destroy(); + +#ifndef SUX_LOCK_GENERIC + /** @return whether any lock may be held by any thread */ + bool is_locked_or_waiting() const noexcept + { return srw_lock::is_locked_or_waiting(); } + /** @return whether an exclusive lock may be held by any thread */ + bool is_write_locked() const noexcept { return srw_lock::is_write_locked(); } +#endif + + /** Acquire an exclusive lock */ + void wr_lock(SRW_LOCK_ARGS(const char *file, unsigned line)); + /** @return whether an exclusive lock was acquired */ + bool wr_lock_try(); + /** Release after wr_lock() */ + void wr_unlock(); + /** Acquire a shared lock */ + void rd_lock(SRW_LOCK_ARGS(const char *file, unsigned line)); + /** @return whether a shared lock was acquired */ + bool rd_lock_try(); + /** Release after rd_lock() */ + void rd_unlock(); + /** @return whether this thread is between rd_lock() and rd_unlock() */ + bool have_rd() const noexcept; + /** @return whether this thread is between wr_lock() and wr_unlock() */ + bool have_wr() const noexcept; + /** @return whether this thread is holding rd_lock() or wr_lock() */ + bool have_any() const noexcept; +}; +#endif diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 0f4f8afa..1fb6cd68 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -55,80 +55,74 @@ Run a purge batch. @return number of undo log pages handled in the batch */ ulint trx_purge(ulint n_tasks, ulint history_size); -/** Rollback segements from a given transaction with trx-no -scheduled for purge. */ -class TrxUndoRsegs { -private: - typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> > - trx_rsegs_t; -public: - typedef trx_rsegs_t::iterator iterator; - typedef trx_rsegs_t::const_iterator const_iterator; - - TrxUndoRsegs() = default; - - /** Constructor */ - TrxUndoRsegs(trx_rseg_t& rseg) - : trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {} - /** Constructor */ - TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg) - : trx_no(trx_no), m_rsegs(1, &rseg) {} - - bool operator!=(const TrxUndoRsegs& other) const - { return trx_no != other.trx_no; } - bool empty() const { return m_rsegs.empty(); } - void erase(iterator& it) { m_rsegs.erase(it); } - iterator begin() { return(m_rsegs.begin()); } - iterator end() { return(m_rsegs.end()); } - const_iterator begin() const { return m_rsegs.begin(); } - const_iterator end() const { return m_rsegs.end(); } - - /** Compare two TrxUndoRsegs based on trx_no. - @param elem1 first element to compare - @param elem2 second element to compare - @return true if elem1 > elem2 else false.*/ - bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs) - { - return(lhs.trx_no > rhs.trx_no); - } - - /** Copy of trx_rseg_t::last_trx_no() */ - trx_id_t trx_no= 0; -private: - /** Rollback segments of a transaction, scheduled for purge. */ - trx_rsegs_t m_rsegs{}; -}; - -typedef std::priority_queue< - TrxUndoRsegs, - std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >, - TrxUndoRsegs> purge_pq_t; - -/** Chooses the rollback segment with the oldest committed transaction */ -struct TrxUndoRsegsIterator { - /** Constructor */ - TrxUndoRsegsIterator(); - /** Sets the next rseg to purge in purge_sys. - Executed in the purge coordinator thread. - @retval false when nothing is to be purged - @retval true when purge_sys.rseg->latch was locked */ - inline bool set_next(); - -private: - // Disable copying - TrxUndoRsegsIterator(const TrxUndoRsegsIterator&); - TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&); - - /** The current element to process */ - TrxUndoRsegs m_rsegs; - /** Track the current element in m_rsegs */ - TrxUndoRsegs::const_iterator m_iter; -}; - /** The control structure used in the purge operation */ class purge_sys_t { - friend TrxUndoRsegsIterator; + /** Min-heap based priority queue of (trx_no, trx_sys.rseg_array index) + pairs, ordered on trx_no. The highest 64-TRX_NO_SHIFT bits of each element is + trx_no, the lowest 8 bits is rseg's index in trx_sys.rseg_array. */ + class purge_queue + { + public: + typedef std::vector<uint64_t, ut_allocator<uint64_t>> container_type; + /** Number of bits reseved to shift trx_no in purge queue element */ + static constexpr unsigned TRX_NO_SHIFT= 8; + + bool empty() const { return m_array.empty(); } + void clear() { m_array.clear(); } + + /** Push (trx_no, trx_sys.rseg_array index) into min-heap. + @param trx_no_rseg (trx_no << TRX_NO_SHIFT | (trx_sys.rseg_array index)) */ + void push_trx_no_rseg(container_type::value_type trx_no_rseg) + { + m_array.push_back(trx_no_rseg); + std::push_heap(m_array.begin(), m_array.end(), + std::greater<container_type::value_type>()); + } + + /** Push rseg to priority queue. + @param trx_no trx_no of committed transaction + @param rseg rseg of committed transaction*/ + void push(trx_id_t trx_no, const trx_rseg_t &rseg) + { + ut_ad(trx_no < 1ULL << (DATA_TRX_ID_LEN * CHAR_BIT)); + ut_ad(&rseg >= trx_sys.rseg_array); + ut_ad(&rseg < trx_sys.rseg_array + TRX_SYS_N_RSEGS); + push_trx_no_rseg(trx_no << TRX_NO_SHIFT | + byte(&rseg - trx_sys.rseg_array)); + } + + /** Extracts rseg from (trx_no, trx_sys.rseg_array index) pair. + @param trx_no_rseg (trx_no << TRX_NO_SHIFT | (trx_sys.rseg_array index) + @return pointer to rseg in trx_sys.rseg_array */ + static trx_rseg_t *rseg(container_type::value_type trx_no_rseg) { + byte i= static_cast<byte>(trx_no_rseg); + ut_ad(i < TRX_SYS_N_RSEGS); + return &trx_sys.rseg_array[i]; + } + + /** Pop rseg from priority queue. + @return pointer to popped trx_rseg_t object */ + trx_rseg_t *pop() + { + ut_ad(!empty()); + std::pop_heap(m_array.begin(), m_array.end(), + std::greater<container_type::value_type>()); + trx_rseg_t *r = rseg(m_array.back()); + m_array.pop_back(); + return r; + } + + /** Clone m_array. + @return m_array clone */ + container_type clone_container() const{ return m_array; } + + private: + /** Array of (trx_no, trx_sys.rseg_array index) pairs. */ + container_type m_array; + }; + + public: /** latch protecting view, m_enabled */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch; @@ -244,15 +238,36 @@ private: record */ uint16_t hdr_offset; /*!< Header byte offset on the page */ + /** Binary min-heap of (trx_no, trx_sys.rseg_array index) pairs, ordered on + trx_no. It is protected by the pq_mutex */ + purge_queue purge_queue; + + /** Mutex protecting purge_queue */ + mysql_mutex_t pq_mutex; - TrxUndoRsegsIterator - rseg_iter; /*!< Iterator to get the next rseg - to process */ public: - purge_pq_t purge_queue; /*!< Binary min-heap, ordered on - TrxUndoRsegs::trx_no. It is protected - by the pq_mutex */ - mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */ + + void enqueue(trx_id_t trx_no, const trx_rseg_t &rseg) { + mysql_mutex_assert_owner(&pq_mutex); + purge_queue.push(trx_no, rseg); + } + + /** Push to purge queue without acquiring pq_mutex. + @param rseg rseg to push */ + void enqueue(const trx_rseg_t &rseg) { enqueue(rseg.last_trx_no(), rseg); } + + /** Clone purge queue container. + @return purge queue container clone */ + purge_queue::container_type clone_queue_container() const { + mysql_mutex_assert_owner(&pq_mutex); + return purge_queue.clone_container(); + } + + /** Acquare purge_queue_mutex */ + void queue_lock() { mysql_mutex_lock(&pq_mutex); } + + /** Release purge queue mutex */ + void queue_unlock() { mysql_mutex_unlock(&pq_mutex); } /** innodb_undo_log_truncate=ON state; only modified by purge_coordinator_callback() */ @@ -332,8 +347,9 @@ private: /** Update the last not yet purged history log info in rseg when we have purged a whole undo log. Advances also purge_trx_no - past the purged log. */ - void rseg_get_next_history_log(); + past the purged log. + @return whether anything is to be purged */ + bool rseg_get_next_history_log(); public: /** @@ -438,6 +454,11 @@ public: @param already_stopped True indicates purge threads were already stopped */ void stop_FTS(const dict_table_t &table, bool already_stopped=false); + + /** Cleanse purge queue to remove the rseg that reside in undo-tablespace + marked for truncate. + @param space undo tablespace being truncated */ + void cleanse_purge_queue(const fil_space_t &space); }; /** The global data structure coordinating a purge */ diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 7fa43047..e0051b2a 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -59,7 +59,7 @@ struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t /** tablespace containing the rollback segment; constant after init() */ fil_space_t *space; /** latch protecting everything except page_no, space */ - srw_spin_lock latch; + IF_DBUG(srw_lock_debug,srw_spin_lock) latch; /** rollback segment header page number; constant after init() */ uint32_t page_no; /** length of the TRX_RSEG_HISTORY list (number of transactions) */ @@ -170,19 +170,21 @@ public: /** Last not yet purged undo log header; FIL_NULL if all purged */ uint32_t last_page_no; - /** trx_t::no | last_offset << 48 */ + /** trx_t::no << 16 | last_offset */ uint64_t last_commit_and_offset; /** @return the commit ID of the last committed transaction */ trx_id_t last_trx_no() const - { return last_commit_and_offset & ((1ULL << 48) - 1); } + { return last_commit_and_offset >> 16; } /** @return header offset of the last committed transaction */ uint16_t last_offset() const - { return static_cast<uint16_t>(last_commit_and_offset >> 48); } + { + return static_cast<uint16_t>(last_commit_and_offset); + } void set_last_commit(uint16_t last_offset, trx_id_t trx_no) { - last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no; + last_commit_and_offset= trx_no << 16 | static_cast<uint64_t>(last_offset); } /** @return the page identifier */ diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 0a3e0d62..15255354 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -785,13 +785,19 @@ public: const char* op_info; /*!< English text describing the current operation, or an empty string */ - uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */ - bool check_foreigns; /*!< normally TRUE, but if the user - wants to suppress foreign key checks, - (in table imports, for example) we - set this FALSE */ + /** TRX_ISO_REPEATABLE_READ, ... */ + unsigned isolation_level:2; + /** when set, REPEATABLE READ will actually be Snapshot Isolation, due to + detecting write/write conflicts and disabling "semi-consistent read" */ + unsigned snapshot_isolation:1; + /** normally set; "SET foreign_key_checks=0" can be issued to suppress + foreign key checks, in table imports, for example */ + unsigned check_foreigns:1; + /** normally set; "SET unique_checks=0, foreign_key_checks=0" + enables bulk insert into an empty table */ + unsigned check_unique_secondary:1; /** whether an insert into an empty table is active */ - bool bulk_insert; + unsigned bulk_insert:1; /*------------------------------*/ /* MySQL has a transaction coordinator to coordinate two phase commit between multiple storage engines and the binary log. When @@ -805,13 +811,6 @@ public: /** whether this is holding the prepare mutex */ bool active_commit_ordered; /*------------------------------*/ - bool check_unique_secondary; - /*!< normally TRUE, but if the user - wants to speed up inserts by - suppressing unique key checks - for secondary indexes when we decide - if we can use the insert buffer for - them, we set this FALSE */ bool flush_log_later;/* In 2PC, we hold the prepare_commit mutex across both phases. In that case, we @@ -1189,10 +1188,16 @@ public: return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS; } + /** Do the bulk insert for the buffered insert operation of a table. + @param table bulk insert operation + @return DB_SUCCESS or error code. */ + dberr_t bulk_insert_apply_for_table(dict_table_t *table); private: /** Apply the buffered bulk inserts. */ dberr_t bulk_insert_apply_low(); + /** Rollback the bulk insert operation for the transaction */ + void bulk_rollback_low(); /** Assign a rollback segment for modifying temporary tables. @return the assigned rollback segment */ trx_rseg_t *assign_temp_rseg(); diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl index 9f05989f..023e2b98 100644 --- a/storage/innobase/include/trx0undo.inl +++ b/storage/innobase/include/trx0undo.inl @@ -125,5 +125,6 @@ trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, { uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset); uint16_t next= mach_read_from_2(undo_page->page.frame + rec); - return next == end ? nullptr : undo_page->page.frame + next; + ut_ad(next <= end); + return next >= end ? nullptr : undo_page->page.frame + next; } diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h index f4183e4c..3ff5f885 100644 --- a/storage/innobase/include/ut0new.h +++ b/storage/innobase/include/ut0new.h @@ -1071,9 +1071,8 @@ static inline void *ut_malloc_dontdump(size_t n_bytes, ...) { void *ptr = my_large_malloc(&n_bytes, MYF(0)); - ut_dontdump(ptr, n_bytes, true); - if (ptr) { + ut_dontdump(ptr, n_bytes, true); os_total_large_mem_allocated += n_bytes; } return ptr; diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index fe16ce14..500b6455 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -242,20 +242,6 @@ ut_print_name( FILE* ef, /*!< in: stream */ const trx_t* trx, /*!< in: transaction */ const char* name); /*!< in: table name to print */ -/** Format a table name, quoted as an SQL identifier. -If the name contains a slash '/', the result will contain two -identifiers separated by a period (.), as in SQL -database_name.table_name. -@see table_name_t -@param[in] name table or index name -@param[out] formatted formatted result, will be NUL-terminated -@param[in] formatted_size size of the buffer in bytes -@return pointer to 'formatted' */ -char* -ut_format_name( - const char* name, - char* formatted, - ulint formatted_size); /**********************************************************************//** Catenate files. */ diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h index f4660f96..ad43e1c8 100644 --- a/storage/innobase/include/ut0vec.h +++ b/storage/innobase/include/ut0vec.h @@ -201,15 +201,6 @@ ib_vector_last_const( const ib_vector_t* vec); /* in: vector */ /******************************************************************** -Sort the vector elements. */ -UNIV_INLINE -void -ib_vector_sort( -/*===========*/ - ib_vector_t* vec, /* in/out: vector */ - ib_compare_t compare); /* in: the comparator to use for sort */ - -/******************************************************************** The default ib_vector_t heap free. Does nothing. */ UNIV_INLINE void diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl index 531f0f22..1a844dd8 100644 --- a/storage/innobase/include/ut0vec.inl +++ b/storage/innobase/include/ut0vec.inl @@ -305,19 +305,6 @@ ib_vector_remove( } /******************************************************************** -Sort the vector elements. */ -UNIV_INLINE -void -ib_vector_sort( -/*===========*/ - /* out: void */ - ib_vector_t* vec, /* in: vector */ - ib_compare_t compare)/* in: the comparator to use for sort */ -{ - qsort(vec->data, vec->used, vec->sizeof_value, compare); -} - -/******************************************************************** Destroy the vector. Make sure the vector owns the allocator, e.g., the heap in the the heap allocator. */ UNIV_INLINE |