diff options
Diffstat (limited to 'storage/innobase/buf')
-rw-r--r-- | storage/innobase/buf/buf0block_hint.cc | 59 | ||||
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 264 | ||||
-rw-r--r-- | storage/innobase/buf/buf0flu.cc | 208 | ||||
-rw-r--r-- | storage/innobase/buf/buf0lru.cc | 184 | ||||
-rw-r--r-- | storage/innobase/buf/buf0rea.cc | 2 |
5 files changed, 275 insertions, 442 deletions
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc deleted file mode 100644 index 6bd01faa..00000000 --- a/storage/innobase/buf/buf0block_hint.cc +++ /dev/null @@ -1,59 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2020, 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License, version 2.0, as published by the -Free Software Foundation. - -This program is also distributed with certain software (including but not -limited to OpenSSL) that is licensed under separate terms, as designated in a -particular file or component or in included license documentation. The authors -of MySQL hereby grant you an additional permission to link the program and -your derivative works with the separately licensed software that they have -included with MySQL. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0, -for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*****************************************************************************/ - -#include "buf0block_hint.h" -namespace buf { - -TRANSACTIONAL_TARGET -void Block_hint::buffer_fix_block_if_still_valid() -{ - /* To check if m_block belongs to the current buf_pool, we must - prevent freeing memory while we check, and until we buffer-fix the - block. For this purpose it is enough to latch any of the many - latches taken by buf_pool_t::resize(). - - Similar to buf_page_optimistic_get(), we must validate - m_block->page.id() after acquiring the hash_lock, because the object - may have been freed and not actually attached to buf_pool.page_hash - at the moment. (The block could have been reused to store a - different page, and that slice of buf_pool.page_hash could be protected - by another hash_lock that we are not holding.) - - Finally, we must ensure that the block is not being freed. */ - if (m_block) - { - auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold()); - transactional_shared_lock_guard<page_hash_latch> g - {buf_pool.page_hash.lock_get(cell)}; - if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() && - m_block->page.frame && m_block->page.in_file()) - m_block->page.fix(); - else - clear(); - } -} -} // namespace buf diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 23b5b776..49f73105 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -77,6 +77,8 @@ struct set_numa_interleave_t if (srv_numa_interleave) { struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + MEM_MAKE_DEFINED(numa_mems_allowed, + sizeof *numa_mems_allowed); ib::info() << "Setting NUMA memory policy to" " MPOL_INTERLEAVE"; if (set_mempolicy(MPOL_INTERLEAVE, @@ -1062,6 +1064,7 @@ inline bool buf_pool_t::chunk_t::create(size_t bytes) if (srv_numa_interleave) { struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed); if (mbind(mem, mem_size(), MPOL_INTERLEAVE, numa_mems_allowed->maskp, numa_mems_allowed->size, MPOL_MF_MOVE)) @@ -1591,17 +1594,14 @@ inline bool buf_pool_t::withdraw_blocks() /* reserve free_list length */ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - buf_flush_LRU( - std::max<ulint>(withdraw_target - - UT_LIST_GET_LEN(withdraw), - srv_LRU_scan_depth), - true); - mysql_mutex_unlock(&buf_pool.mutex); - buf_dblwr.flush_buffered_writes(); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_flush_wait_LRU_batch_end(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mysql_mutex_lock(&buf_pool.mutex); + try_LRU_scan = false; + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, + &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); + mysql_mutex_lock(&mutex); } /* relocate blocks/buddies in withdrawn area */ @@ -2298,7 +2298,10 @@ buf_page_t *buf_pool_t::watch_set(const page_id_t id, got_block: bpage->fix(); if (watch_is_sentinel(*bpage)) + { + ut_ad(!bpage->oldest_modification()); bpage= nullptr; + } page_hash.lock_get(chain).unlock(); return bpage; } @@ -2370,6 +2373,7 @@ void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain) } else { + ut_ad(!w->oldest_modification()); const auto state= w->state(); ut_ad(~buf_page_t::LRU_MASK & state); ut_ad(state >= buf_page_t::UNFIXED + 1); @@ -2856,9 +2860,10 @@ got_block_fixed: if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) { if (mode == BUF_PEEK_IF_IN_POOL) { ignore_block: + block->unfix(); +ignore_unfixed: ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); - block->unfix(); if (err) { *err = DB_CORRUPTION; } @@ -2872,16 +2877,32 @@ ignore_block: in buf_page_t::read_complete() or buf_pool_t::corrupted_evict(), or after buf_zip_decompress() in this function. */ - block->page.lock.s_lock(); + if (rw_latch != RW_NO_LATCH) { + block->page.lock.s_lock(); + } else if (!block->page.lock.s_lock_try()) { + /* For RW_NO_LATCH, we should not try to acquire S or X + latch directly as we could be violating the latching + order resulting in deadlock. Instead we try latching the + page and retry in case of a failure. */ + goto wait_for_read; + } state = block->page.state(); ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); const page_id_t id{block->page.id()}; block->page.lock.s_unlock(); - if (UNIV_UNLIKELY(id != page_id)) { + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + if (UNIV_UNLIKELY(id == page_id)) { + /* The page read was completed, and + another thread marked the page as free + while we were waiting. */ + goto ignore_block; + } + ut_ad(id == page_id_t{~0ULL}); block->page.unfix(); + if (++retries < BUF_PAGE_READ_MAX_RETRIES) { goto loop; } @@ -2892,6 +2913,7 @@ ignore_block: return nullptr; } + ut_ad(id == page_id); } else if (mode != BUF_PEEK_IF_IN_POOL) { } else if (!mtr) { ut_ad(!block->page.oldest_modification()); @@ -2918,6 +2940,7 @@ free_unfixed_block: if (UNIV_UNLIKELY(!block->page.frame)) { if (!block->page.lock.x_lock_try()) { wait_for_unzip: +wait_for_read: /* The page is being read or written, or another thread is executing buf_zip_decompress() in buf_page_get_low() on it. */ @@ -3098,83 +3121,72 @@ re_evict_fail: #endif /* UNIV_DEBUG */ ut_ad(block->page.frame); + /* The state = block->page.state() may be stale at this point, + and in fact, at any point of time if we consider its + buffer-fix component. If the block is being read into the + buffer pool, it is possible that buf_page_t::read_complete() + will invoke buf_pool_t::corrupted_evict() and therefore + invalidate it (invoke buf_page_t::set_corrupt_id() and set the + state to FREED). Therefore, after acquiring the page latch we + must recheck the state. */ + if (state >= buf_page_t::UNFIXED && allow_ibuf_merge && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX && page_is_leaf(block->page.frame)) { block->page.lock.x_lock(); - ut_ad(block->page.id() == page_id - || (state >= buf_page_t::READ_FIX - && state < buf_page_t::WRITE_FIX)); - -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ - - dberr_t e; - - if (UNIV_UNLIKELY(block->page.id() != page_id)) { -page_id_mismatch: - state = block->page.state(); - e = DB_CORRUPTION; -ibuf_merge_corrupted: - if (err) { - *err = e; - } - - if (block->page.id().is_corrupted()) { - buf_pool.corrupted_evict(&block->page, state); - } - return nullptr; - } - state = block->page.state(); ut_ad(state < buf_page_t::READ_FIX); if (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT) { block->page.clear_ibuf_exist(); - e = ibuf_merge_or_delete_for_page(block, page_id, - block->zip_size()); - if (UNIV_UNLIKELY(e != DB_SUCCESS)) { - goto ibuf_merge_corrupted; + if (dberr_t local_err = + ibuf_merge_or_delete_for_page(block, page_id, + block->zip_size())) { + if (err) { + *err = local_err; + } + goto release_and_ignore_block; } + } else if (state < buf_page_t::UNFIXED) { +release_and_ignore_block: + block->page.lock.x_unlock(); + goto ignore_block; } - if (rw_latch == RW_X_LATCH) { - goto get_latch_valid; - } else { +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + + switch (rw_latch) { + case RW_NO_LATCH: + block->page.lock.x_unlock(); + break; + case RW_S_LATCH: block->page.lock.x_unlock(); - goto get_latch; + block->page.lock.s_lock(); + break; + case RW_SX_LATCH: + block->page.lock.x_u_downgrade(); + break; + default: + ut_ad(rw_latch == RW_X_LATCH); } + + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); } else { -get_latch: switch (rw_latch) { case RW_NO_LATCH: mtr->memo_push(block, MTR_MEMO_BUF_FIX); return block; case RW_S_LATCH: block->page.lock.s_lock(); - ut_ad(!block->page.is_read_fixed()); - if (UNIV_UNLIKELY(block->page.id() != page_id)) { - block->page.lock.s_unlock(); - block->page.lock.x_lock(); - goto page_id_mismatch; - } -get_latch_valid: - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ break; case RW_SX_LATCH: block->page.lock.u_lock(); ut_ad(!block->page.is_io_fixed()); - if (UNIV_UNLIKELY(block->page.id() != page_id)) { - block->page.lock.u_x_upgrade(); - goto page_id_mismatch; - } - goto get_latch_valid; + break; default: ut_ad(rw_latch == RW_X_LATCH); if (block->page.lock.x_lock_upgraded()) { @@ -3183,17 +3195,26 @@ get_latch_valid: mtr->page_lock_upgrade(*block); return block; } - if (UNIV_UNLIKELY(block->page.id() != page_id)) { - goto page_id_mismatch; - } - goto get_latch_valid; } - ut_ad(page_id_t(page_get_space_id(block->page.frame), - page_get_page_no(block->page.frame)) - == page_id); + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + state = block->page.state(); + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + mtr->release_last_page(); + goto ignore_unfixed; + } + + ut_ad(state < buf_page_t::READ_FIX + || state > buf_page_t::WRITE_FIX); + +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ } + ut_ad(page_id_t(page_get_space_id(block->page.frame), + page_get_page_no(block->page.frame)) == page_id); return block; } @@ -3289,83 +3310,76 @@ buf_page_get_gen( return block; } -/********************************************************************//** -This is the general function used to get optimistic access to a database -page. -@return TRUE if success */ TRANSACTIONAL_TARGET -bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, - uint64_t modify_clock, mtr_t *mtr) +buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id) +{ + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + if (UNIV_UNLIKELY(!buf_pool.is_uncompressed(block) || + id != block->page.id() || !block->page.frame)) + return nullptr; + const auto state= block->page.state(); + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || + state >= buf_page_t::READ_FIX)) + return nullptr; + block->page.fix(); + return block; +} + +buf_block_t *buf_page_optimistic_get(buf_block_t *block, + rw_lock_type_t rw_latch, + uint64_t modify_clock, mtr_t *mtr) { - ut_ad(block); - ut_ad(mtr); ut_ad(mtr->is_active()); ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + ut_ad(block->page.buf_fix_count()); - if (have_transactional_memory); - else if (UNIV_UNLIKELY(!block->page.frame)) - return false; - else + if (rw_latch == RW_S_LATCH) { - const auto state= block->page.state(); - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || - state >= buf_page_t::READ_FIX)) - return false; - } + if (!block->page.lock.s_lock_try()) + { + fail: + block->page.unfix(); + return nullptr; + } - bool success; - const page_id_t id{block->page.id()}; - buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); - bool have_u_not_x= false; + ut_ad(!ibuf_inside(mtr) || + ibuf_page(block->page.id(), block->zip_size(), nullptr)); - { - transactional_shared_lock_guard<page_hash_latch> g - {buf_pool.page_hash.lock_get(chain)}; - if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame)) - return false; - const auto state= block->page.state(); - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || - state >= buf_page_t::READ_FIX)) - return false; - - if (rw_latch == RW_S_LATCH) - success= block->page.lock.s_lock_try(); - else + if (modify_clock != block->modify_clock || block->page.is_freed()) { - have_u_not_x= block->page.lock.have_u_not_x(); - success= have_u_not_x || block->page.lock.x_lock_try(); + block->page.lock.s_unlock(); + goto fail; } - } - if (!success) - return false; - - if (have_u_not_x) + ut_ad(!block->page.is_read_fixed()); + buf_page_make_young_if_needed(&block->page); + mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX); + } + else if (block->page.lock.have_u_not_x()) { block->page.lock.u_x_upgrade(); + block->page.unfix(); mtr->page_lock_upgrade(*block); - ut_ad(id == block->page.id()); ut_ad(modify_clock == block->modify_clock); } + else if (!block->page.lock.x_lock_try()) + goto fail; else { - ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed()); - ut_ad(id == block->page.id()); - ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr)); + ut_ad(!block->page.is_io_fixed()); + ut_ad(!ibuf_inside(mtr) || + ibuf_page(block->page.id(), block->zip_size(), nullptr)); if (modify_clock != block->modify_clock || block->page.is_freed()) { - if (rw_latch == RW_S_LATCH) - block->page.lock.s_unlock(); - else - block->page.lock.x_unlock(); - return false; + block->page.lock.x_unlock(); + goto fail; } - block->page.fix(); - ut_ad(!block->page.is_read_fixed()); buf_page_make_young_if_needed(&block->page); - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); } ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); @@ -3375,7 +3389,7 @@ bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, ut_ad(~buf_page_t::LRU_MASK & state); ut_ad(block->page.frame); - return true; + return block; } /** Try to S-latch a page. diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index d4628985..d364be31 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -274,30 +274,22 @@ buf_flush_relocate_on_flush_list( ut_d(buf_flush_validate_low()); } -/** Note that a block is no longer dirty, while not removing -it from buf_pool.flush_list -@param temporary whether the page belongs to the temporary tablespace -@param error whether an error may have occurred while writing */ -inline void buf_page_t::write_complete(bool temporary, bool error) +void buf_page_t::write_complete(bool persistent, bool error, uint32_t state) { - ut_ad(temporary == fsp_is_system_temporary(id().space())); - if (UNIV_UNLIKELY(error)); - else if (temporary) - { - ut_ad(oldest_modification() == 2); - oldest_modification_= 0; - } - else + ut_ad(!persistent == fsp_is_system_temporary(id().space())); + ut_ad(state >= WRITE_FIX); + + if (UNIV_LIKELY(!error)) { + ut_d(lsn_t om= oldest_modification()); + ut_ad(om >= 2); + ut_ad(persistent == (om > 2)); /* We use release memory order to guarantee that callers of oldest_modification_acquire() will observe the block as being detached from buf_pool.flush_list, after reading the value 0. */ - ut_ad(oldest_modification() > 2); - oldest_modification_.store(1, std::memory_order_release); + oldest_modification_.store(persistent, std::memory_order_release); } - const auto s= state(); - ut_ad(s >= WRITE_FIX); - zip.fix.fetch_sub((s >= WRITE_FIX_REINIT) + zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) ? (WRITE_FIX_REINIT - UNFIXED) : (WRITE_FIX - UNFIXED)); lock.u_unlock(true); @@ -311,18 +303,10 @@ inline void buf_pool_t::n_flush_inc() inline void buf_pool_t::n_flush_dec() { - mysql_mutex_lock(&flush_list_mutex); + mysql_mutex_assert_owner(&flush_list_mutex); ut_ad(page_cleaner_status >= LRU_FLUSH); if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) pthread_cond_broadcast(&done_flush_LRU); - mysql_mutex_unlock(&flush_list_mutex); -} - -inline void buf_pool_t::n_flush_dec_holding_mutex() -{ - mysql_mutex_assert_owner(&flush_list_mutex); - ut_ad(page_cleaner_status >= LRU_FLUSH); - page_cleaner_status-= LRU_FLUSH; } /** Complete write of a file page from buf_pool. @@ -352,28 +336,26 @@ void buf_page_write_complete(const IORequest &request, bool error) mysql_mutex_assert_not_owner(&buf_pool.mutex); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - if (request.is_LRU()) + const bool persistent= bpage->oldest_modification() != 2; + + if (UNIV_UNLIKELY(!persistent) && UNIV_LIKELY(!error)) { - const bool temp= bpage->oldest_modification() == 2; - if (!temp && state < buf_page_t::WRITE_FIX_REINIT && - request.node->space->use_doublewrite()) - buf_dblwr.write_completed(); /* We must hold buf_pool.mutex while releasing the block, so that no other thread can access it before we have freed it. */ mysql_mutex_lock(&buf_pool.mutex); - bpage->write_complete(temp, error); - if (!error) - buf_LRU_free_page(bpage, true); + bpage->write_complete(persistent, error, state); + buf_LRU_free_page(bpage, true); mysql_mutex_unlock(&buf_pool.mutex); - - buf_pool.n_flush_dec(); } else { + bpage->write_complete(persistent, error, state); if (state < buf_page_t::WRITE_FIX_REINIT && request.node->space->use_doublewrite()) + { + ut_ad(persistent); buf_dblwr.write_completed(); - bpage->write_complete(false, error); + } } } @@ -740,17 +722,15 @@ ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept } /** Write a flushable page to a file or free a freeable block. -@param evict whether to evict the page on write completion @param space tablespace @return whether a page write was initiated and buf_pool.mutex released */ -bool buf_page_t::flush(bool evict, fil_space_t *space) +bool buf_page_t::flush(fil_space_t *space) { mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(in_file()); ut_ad(in_LRU_list); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == (space == fil_system.temp_space)); - ut_ad(evict || space != fil_system.temp_space); ut_ad(space->referenced()); const auto s= state(); @@ -797,22 +777,11 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) mysql_mutex_unlock(&buf_pool.mutex); IORequest::Type type= IORequest::WRITE_ASYNC; - if (UNIV_UNLIKELY(evict)) - { - type= IORequest::WRITE_LRU; - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_pool.n_flush_inc(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - } /* Apart from the U-lock, this block will also be protected by is_write_fixed() and oldest_modification()>1. Thus, it cannot be relocated or removed. */ - DBUG_PRINT("ib_buf", ("%s %u page %u:%u", - evict ? "LRU" : "flush_list", - id().space(), id().page_no())); - buf_block_t *block= reinterpret_cast<buf_block_t*>(this); page_t *write_frame= zip.data; @@ -864,10 +833,7 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) { switch (space->chain.start->punch_hole) { case 1: - static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == - IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); - type= - IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); + type= IORequest::PUNCH; break; case 2: size= orig_size; @@ -894,10 +860,8 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) /** Check whether a page can be flushed from the buf_pool. @param id page identifier @param fold id.fold() -@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return whether the page can be flushed */ -static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, - bool evict) +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(fold == id.fold()); @@ -906,26 +870,16 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, const buf_page_t *bpage= buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold)); - if (!bpage || buf_pool.watch_is_sentinel(*bpage)) - return false; - - /* We avoid flushing 'non-old' blocks in an eviction flush, because the - flushed blocks are soon freed */ - if (evict && !bpage->is_old()) - return false; - - return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); + return bpage && bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); } /** Check which neighbors of a page can be flushed from the buf_pool. @param space tablespace @param id page identifier of a dirty page @param contiguous whether to consider contiguous areas of pages -@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return last page number that can be flushed */ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, - page_id_t &id, bool contiguous, - bool evict) + page_id_t &id, bool contiguous) { ut_ad(id.page_no() < space.size + (space.physical_size() == 2048 ? 1 @@ -958,7 +912,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, for (page_id_t i= id - 1;; --i) { fold--; - if (!buf_flush_check_neighbor(i, fold, evict)) + if (!buf_flush_check_neighbor(i, fold)) { low= i + 1; break; @@ -974,7 +928,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, while (++i < high) { ++fold; - if (!buf_flush_check_neighbor(i, fold, evict)) + if (!buf_flush_check_neighbor(i, fold)) break; } @@ -1051,14 +1005,13 @@ and also write zeroes or punch the hole for the freed ranges of pages. @param page_id page identifier @param bpage buffer page @param contiguous whether to consider contiguous areas of pages -@param evict true=buf_pool.LRU; false=buf_pool.flush_list @param n_flushed number of pages flushed so far in this batch @param n_to_flush maximum number of pages we are allowed to flush @return number of pages flushed */ static ulint buf_flush_try_neighbors(fil_space_t *space, const page_id_t page_id, buf_page_t *bpage, - bool contiguous, bool evict, + bool contiguous, ulint n_flushed, ulint n_to_flush) { ut_ad(space->id == page_id.space()); @@ -1072,7 +1025,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ut_ad(lsn >= bpage->oldest_modification()); if (UNIV_UNLIKELY(lsn < space->get_create_lsn())) { - ut_a(!bpage->flush(evict, space)); + ut_a(!bpage->flush(space)); mysql_mutex_unlock(&buf_pool.mutex); return 0; } @@ -1082,7 +1035,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ulint count= 0; page_id_t id= page_id; - page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous); ut_ad(page_id >= id); ut_ad(page_id < high); @@ -1119,7 +1072,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ut_ad(!buf_pool.watch_is_sentinel(*b)); ut_ad(b->oldest_modification() > 1); flush: - if (b->flush(evict, space)) + if (b->flush(space)) { ++count; continue; @@ -1127,9 +1080,10 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, } /* We avoid flushing 'non-old' blocks in an eviction flush, because the flushed blocks are soon freed */ - else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) && - b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + else if (b->oldest_modification() > 1 && b->lock.u_lock_try(true)) { + /* For the buf_pool.watch[] sentinels, oldest_modification() == 0 */ + ut_ad(!buf_pool.watch_is_sentinel(*b)); if (b->oldest_modification() < 2) b->lock.u_unlock(true); else @@ -1251,10 +1205,8 @@ static void buf_flush_discard_page(buf_page_t *bpage) /** Flush dirty blocks from the end buf_pool.LRU, and move clean blocks to buf_pool.free. @param max maximum number of blocks to flush -@param evict whether dirty pages are to be evicted after flushing them @param n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, bool evict, - flush_counters_t *n) +static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) { ulint scanned= 0; ulint free_limit= srv_LRU_scan_depth; @@ -1302,8 +1254,12 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) { ut_ad(!bpage->is_io_fixed()); - bool do_evict= evict; switch (bpage->oldest_modification()) { + case 2: + /* LRU flushing will always evict pages of the temporary tablespace, + in buf_page_write_complete(). */ + ++n->evicted; + break; case 1: mysql_mutex_lock(&buf_pool.flush_list_mutex); if (ut_d(lsn_t lsn=) bpage->oldest_modification()) @@ -1316,12 +1272,8 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, case 0: bpage->lock.u_unlock(true); goto evict; - case 2: - /* LRU flushing will always evict pages of the temporary tablespace. */ - do_evict= true; } - /* Block is ready for flush. Dispatch an IO request. - If do_evict, the page may be evicted by buf_page_write_complete(). */ + /* Block is ready for flush. Dispatch an IO request. */ const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) @@ -1356,6 +1308,7 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, no_space: mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_discard_page(bpage); + ++n->evicted; continue; } @@ -1368,8 +1321,8 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, if (neighbors && space->is_rotational()) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, - do_evict, n->flushed, max); - else if (bpage->flush(do_evict, space)) + n->flushed, max); + else if (bpage->flush(space)) ++n->flushed; else continue; @@ -1387,24 +1340,25 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, space->release(); if (scanned) + { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED, MONITOR_LRU_BATCH_SCANNED_NUM_CALL, MONITOR_LRU_BATCH_SCANNED_PER_CALL, scanned); + } } /** Flush and move pages from LRU or unzip_LRU list to the free list. Whether LRU or unzip_LRU is used depends on the state of the system. @param max maximum number of blocks to flush -@param evict whether dirty pages are to be evicted after flushing them @param n counts of flushed and evicted pages */ -static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) +static void buf_do_LRU_batch(ulint max, flush_counters_t *n) { if (buf_LRU_evict_from_unzip_LRU()) buf_free_from_unzip_LRU_list_batch(); n->evicted= 0; n->flushed= 0; - buf_flush_LRU_list_batch(max, evict, n); + buf_flush_LRU_list_batch(max, n); mysql_mutex_assert_owner(&buf_pool.mutex); buf_lru_freed_page_count+= n->evicted; @@ -1516,8 +1470,8 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) { if (neighbors && space->is_rotational()) count+= buf_flush_try_neighbors(space, page_id, bpage, - neighbors == 1, false, count, max_n); - else if (bpage->flush(false, space)) + neighbors == 1, count, max_n); + else if (bpage->flush(space)) ++count; else continue; @@ -1536,10 +1490,13 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) space->release(); if (scanned) + { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, scanned); + } + return count; } @@ -1683,7 +1640,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) goto was_freed; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (bpage->flush(false, space)) + if (bpage->flush(space)) { ++n_flush; if (!--max_n_flush) @@ -1741,27 +1698,22 @@ and move clean blocks to buf_pool.free. The caller must invoke buf_dblwr.flush_buffered_writes() after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed -@param evict whether to evict pages after flushing -@return evict ? number of processed pages : number of pages written */ -ulint buf_flush_LRU(ulint max_n, bool evict) +@return number of pages written */ +static ulint buf_flush_LRU(ulint max_n) { mysql_mutex_assert_owner(&buf_pool.mutex); flush_counters_t n; - buf_do_LRU_batch(max_n, evict, &n); + buf_do_LRU_batch(max_n, &n); ulint pages= n.flushed; if (n.evicted) { - if (evict) - pages+= n.evicted; buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); } - else if (!pages && !buf_pool.try_LRU_scan && - !buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) - { + else if (!pages && !buf_pool.try_LRU_scan) /* For example, with the minimum innodb_buffer_pool_size=5M and the default innodb_page_size=16k there are only a little over 316 pages in the buffer pool. The buffer pool can easily be exhausted @@ -1775,18 +1727,13 @@ ulint buf_flush_LRU(ulint max_n, bool evict) (3) This thread is the only one that could make progress, but we fail to do so because all the pages that we scanned are buffer-fixed or latched by some thread. */ - sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" - " %zu blocks are in use and %zu free." - " Consider increasing innodb_buffer_pool_size.", - UT_LIST_GET_LEN(buf_pool.LRU), - UT_LIST_GET_LEN(buf_pool.free)); - } + buf_pool.LRU_warn(); return pages; } #ifdef HAVE_PMEM -# include <libpmem.h> +# include "cache.h" #endif /** Write checkpoint information to the log header and release mutex. @@ -1900,8 +1847,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept ut_ad(!log.is_opened()); bool success; log.m_file= - os_file_create_func(get_log_file_path().c_str(), - OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + os_file_create_func(get_log_file_path().c_str(), OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, false, &success); ut_a(success); ut_a(log.is_opened()); @@ -1916,7 +1862,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept { my_munmap(buf, file_size); buf= resize_buf; - buf_free= START_OFFSET + (get_lsn() - resizing); + set_buf_free(START_OFFSET + (get_lsn() - resizing)); } else #endif @@ -1958,9 +1904,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) { ut_ad(!srv_read_only_mode); -#ifndef SUX_LOCK_GENERIC - ut_ad(log_sys.latch.is_write_locked()); -#endif + ut_ad(log_sys.latch_have_wr()); ut_ad(oldest_lsn <= end_lsn); ut_ad(end_lsn == log_sys.get_lsn()); @@ -2327,7 +2271,7 @@ func_exit: sum_pages += last_pages_in; - const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1); + const ulint time_elapsed = std::max<ulint>(ulint(curr_time - prev_time), 1); /* We update our variables every innodb_flushing_avg_loops iterations to smooth out transition in workload. */ @@ -2541,26 +2485,16 @@ static void buf_flush_page_cleaner() { buf_pool.page_cleaner_set_idle(false); buf_pool.n_flush_inc(); - /* Remove clean blocks from buf_pool.flush_list before the LRU scan. */ - for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; ) - { - const lsn_t lsn{p->oldest_modification()}; - ut_ad(lsn > 2 || lsn == 1); - buf_page_t *n= UT_LIST_GET_NEXT(list, p); - if (lsn <= 1) - buf_pool.delete_from_flush_list(p); - p= n; - } mysql_mutex_unlock(&buf_pool.flush_list_mutex); n= srv_max_io_capacity; mysql_mutex_lock(&buf_pool.mutex); LRU_flush: - n= buf_flush_LRU(n, false); + n= buf_flush_LRU(n); mysql_mutex_unlock(&buf_pool.mutex); last_pages+= n; check_oldest_and_set_idle: mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_pool.n_flush_dec_holding_mutex(); + buf_pool.n_flush_dec(); oldest_lsn= buf_pool.get_oldest_modification(0); if (!oldest_lsn) goto fully_unemployed; @@ -2693,6 +2627,16 @@ static void buf_flush_page_cleaner() #endif } +ATTRIBUTE_COLD void buf_pool_t::LRU_warn() +{ + mysql_mutex_assert_owner(&mutex); + if (!LRU_warned.test_and_set(std::memory_order_acquire)) + sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" + " %zu blocks are in use and %zu free." + " Consider increasing innodb_buffer_pool_size.", + UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free)); +} + /** Initialize page_cleaner. */ ATTRIBUTE_COLD void buf_flush_page_cleaner_init() { diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 2a8d6ff2..33d01b6b 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -385,142 +385,76 @@ we put it to free list to be used. @return the free control block, in state BUF_BLOCK_MEMORY */ buf_block_t *buf_LRU_get_free_block(bool have_mutex) { - ulint n_iterations = 0; - ulint flush_failures = 0; - MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); - if (have_mutex) { - mysql_mutex_assert_owner(&buf_pool.mutex); - goto got_mutex; - } - DBUG_EXECUTE_IF("recv_ran_out_of_buffer", - if (recv_recovery_is_on() - && recv_sys.apply_log_recs) { - mysql_mutex_lock(&buf_pool.mutex); - goto flush_lru; - }); -get_mutex: - mysql_mutex_lock(&buf_pool.mutex); -got_mutex: - buf_LRU_check_size_of_non_data_objects(); - buf_block_t* block; + bool waited= false; + MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); + if (!have_mutex) + mysql_mutex_lock(&buf_pool.mutex); + + buf_LRU_check_size_of_non_data_objects(); - IF_DBUG(static bool buf_lru_free_blocks_error_printed,); - DBUG_EXECUTE_IF("ib_lru_force_no_free_page", - if (!buf_lru_free_blocks_error_printed) { - n_iterations = 21; - goto not_found;}); + buf_block_t *block; retry: - /* If there is a block in the free list, take it */ - if ((block = buf_LRU_get_free_only()) != nullptr) { + /* If there is a block in the free list, take it */ + block= buf_LRU_get_free_only(); + if (block) + { got_block: - const ulint LRU_size = UT_LIST_GET_LEN(buf_pool.LRU); - const ulint available = UT_LIST_GET_LEN(buf_pool.free); - const ulint scan_depth = srv_LRU_scan_depth / 2; - ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth - || buf_pool.need_LRU_eviction()); - - if (!have_mutex) { - mysql_mutex_unlock(&buf_pool.mutex); - } - - if (UNIV_UNLIKELY(available < scan_depth) - && LRU_size > BUF_LRU_MIN_LEN) { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (!buf_pool.page_cleaner_active()) { - buf_pool.page_cleaner_wakeup(true); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - } - - block->page.zip.clear(); - return block; - } + const ulint LRU_size= UT_LIST_GET_LEN(buf_pool.LRU); + const ulint available= UT_LIST_GET_LEN(buf_pool.free); + const ulint scan_depth= srv_LRU_scan_depth / 2; + ut_ad(LRU_size <= BUF_LRU_MIN_LEN || + available >= scan_depth || buf_pool.need_LRU_eviction()); - MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS ); - if (n_iterations || buf_pool.try_LRU_scan) { - /* If no block was in the free list, search from the - end of the LRU list and try to free a block there. - If we are doing for the first time we'll scan only - tail of the LRU list otherwise we scan the whole LRU - list. */ - if (buf_LRU_scan_and_free_block(n_iterations - ? ULINT_UNDEFINED : 100)) { - goto retry; - } + if (UNIV_UNLIKELY(available < scan_depth) && LRU_size > BUF_LRU_MIN_LEN) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.page_cleaner_active()) + buf_pool.page_cleaner_wakeup(true); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } - /* Tell other threads that there is no point - in scanning the LRU list. */ - buf_pool.try_LRU_scan = false; - } + if (!have_mutex) + mysql_mutex_unlock(&buf_pool.mutex); - for (;;) { - if ((block = buf_LRU_get_free_only()) != nullptr) { - goto got_block; - } - const bool wake = buf_pool.need_LRU_eviction(); - mysql_mutex_unlock(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const auto n_flush = buf_pool.n_flush(); - if (wake && !buf_pool.page_cleaner_active()) { - buf_pool.page_cleaner_wakeup(true); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mysql_mutex_lock(&buf_pool.mutex); - if (!n_flush) { - goto not_found; - } - if (!buf_pool.try_LRU_scan) { - my_cond_wait(&buf_pool.done_free, - &buf_pool.mutex.m_mutex); - } - } - -not_found: - if (n_iterations > 1) { - MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); - } + block->page.zip.clear(); + return block; + } - if (n_iterations == 21 - && srv_buf_pool_old_size == srv_buf_pool_size - && buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) { - IF_DBUG(buf_lru_free_blocks_error_printed = true,); - mysql_mutex_unlock(&buf_pool.mutex); - ib::warn() << "Difficult to find free blocks in the buffer pool" - " (" << n_iterations << " search iterations)! " - << flush_failures << " failed attempts to" - " flush a page!" - " Consider increasing innodb_buffer_pool_size." - " Pending flushes (fsync): " - << fil_n_pending_tablespace_flushes - << ". " << os_n_file_reads << " OS file reads, " - << os_n_file_writes << " OS file writes, " - << os_n_fsyncs - << " OS fsyncs."; - mysql_mutex_lock(&buf_pool.mutex); - } + MONITOR_INC(MONITOR_LRU_GET_FREE_LOOPS); + if (waited || buf_pool.try_LRU_scan) + { + /* If no block was in the free list, search from the end of the + LRU list and try to free a block there. If we are doing for the + first time we'll scan only tail of the LRU list otherwise we scan + the whole LRU list. */ + if (buf_LRU_scan_and_free_block(waited ? ULINT_UNDEFINED : 100)) + goto retry; + + /* Tell other threads that there is no point in scanning the LRU + list. */ + buf_pool.try_LRU_scan= false; + } - /* No free block was found: try to flush the LRU list. - The freed blocks will be up for grabs for all threads. + waited= true; - TODO: A more elegant way would have been to return one freed - up block to the caller here but the code that deals with - removing the block from buf_pool.page_hash and buf_pool.LRU is fairly - involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We - can do that in a separate patch sometime in future. */ -#ifndef DBUG_OFF -flush_lru: -#endif - if (!buf_flush_LRU(innodb_lru_flush_size, true)) { - MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); - ++flush_failures; - } + while (!(block= buf_LRU_get_free_only())) + { + buf_pool.stat.LRU_waits++; + + timespec abstime; + set_timespec(abstime, 1); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.page_cleaner_active()) + buf_pool.page_cleaner_wakeup(true); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (my_cond_timedwait(&buf_pool.done_free, &buf_pool.mutex.m_mutex, + &abstime)) + buf_pool.LRU_warn(); + } - n_iterations++; - buf_pool.stat.LRU_waits++; - mysql_mutex_unlock(&buf_pool.mutex); - buf_dblwr.flush_buffered_writes(); - goto get_mutex; + goto got_block; } /** Move the LRU_old pointer so that the length of the old blocks list diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 9041c6a2..76a5e710 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -575,7 +575,7 @@ fail: hash_lock.lock_shared(); const buf_page_t* bpage= buf_pool.page_hash.get(i, chain); - if (!bpage) + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) { hash_lock.unlock_shared(); if (i == page_id) |