diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:33:02 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:33:02 +0000 |
commit | 4fa488fb0159c629483b7994aa84e73926b132b9 (patch) | |
tree | 182a19db69cdcb92be54cc6a5b0b9bfab28f80fd /storage/innobase | |
parent | Adding debian version 1:10.11.6-2. (diff) | |
download | mariadb-4fa488fb0159c629483b7994aa84e73926b132b9.tar.xz mariadb-4fa488fb0159c629483b7994aa84e73926b132b9.zip |
Merging upstream version 1:10.11.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase')
66 files changed, 1769 insertions, 1005 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 08be1991..705ff035 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -216,10 +216,11 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index) @param[in] merge whether change buffer merge should be attempted @param[in,out] mtr mini-transaction @param[out] err error code +@param[out] first set if this is a first-time access to the page @return block */ buf_block_t *btr_block_get(const dict_index_t &index, uint32_t page, rw_lock_type_t mode, bool merge, - mtr_t *mtr, dberr_t *err) + mtr_t *mtr, dberr_t *err, bool *first) { ut_ad(mode != RW_NO_LATCH); dberr_t local_err; @@ -242,6 +243,8 @@ buf_block_t *btr_block_get(const dict_index_t &index, *err= DB_PAGE_CORRUPTED; block= nullptr; } + else if (!buf_page_make_young_if_needed(&block->page) && first) + *first= true; } else if (*err == DB_DECRYPTION_FAILED) btr_decryption_failed(index); @@ -302,6 +305,8 @@ btr_root_block_get( *err= DB_CORRUPTION; block= nullptr; } + else + buf_page_make_young_if_needed(&block->page); } else if (*err == DB_DECRYPTION_FAILED) btr_decryption_failed(*index); @@ -553,8 +558,11 @@ btr_page_alloc_for_ibuf( root->page.frame)), 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); if (new_block) + { + buf_page_make_young_if_needed(&new_block->page); *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + } ut_d(if (*err == DB_SUCCESS) flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); return new_block; @@ -873,7 +881,8 @@ static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap, /************************************************************//** Returns the upper level node pointer to a page. It is assumed that mtr holds an x-latch on the tree. 
-@return rec_get_offsets() of the node pointer record */ +@return rec_get_offsets() of the node pointer record +@retval nullptr on corruption */ static rec_offs* btr_page_get_father_block( @@ -1351,6 +1360,7 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset) if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page), space->zip_size(), RW_SX_LATCH, &mtr)) { + buf_page_make_young_if_needed(&root->page); mtr.set_named_space(space); page_set_autoinc(root, autoinc, &mtr, reset); } @@ -2542,6 +2552,11 @@ btr_attach_half_pages( offsets = btr_page_get_father_block(nullptr, heap, mtr, &cursor); + if (UNIV_UNLIKELY(!offsets)) { + mem_heap_free(heap); + return DB_CORRUPTION; + } + /* Replace the address of the old child node (= page) with the address of the new lower half */ @@ -3478,6 +3493,14 @@ btr_lift_page_up( offsets = btr_page_get_father_block(offsets, heap, mtr, &cursor); } + + if (UNIV_UNLIKELY(!offsets)) { +parent_corrupted: + mem_heap_free(heap); + *err = DB_CORRUPTION; + return nullptr; + } + father_block = btr_cur_get_block(&cursor); father_page_zip = buf_block_get_page_zip(father_block); @@ -3502,6 +3525,10 @@ btr_lift_page_up( &cursor); } + if (UNIV_UNLIKELY(!offsets)) { + goto parent_corrupted; + } + blocks[n_blocks++] = b = btr_cur_get_block(&cursor); } @@ -3717,6 +3744,11 @@ btr_compress( NULL, heap, mtr, &father_cursor); } + if (UNIV_UNLIKELY(!offsets)) { + err = DB_CORRUPTION; + goto func_exit; + } + if (adjust) { nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) { diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index 013cd131..5bf68c58 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -52,6 +52,7 @@ PageBulk::init() if (m_page_no == FIL_NULL) { mtr_t alloc_mtr; + dberr_t err= DB_SUCCESS; /* We commit redo log for allocation by a separate mtr, because we don't guarantee pages are 
committed following @@ -60,28 +61,15 @@ PageBulk::init() alloc_mtr.start(); m_index->set_modified(alloc_mtr); - uint32_t n_reserved; - dberr_t err = fsp_reserve_free_extents( - &n_reserved, m_index->table->space, 1, FSP_NORMAL, - &alloc_mtr); - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { -oom: - alloc_mtr.commit(); - m_mtr.commit(); - return err; - } - /* Allocate a new page. */ new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, &alloc_mtr, &m_mtr, &err); + alloc_mtr.commit(); if (!new_block) { - goto oom; + m_mtr.commit(); + return err; } - m_index->table->space->release_free_extents(n_reserved); - - alloc_mtr.commit(); - new_page = buf_block_get_frame(new_block); m_page_no = new_block->page.id().page_no(); @@ -969,10 +957,10 @@ BtrBulk::pageCommit( /** Log free check */ inline void BtrBulk::logFreeCheck() { - if (log_sys.check_flush_or_checkpoint()) { + if (log_sys.check_for_checkpoint()) { release(); - log_check_margins(); + log_free_check(); latch(); } diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index e736f338..46afb73b 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1156,6 +1156,19 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, mtr_s_lock_index(index(), mtr); } + dberr_t err; + + if (!index()->table->space) + { + corrupted: + ut_ad("corrupted" == 0); // FIXME: remove this + err= DB_CORRUPTION; + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + const ulint zip_size= index()->table->space->zip_size(); /* Start with the root page. 
*/ @@ -1169,7 +1182,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, low_bytes= 0; ulint buf_mode= BUF_GET; search_loop: - dberr_t err; auto block_savepoint= mtr->get_savepoint(); buf_block_t *block= buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr, @@ -1181,10 +1193,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, btr_decryption_failed(*index()); /* fall through */ default: - func_exit: - if (UNIV_LIKELY_NULL(heap)) - mem_heap_free(heap); - return err; + goto func_exit; case DB_SUCCESS: /* This must be a search to perform an insert, delete mark, or delete; try using the change buffer */ @@ -1251,16 +1260,11 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, btr_page_get_index_id(block->page.frame) != index()->id || fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || !fil_page_index_page_check(block->page.frame)) - { - corrupted: - ut_ad("corrupted" == 0); // FIXME: remove this - err= DB_CORRUPTION; - goto func_exit; - } + goto corrupted; page_cur.block= block; ut_ad(block == mtr->at_savepoint(block_savepoint)); - ut_ad(rw_latch != RW_NO_LATCH); + const bool not_first_access{buf_page_make_young_if_needed(&block->page)}; #ifdef UNIV_ZIP_DEBUG if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) ut_a(page_zip_validate(page_zip, block->page.frame, index())); @@ -1539,6 +1543,9 @@ release_tree: case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + if (!not_first_access) + buf_read_ahead_linear(page_id, zip_size, false); + if (page_has_prev(block->page.frame) && page_rec_is_first(page_cur.rec, block->page.frame)) { @@ -1578,6 +1585,8 @@ release_tree: buf_mode= btr_op == BTR_DELETE_OP ? 
BUF_GET_IF_IN_POOL_OR_WATCH : BUF_GET_IF_IN_POOL; + else if (!not_first_access) + buf_read_ahead_linear(page_id, zip_size, false); break; case BTR_MODIFY_TREE: ut_ad(rw_latch == RW_X_LATCH); @@ -1611,6 +1620,14 @@ ATTRIBUTE_COLD void mtr_t::index_lock_upgrade() slot.type= MTR_MEMO_X_LOCK; } +/** Mark a non-leaf page "least recently used", but avoid invoking +buf_page_t::set_accessed(), because we do not want linear read-ahead */ +static void btr_cur_nonleaf_make_young(buf_page_t *bpage) +{ + if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) + buf_page_make_young(bpage); +} + ATTRIBUTE_COLD dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, mtr_t *mtr) @@ -1713,6 +1730,8 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, if (height != btr_page_get_level(block->page.frame)) goto corrupted; + btr_cur_nonleaf_make_young(&block->page); + #ifdef UNIV_ZIP_DEBUG const page_zip_des_t *page_zip= buf_block_get_page_zip(block); ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index())); @@ -1799,6 +1818,8 @@ search_loop: btr_decryption_failed(*index); goto func_exit; } + else + btr_cur_nonleaf_make_young(&block->page); #ifdef UNIV_ZIP_DEBUG if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) @@ -1934,18 +1955,15 @@ index_locked: ut_ad(n_blocks < BTR_MAX_LEVELS); ut_ad(savepoint + n_blocks == mtr->get_savepoint()); + bool first_access= false; buf_block_t* block= btr_block_get(*index, page, height ? 
upper_rw_latch : root_leaf_rw_latch, - !height, mtr, &err); + !height, mtr, &err, &first_access); ut_ad(!block == (err != DB_SUCCESS)); if (!block) - { - if (err == DB_DECRYPTION_FAILED) - btr_decryption_failed(*index); break; - } if (first) page_cur_set_before_first(block, &page_cur); @@ -2029,10 +2047,16 @@ index_locked: offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, &heap); + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); - if (latch_mode != BTR_MODIFY_TREE); + if (latch_mode != BTR_MODIFY_TREE) + { + if (!height && first && first_access) + buf_read_ahead_linear(page_id_t(block->page.id().space(), page), + block->page.zip_size(), false); + } else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), lock_intention, node_ptr_max_size, compress_limit, @@ -2070,7 +2094,6 @@ index_locked: } /* Go to the child node */ - page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); n_blocks++; } @@ -3837,22 +3860,14 @@ btr_cur_pess_upd_restore_supremum( const page_id_t block_id{block->page.id()}; const page_id_t prev_id(block_id.space(), prev_page_no); - dberr_t err; buf_block_t* prev_block - = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr, - BUF_PEEK_IF_IN_POOL, mtr, &err); - /* Since we already held an x-latch on prev_block, it must - be available and not be corrupted unless the buffer pool got - corrupted somehow. */ + = mtr->get_already_latched(prev_id, MTR_MEMO_PAGE_X_FIX); if (UNIV_UNLIKELY(!prev_block)) { - return err; + return DB_CORRUPTION; } ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, 4)); - /* We must already have an x-latch on prev_block! 
*/ - ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); - lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id, PAGE_HEAP_NO_SUPREMUM, page_rec_get_heap_no(rec)); @@ -6660,6 +6675,10 @@ btr_copy_blob_prefix( mtr.commit(); return copied_len; } + if (!buf_page_make_young_if_needed(&block->page)) { + buf_read_ahead_linear(id, 0, false); + } + page = buf_block_get_frame(block); blob_header = page + offset; diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index 54dd15ac..2131fb94 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -25,9 +25,10 @@ Created 2/23/1996 Heikki Tuuri *******************************************************/ #include "btr0pcur.h" -#include "ut0byte.h" +#include "buf0rea.h" #include "rem0cmp.h" #include "trx0trx.h" +#include "ibuf0ibuf.h" /**************************************************************//** Resets a persistent cursor object, freeing ::old_rec_buf if it is @@ -261,13 +262,15 @@ static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block, buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size, mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr); - if (left_block && - btr_page_get_next(left_block->page.frame) != id.page_no()) + if (!left_block); + else if (btr_page_get_next(left_block->page.frame) != id.page_no()) { release_left_block: mtr->release_last_page(); return false; } + else + buf_page_make_young_if_needed(&left_block->page); } if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr)) @@ -539,10 +542,11 @@ btr_pcur_move_to_next_page( } dberr_t err; + bool first_access = false; buf_block_t* next_block = btr_block_get( *cursor->index(), next_page_no, rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)), - page_is_leaf(page), mtr, &err); + page_is_leaf(page), mtr, &err, &first_access); if (UNIV_UNLIKELY(!next_block)) { return err; @@ -561,6 +565,11 @@ btr_pcur_move_to_next_page( const auto s = mtr->get_savepoint(); 
mtr->rollback_to_savepoint(s - 2, s - 1); + if (first_access) { + buf_read_ahead_linear(next_block->page.id(), + next_block->zip_size(), + ibuf_inside(mtr)); + } return DB_SUCCESS; } diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index 8435047c..1c5928c4 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -1143,7 +1143,6 @@ block_and_ahi_release_and_fail: } block->page.fix(); - block->page.set_accessed(); buf_page_make_young_if_needed(&block->page); static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF}, ""); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 8ef18ee0..23b5b776 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -404,7 +404,7 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, if (id.space() == SRV_TMP_SPACE_ID && innodb_encrypt_temporary_tables) { - slot = buf_pool.io_buf_reserve(); + slot = buf_pool.io_buf_reserve(false); slot->allocate(); bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); slot->release(); @@ -426,7 +426,7 @@ decompress: return false; } - slot = buf_pool.io_buf_reserve(); + slot = buf_pool.io_buf_reserve(false); slot->allocate(); decompress_with_slot: @@ -449,7 +449,7 @@ decrypt_failed: return false; } - slot = buf_pool.io_buf_reserve(); + slot = buf_pool.io_buf_reserve(false); slot->allocate(); /* decrypt using crypt_buf to dst_frame */ @@ -742,6 +742,205 @@ bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf, #ifndef UNIV_INNOCHECKSUM +#ifdef __linux__ +#include <poll.h> +#include <sys/eventfd.h> +#include <fstream> + +/** Memory Pressure + +based off https://www.kernel.org/doc/html/latest/accounting/psi.html#pressure-interface +and https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory */ +class mem_pressure +{ + /* triggers + eventfd */ + struct pollfd m_fds[3]; + nfds_t m_num_fds; + int m_event_fd= -1; + Atomic_relaxed<bool> m_abort= 
false; + + std::thread m_thd; + /* mem pressure garbage collection restricted to interval */ + static constexpr ulonglong max_interval_us= 60*1000000; + +public: + mem_pressure() : m_num_fds(0) {} + + bool setup() + { + static_assert(array_elements(m_fds) == (array_elements(m_triggers) + 1), + "insufficient fds"); + std::string memcgroup{"/sys/fs/cgroup"}; + std::string cgroup; + { + std::ifstream selfcgroup("/proc/self/cgroup"); + std::getline(selfcgroup, cgroup, '\n'); + } + + cgroup.erase(0, 3); // Remove "0::" + memcgroup+= cgroup + "/memory.pressure"; + + m_num_fds= 0; + for (auto trig= std::begin(m_triggers); trig!= std::end(m_triggers); ++trig) + { + if ((m_fds[m_num_fds].fd= + open(memcgroup.c_str(), O_RDWR | O_NONBLOCK | O_CLOEXEC)) < 0) + { + /* User can't do anything about it, no point giving warning */ + shutdown(); + return false; + } + my_register_filename(m_fds[m_num_fds].fd, memcgroup.c_str(), FILE_BY_OPEN, 0, MYF(0)); + ssize_t slen= strlen(*trig); + if (write(m_fds[m_num_fds].fd, *trig, slen) < slen) + { + /* we may fail this one, but continue to the next */ + my_close(m_fds[m_num_fds].fd, MYF(MY_WME)); + continue; + } + m_fds[m_num_fds].events= POLLPRI; + m_num_fds++; + } + if (m_num_fds < 1) + return false; + + if ((m_event_fd= eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK)) == -1) + { + /* User can't do anything about it, no point giving warning */ + shutdown(); + return false; + } + my_register_filename(m_event_fd, "mem_pressure_eventfd", FILE_BY_DUP, 0, MYF(0)); + m_fds[m_num_fds].fd= m_event_fd; + m_fds[m_num_fds].events= POLLIN; + m_num_fds++; + m_thd= std::thread(pressure_routine, this); + sql_print_information("InnoDB: Initialized memory pressure event listener"); + return true; + } + + void shutdown() + { + /* m_event_fd is in this list */ + while (m_num_fds) + { + m_num_fds--; + my_close(m_fds[m_num_fds].fd, MYF(MY_WME)); + m_fds[m_num_fds].fd= -1; + } + } + + static void pressure_routine(mem_pressure *m); + +#ifdef UNIV_DEBUG + void 
trigger_collection() + { + uint64_t u= 1; + if (m_event_fd >=0 && write(m_event_fd, &u, sizeof(uint64_t)) != sizeof(uint64_t)) + sql_print_information("InnoDB: (Debug) Failed to trigger memory pressure"); + else /* assumed failed to meet initialization criteria, so trigger directly */ + buf_pool.garbage_collect(); + } +#endif + + void quit() + { + uint64_t u= 1; + m_abort= true; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-result" + /* return result ignored, cannot do anything with it */ + write(m_event_fd, &u, sizeof(uint64_t)); +#pragma GCC diagnostic pop + } + + void join() + { + if (m_thd.joinable()) + { + quit(); + m_thd.join(); + } + } + + static const char* const m_triggers[2]; +}; + + +/* + ref: https://docs.kernel.org/accounting/psi.html + maximum window size (second number) 10 seconds. + window size in multiples of 2 second interval required (for Unprivileged) + Time is in usec. +*/ +const char* const mem_pressure::m_triggers[]= + {"some 5000000 10000000", /* 5s out of 10s */ + "full 10000 2000000"}; /* 10ms out of 2s */ + +static mem_pressure mem_pressure_obj; + +void mem_pressure::pressure_routine(mem_pressure *m) +{ + DBUG_ASSERT(m == &mem_pressure_obj); + if (my_thread_init()) + { + m->shutdown(); + return; + } + + ulonglong last= microsecond_interval_timer() - max_interval_us; + while (!m->m_abort) + { + if (poll(&m->m_fds[0], m->m_num_fds, -1) < 0) + { + if (errno == EINTR) + continue; + else + break; + } + if (!m->m_abort) + break; + + for (pollfd &p : st_::span<pollfd>(m->m_fds, m->m_num_fds)) + { + if (p.revents & POLLPRI) + { + ulonglong now= microsecond_interval_timer(); + if ((now - last) > max_interval_us) + { + last= now; + buf_pool.garbage_collect(); + } + } + +#ifdef UNIV_DEBUG + if (p.revents & POLLIN) + { + uint64_t u; + /* we haven't aborted, so this must be a debug trigger */ + if (read(p.fd, &u, sizeof(u)) >=0) + buf_pool.garbage_collect(); + } +#endif + } + } + m->shutdown(); + + my_thread_end(); +} + +/** 
Initialize mem pressure. */ +ATTRIBUTE_COLD void buf_mem_pressure_detect_init() +{ + mem_pressure_obj.setup(); +} + +ATTRIBUTE_COLD void buf_mem_pressure_shutdown() +{ + mem_pressure_obj.join(); +} +#endif /* __linux__ */ + #if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) /** Enable buffers to be dumped to core files @@ -1099,6 +1298,11 @@ bool buf_pool_t::create() chunk_t::map_ref= chunk_t::map_reg; buf_LRU_old_ratio_update(100 * 3 / 8, false); btr_search_sys_create(); + +#ifdef __linux__ + if (srv_operation == SRV_OPERATION_NORMAL) + buf_mem_pressure_detect_init(); +#endif ut_ad(is_initialised()); return false; } @@ -1300,14 +1504,17 @@ void buf_pool_t::io_buf_t::close() n_slots= 0; } -buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve(bool wait_for_reads) { for (;;) { for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) if (s->acquire()) return s; + buf_dblwr.flush_buffered_writes(); os_aio_wait_until_no_pending_writes(true); + if (!wait_for_reads) + continue; for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) if (s->acquire()) return s; @@ -1536,6 +1743,7 @@ struct find_interesting_trx inline void buf_pool_t::resize() { ut_ad(this == &buf_pool); + ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); bool warning = false; @@ -1878,6 +2086,100 @@ calc_buf_pool_size: return; } +#ifdef __linux__ +inline void buf_pool_t::garbage_collect() +{ + mysql_mutex_lock(&mutex); + size_t freed= 0; + +#ifdef BTR_CUR_HASH_ADAPT + /* buf_LRU_free_page() will temporarily release and reacquire + buf_pool.mutex for invoking btr_search_drop_page_hash_index(). Thus, + we must protect ourselves with the hazard pointer. 
*/ +rescan: +#else + lru_hp.set(nullptr); +#endif + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev; bpage; bpage= prev) + { + prev= UT_LIST_GET_PREV(LRU, bpage); +#ifdef BTR_CUR_HASH_ADAPT + lru_hp.set(prev); +#endif + auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(bpage->in_LRU_list); + + /* We try to free any pages that can be freed without writing out + anything. */ + switch (bpage->oldest_modification()) { + case 0: + try_to_evict: + if (buf_LRU_free_page(bpage, true)) + { + evicted: + freed++; +#ifdef BTR_CUR_HASH_ADAPT + bpage= prev; + prev= lru_hp.get(); + if (!prev && bpage) + goto rescan; +#endif + } + continue; + case 1: + break; + default: + if (state >= buf_page_t::UNFIXED) + continue; + } + + if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) + { + ut_ad(!bpage->is_io_fixed()); + lsn_t oldest_modification= bpage->oldest_modification(); + switch (oldest_modification) { + case 1: + mysql_mutex_lock(&flush_list_mutex); + oldest_modification= bpage->oldest_modification(); + if (oldest_modification) + { + ut_ad(oldest_modification == 1); + delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&flush_list_mutex); + /* fall through */ + case 0: + bpage->lock.u_unlock(true); + goto try_to_evict; + default: + if (bpage->state() < buf_page_t::UNFIXED && + oldest_modification <= log_sys.get_flushed_lsn()) + { + release_freed_page(bpage); + goto evicted; + } + else + bpage->lock.u_unlock(true); + } + } + } + +#if defined MADV_FREE + /* FIXME: Issue fewer calls for larger contiguous blocks of + memory. For now, we assume that this is acceptable, because this + code should be executed rarely. 
*/ + for (buf_page_t *bpage= UT_LIST_GET_FIRST(free); bpage; + bpage= UT_LIST_GET_NEXT(list, bpage)) + madvise(bpage->frame, srv_page_size, MADV_FREE); +#endif + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event freed %zu pages", + freed); + return; +} +#endif /* __linux__ */ + /** Thread pool task invoked by innodb_buffer_pool_size changes. */ static void buf_resize_callback(void *) { @@ -1906,12 +2208,23 @@ static tpool::waitable_task buf_resize_task(buf_resize_callback, void buf_resize_start() { - srv_thread_pool->submit_task(&buf_resize_task); +#if !defined(DBUG_OFF) && defined(__linux__) + DBUG_EXECUTE_IF("trigger_garbage_collection", + { + mem_pressure_obj.trigger_collection(); + } + ); +#endif + + srv_thread_pool->submit_task(&buf_resize_task); } void buf_resize_shutdown() { - buf_resize_task.wait(); +#ifdef __linux__ + buf_mem_pressure_shutdown(); +#endif + buf_resize_task.wait(); } @@ -2220,14 +2533,21 @@ lookup: if (discard_attempted || !bpage->frame) { - /* Even when we are holding a hash_lock, it should be - acceptable to wait for a page S-latch here, because - buf_page_t::read_complete() will not wait for buf_pool.mutex, - and because S-latch would not conflict with a U-latch - that would be protecting buf_page_t::write_complete(). */ - bpage->lock.s_lock(); + const bool got_s_latch= bpage->lock.s_lock_try(); hash_lock.unlock_shared(); - break; + if (UNIV_LIKELY(got_s_latch)) + break; + /* We may fail to acquire bpage->lock because + buf_page_t::read_complete() may be invoking + buf_pool_t::corrupted_evict() on this block, which it would + hold an exclusive latch on. + + Let us acquire and release buf_pool.mutex to ensure that any + buf_pool_t::corrupted_evict() will proceed before we reacquire + the hash_lock that it could be waiting for. 
*/ + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_unlock(&buf_pool.mutex); + goto lookup; } hash_lock.unlock_shared(); @@ -2246,7 +2566,6 @@ lookup: ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); } - bpage->set_accessed(); buf_page_make_young_if_needed(bpage); #ifdef UNIV_DEBUG @@ -2873,18 +3192,6 @@ get_latch_valid: ut_ad(page_id_t(page_get_space_id(block->page.frame), page_get_page_no(block->page.frame)) == page_id); - - if (mode == BUF_GET_POSSIBLY_FREED - || mode == BUF_PEEK_IF_IN_POOL) { - return block; - } - - const bool not_first_access{block->page.set_accessed()}; - buf_page_make_young_if_needed(&block->page); - if (!not_first_access) { - buf_read_ahead_linear(page_id, block->zip_size(), - ibuf_inside(mtr)); - } } return block; @@ -3057,7 +3364,6 @@ bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, block->page.fix(); ut_ad(!block->page.is_read_fixed()); - block->page.set_accessed(); buf_page_make_young_if_needed(&block->page); mtr->memo_push(block, mtr_memo_type_t(rw_latch)); } diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index e9aea355..e2702adc 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -336,11 +336,14 @@ func_exit: os_file_flush(file); } else - for (ulint i= 0; i < size * 2; i++, page += srv_page_size) - if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN))) - /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */ + { + alignas(8) char checkpoint[8]; + mach_write_to_8(checkpoint, log_sys.next_checkpoint_lsn); + for (auto i= size * 2; i--; page += srv_page_size) + if (memcmp_aligned<8>(page + FIL_PAGE_LSN, checkpoint, 8) >= 0) + /* Valid pages are not older than the log checkpoint. 
*/ recv_sys.dblwr.add(page); - + } err= DB_SUCCESS; goto func_exit; } diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index 957632db..cc51f8c6 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -33,7 +33,7 @@ Created April 08, 2011 Vasil Dimov #include "buf0rea.h" #include "buf0dump.h" -#include "dict0dict.h" +#include "dict0load.h" #include "os0file.h" #include "srv0srv.h" #include "srv0start.h" @@ -180,7 +180,7 @@ static void buf_dump_generate_path(char *path, size_t path_size) char buf[FN_REFLEN]; mysql_mutex_lock(&LOCK_global_system_variables); - snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(), + snprintf(buf, sizeof buf, "%s" FN_ROOTDIR "%s", get_buf_dump_dir(), srv_buf_dump_filename); mysql_mutex_unlock(&LOCK_global_system_variables); @@ -214,7 +214,7 @@ static void buf_dump_generate_path(char *path, size_t path_size) format = "%s%s"; break; default: - format = "%s/%s"; + format = "%s" FN_ROOTDIR "%s"; } snprintf(path, path_size, format, @@ -562,6 +562,22 @@ buf_load() if (!SHUTTING_DOWN()) { std::sort(dump, dump + dump_n); + std::set<uint32_t> missing; + for (const page_id_t id : st_::span<const page_id_t> + (dump, dump_n)) { + missing.emplace(id.space()); + } + for (std::set<uint32_t>::iterator i = missing.begin(); + i != missing.end(); ) { + auto j = i++; + if (fil_space_t* space = fil_space_t::get(*j)) { + space->release(); + missing.erase(j); + } + } + if (!missing.empty()) { + dict_check_tablespaces_and_store_max_id(&missing); + } } /* Avoid calling the expensive fil_space_t::get() for each diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index b6357989..d4628985 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -655,7 +655,7 @@ static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, ut_ad(!bpage->zip_size() || !page_compressed); /* Find free slot from temporary memory array */ - *slot= 
buf_pool.io_buf_reserve(); + *slot= buf_pool.io_buf_reserve(true); ut_a(*slot); (*slot)->allocate(); @@ -754,16 +754,20 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) ut_ad(space->referenced()); const auto s= state(); - ut_a(s >= FREED); + + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); + ut_ad(lsn + ? lsn >= oldest_modification() || oldest_modification() == 2 + : space->purpose != FIL_TYPE_TABLESPACE); if (s < UNFIXED) { + ut_a(s >= FREED); if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) { - const lsn_t lsn= - mach_read_from_8(my_assume_aligned<8> - (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); - ut_ad(lsn >= oldest_modification()); + freed: if (lsn > log_sys.get_flushed_lsn()) { mysql_mutex_unlock(&buf_pool.mutex); @@ -775,6 +779,12 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) return false; } + if (UNIV_UNLIKELY(lsn < space->get_create_lsn())) + { + ut_ad(space->purpose == FIL_TYPE_TABLESPACE); + goto freed; + } + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_ad(f >= UNFIXED); ut_ad(f < READ_FIX); @@ -869,15 +879,9 @@ bool buf_page_t::flush(bool evict, fil_space_t *space) if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) { - if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) - { - const lsn_t lsn= - mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN + - (write_frame ? 
write_frame - : frame))); - ut_ad(lsn >= oldest_modification()); + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE) && + lsn > log_sys.get_flushed_lsn()) log_write_up_to(lsn, true); - } space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); } @@ -1057,11 +1061,25 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, bool contiguous, bool evict, ulint n_flushed, ulint n_to_flush) { - mysql_mutex_unlock(&buf_pool.mutex); - ut_ad(space->id == page_id.space()); ut_ad(bpage->id() == page_id); + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + + (bpage->zip.data ? bpage->zip.data : bpage->frame))); + ut_ad(lsn >= bpage->oldest_modification()); + if (UNIV_UNLIKELY(lsn < space->get_create_lsn())) + { + ut_a(!bpage->flush(evict, space)); + mysql_mutex_unlock(&buf_pool.mutex); + return 0; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + ulint count= 0; page_id_t id= page_id; page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); @@ -1741,6 +1759,28 @@ ulint buf_flush_LRU(ulint max_n, bool evict) buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); } + else if (!pages && !buf_pool.try_LRU_scan && + !buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) + { + /* For example, with the minimum innodb_buffer_pool_size=5M and + the default innodb_page_size=16k there are only a little over 316 + pages in the buffer pool. The buffer pool can easily be exhausted + by a workload of some dozen concurrent connections. The system could + reach a deadlock like the following: + + (1) Many threads are waiting in buf_LRU_get_free_block() + for buf_pool.done_free. + (2) Some threads are waiting for a page latch which is held by + another thread that is waiting in buf_LRU_get_free_block(). + (3) This thread is the only one that could make progress, but + we fail to do so because all the pages that we scanned are + buffer-fixed or latched by some thread. 
*/ + sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" + " %zu blocks are in use and %zu free." + " Consider increasing innodb_buffer_pool_size.", + UT_LIST_GET_LEN(buf_pool.LRU), + UT_LIST_GET_LEN(buf_pool.free)); + } return pages; } @@ -2124,6 +2164,8 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) limit= lsn; buf_pool.page_cleaner_set_idle(false); pthread_cond_signal(&buf_pool.do_flush_list); + if (furious) + log_sys.set_check_for_checkpoint(); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); } @@ -2371,11 +2413,19 @@ func_exit: goto func_exit; } +TPOOL_SUPPRESS_TSAN +bool buf_pool_t::need_LRU_eviction() const +{ + /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting + for buf_flush_page_cleaner() to evict some blocks */ + return UNIV_UNLIKELY(!try_LRU_scan || + (UT_LIST_GET_LEN(LRU) > BUF_LRU_MIN_LEN && + UT_LIST_GET_LEN(free) < srv_LRU_scan_depth / 2)); +} + #if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ -/* Avoid GCC 4.8.5 internal compiler error "could not split insn". -We would only need this for buf_flush_page_cleaner(), -but GCC 4.8.5 does not support pop_options. */ -# pragma GCC optimize ("O0") +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */ +__attribute__((optimize(0))) #endif /** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one coordinator. 
*/ @@ -2409,21 +2459,24 @@ static void buf_flush_page_cleaner() } mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (buf_pool.ran_out()) - goto no_wait; - else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) - break; + if (!buf_pool.need_LRU_eviction()) + { + if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; - if (buf_pool.page_cleaner_idle() && - (!UT_LIST_GET_LEN(buf_pool.flush_list) || - srv_max_dirty_pages_pct_lwm == 0.0)) - /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ - my_cond_wait(&buf_pool.do_flush_list, - &buf_pool.flush_list_mutex.m_mutex); - else - my_cond_timedwait(&buf_pool.do_flush_list, - &buf_pool.flush_list_mutex.m_mutex, &abstime); - no_wait: + if (buf_pool.page_cleaner_idle() && + (!UT_LIST_GET_LEN(buf_pool.flush_list) || + srv_max_dirty_pages_pct_lwm == 0.0)) + { + buf_pool.LRU_warned.clear(std::memory_order_release); + /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ + my_cond_wait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + } + else + my_cond_timedwait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex, &abstime); + } set_timespec(abstime, 1); lsn_limit= buf_flush_sync_lsn; @@ -2445,9 +2498,9 @@ static void buf_flush_page_cleaner() do { - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); - + IF_DBUG(if (_db_keyword_(nullptr, "ib_log_checkpoint_avoid", 1) || + _db_keyword_(nullptr, "ib_log_checkpoint_avoid_hard", 1)) + continue,); if (!recv_recovery_is_on() && !srv_startup_is_before_trx_rollback_phase && srv_operation <= SRV_OPERATION_EXPORT_RESTORED) @@ -2455,7 +2508,7 @@ static void buf_flush_page_cleaner() } while (false); - if (!buf_pool.ran_out()) + if (!buf_pool.need_LRU_eviction()) continue; mysql_mutex_lock(&buf_pool.flush_list_mutex); oldest_lsn= buf_pool.get_oldest_modification(0); @@ -2484,7 +2537,7 @@ static void buf_flush_page_cleaner() if (oldest_lsn >= soft_lsn_limit) buf_flush_async_lsn= 
soft_lsn_limit= 0; } - else if (buf_pool.ran_out()) + else if (buf_pool.need_LRU_eviction()) { buf_pool.page_cleaner_set_idle(false); buf_pool.n_flush_inc(); @@ -2549,10 +2602,11 @@ static void buf_flush_page_cleaner() else { maybe_unemployed: - const bool below{dirty_pct < pct_lwm}; - pct_lwm= 0.0; - if (below) + if (dirty_pct < pct_lwm) + { + pct_lwm= 0.0; goto possibly_unemployed; + } } } else if (dirty_pct < srv_max_buf_pool_modified_pct) @@ -2598,9 +2652,13 @@ static void buf_flush_page_cleaner() MONITOR_FLUSH_ADAPTIVE_PAGES, n_flushed); } - else if (buf_flush_async_lsn <= oldest_lsn) + else if (buf_flush_async_lsn <= oldest_lsn && + !buf_pool.need_LRU_eviction()) goto check_oldest_and_set_idle; + else + mysql_mutex_lock(&buf_pool.mutex); + n= srv_max_io_capacity; n= n >= n_flushed ? n - n_flushed : 0; goto LRU_flush; } diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 65ee8fa3..2a8d6ff2 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -60,10 +60,6 @@ static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20; frames in the buffer pool, we set this to TRUE */ static bool buf_lru_switched_on_innodb_mon = false; -/** True if diagnostic message about difficult to find free blocks -in the buffer bool has already printed. */ -static bool buf_lru_free_blocks_error_printed; - /******************************************************************//** These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O and page_zip_decompress() operations. 
Based on the statistics, @@ -408,6 +404,7 @@ got_mutex: buf_LRU_check_size_of_non_data_objects(); buf_block_t* block; + IF_DBUG(static bool buf_lru_free_blocks_error_printed,); DBUG_EXECUTE_IF("ib_lru_force_no_free_page", if (!buf_lru_free_blocks_error_printed) { n_iterations = 21; @@ -417,9 +414,25 @@ retry: /* If there is a block in the free list, take it */ if ((block = buf_LRU_get_free_only()) != nullptr) { got_block: + const ulint LRU_size = UT_LIST_GET_LEN(buf_pool.LRU); + const ulint available = UT_LIST_GET_LEN(buf_pool.free); + const ulint scan_depth = srv_LRU_scan_depth / 2; + ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth + || buf_pool.need_LRU_eviction()); + if (!have_mutex) { mysql_mutex_unlock(&buf_pool.mutex); } + + if (UNIV_UNLIKELY(available < scan_depth) + && LRU_size > BUF_LRU_MIN_LEN) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.page_cleaner_active()) { + buf_pool.page_cleaner_wakeup(true); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + block->page.zip.clear(); return block; } @@ -445,10 +458,11 @@ got_block: if ((block = buf_LRU_get_free_only()) != nullptr) { goto got_block; } + const bool wake = buf_pool.need_LRU_eviction(); mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex); const auto n_flush = buf_pool.n_flush(); - if (!buf_pool.try_LRU_scan) { + if (wake && !buf_pool.page_cleaner_active()) { buf_pool.page_cleaner_wakeup(true); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -467,9 +481,10 @@ not_found: MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); } - if (n_iterations == 21 && !buf_lru_free_blocks_error_printed - && srv_buf_pool_old_size == srv_buf_pool_size) { - buf_lru_free_blocks_error_printed = true; + if (n_iterations == 21 + && srv_buf_pool_old_size == srv_buf_pool_size + && buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) { + IF_DBUG(buf_lru_free_blocks_error_printed = true,); mysql_mutex_unlock(&buf_pool.mutex); ib::warn() << "Difficult to 
find free blocks in the buffer pool" " (" << n_iterations << " search iterations)! " @@ -787,6 +802,14 @@ void buf_page_make_young(buf_page_t *bpage) mysql_mutex_unlock(&buf_pool.mutex); } +bool buf_page_make_young_if_needed(buf_page_t *bpage) +{ + const bool not_first{bpage->set_accessed()}; + if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) + buf_page_make_young(bpage); + return not_first; +} + /** Try to free a block. If bpage is a descriptor of a compressed-only ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. The caller must hold buf_pool.mutex. diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index c4f07738..9041c6a2 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -597,6 +597,12 @@ failed: uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV)); uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT)); hash_lock.unlock_shared(); + /* The underlying file page of this buffer pool page could actually + be marked as freed, or a read of the page into the buffer pool might + be in progress. We may read uninitialized data here. + Suppress warnings of comparing uninitialized values. */ + MEM_MAKE_DEFINED(&prev, sizeof prev); + MEM_MAKE_DEFINED(&next, sizeof next); if (prev == FIL_NULL || next == FIL_NULL) goto fail; page_id_t id= page_id; diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 5516bce9..cb60d813 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -42,7 +42,10 @@ static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO}; static buf_block_t *dict_hdr_get(mtr_t *mtr) { /* We assume that the DICT_HDR page is always readable and available. 
*/ - return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr); + buf_block_t *b= + buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr); + buf_page_make_young_if_needed(&b->page); + return b; } /**********************************************************************//** diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index cce5f2f2..dd858287 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -353,9 +353,6 @@ dict_build_table_def_step( /* Always set this bit for all new created tables */ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - DICT_TF2_FLAG_UNSET(table, - DICT_TF2_FTS_AUX_HEX_NAME);); if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) { /* This table will need a new tablespace. */ diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 5bc7ab6e..5d3cab17 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2809,8 +2809,7 @@ dict_foreign_find_index( for (dict_index_t* index = dict_table_get_first_index(table); index; index = dict_table_get_next_index(index)) { - if (types_idx != index - && !index->to_be_dropped + if (!index->to_be_dropped && !dict_index_is_online_ddl(index) && dict_foreign_qualify_index( table, col_names, columns, n_cols, @@ -3530,6 +3529,7 @@ dict_foreign_parse_drop_constraints( const char* ptr1; const char* id; CHARSET_INFO* cs; + bool if_exists = false; ut_a(trx->mysql_thd); @@ -3583,6 +3583,7 @@ loop: ptr1 = dict_accept(cs, ptr1, "EXISTS", &success); if (success) { ptr = ptr1; + if_exists = true; } } @@ -3593,14 +3594,14 @@ loop: goto syntax_error; } - ut_a(*n < 1000); - (*constraints_to_drop)[*n] = id; - (*n)++; - if (std::find_if(table->foreign_set.begin(), - table->foreign_set.end(), - dict_foreign_matches_id(id)) - == table->foreign_set.end()) { + table->foreign_set.end(), + 
dict_foreign_matches_id(id)) + == table->foreign_set.end()) { + + if (if_exists) { + goto loop; + } if (!srv_read_only_mode) { FILE* ef = dict_foreign_err_file; @@ -3622,6 +3623,9 @@ loop: return(DB_CANNOT_DROP_CONSTRAINT); } + ut_a(*n < 1000); + (*constraints_to_drop)[*n] = id; + (*n)++; goto loop; syntax_error: diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc index f769839d..e7735586 100644 --- a/storage/innobase/dict/dict0load.cc +++ b/storage/innobase/dict/dict0load.cc @@ -33,8 +33,8 @@ Created 4/24/1996 Heikki Tuuri #include "dict0boot.h" #include "dict0crea.h" #include "dict0dict.h" -#include "dict0mem.h" #include "dict0stats.h" +#include "ibuf0ibuf.h" #include "fsp0file.h" #include "fts0priv.h" #include "mach0data.h" @@ -865,18 +865,30 @@ err_exit: return READ_OK; } -/** Check each tablespace found in the data dictionary. -Then look at each table defined in SYS_TABLES that has a space_id > 0 -to find all the file-per-table tablespaces. +/** @return SELECT MAX(space) FROM sys_tables */ +static uint32_t dict_find_max_space_id(btr_pcur_t *pcur, mtr_t *mtr) +{ + uint32_t max_space_id= 0; -In a crash recovery we already have some tablespace objects created from -processing the REDO log. We will compare the -space_id information in the data dictionary to what we find in the -tablespace file. In addition, more validation will be done if recovery -was needed and force_recovery is not set. + for (const rec_t *rec= dict_startscan_system(pcur, mtr, dict_sys.sys_tables); + rec; rec= dict_getnext_system_low(pcur, mtr)) + if (!dict_sys_tables_rec_check(rec)) + { + ulint len; + const byte *field= + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &len); + ut_ad(len == 4); + max_space_id= std::max(max_space_id, mach_read_from_4(field)); + } + + return max_space_id; +} -We also scan the biggest space id, and store it to fil_system. 
*/ -void dict_check_tablespaces_and_store_max_id() +/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system. +Open each data file if an encryption plugin has been loaded. + +@param spaces set of tablespace files to open */ +void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces) { uint32_t max_space_id = 0; btr_pcur_t pcur; @@ -888,6 +900,12 @@ void dict_check_tablespaces_and_store_max_id() dict_sys.lock(SRW_LOCK_CALL); + if (!spaces && ibuf.empty + && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) { + max_space_id = dict_find_max_space_id(&pcur, &mtr); + goto done; + } + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_tables); rec; rec = dict_getnext_system_low(&pcur, &mtr)) { @@ -919,14 +937,6 @@ void dict_check_tablespaces_and_store_max_id() continue; } - if (flags2 & DICT_TF2_DISCARDED) { - sql_print_information("InnoDB: Ignoring tablespace" - " for %.*s because " - "the DISCARD flag is set", - static_cast<int>(len), field); - continue; - } - /* For tables or partitions using .ibd files, the flag DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN before MySQL 5.6.5. The flag should not have been @@ -939,6 +949,19 @@ void dict_check_tablespaces_and_store_max_id() continue; } + if (spaces && spaces->find(uint32_t(space_id)) + == spaces->end()) { + continue; + } + + if (flags2 & DICT_TF2_DISCARDED) { + sql_print_information("InnoDB: Ignoring tablespace" + " for %.*s because " + "the DISCARD flag is set", + static_cast<int>(len), field); + continue; + } + const span<const char> name{field, len}; char* filepath = fil_make_filepath(nullptr, name, @@ -971,6 +994,7 @@ void dict_check_tablespaces_and_store_max_id() ut_free(filepath); } +done: mtr.commit(); fil_set_max_space_id_if_bigger(max_space_id); @@ -2246,22 +2270,10 @@ dict_load_tablespace( /* The tablespace may already be open. 
*/ table->space = fil_space_for_table_exists_in_mem(table->space_id, table->flags); - if (table->space) { + if (table->space || table->file_unreadable) { return; } - if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) { - table->file_unreadable = true; - return; - } - - if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { - ib::error() << "Failed to find tablespace for table " - << table->name << " in the cache. Attempting" - " to load the tablespace with space id " - << table->space_id; - } - /* Use the remote filepath if needed. This parameter is optional in the call to fil_ibd_open(). If not supplied, it will be built from the table->name. */ @@ -2284,6 +2296,12 @@ dict_load_tablespace( if (!table->space) { /* We failed to find a sensible tablespace file */ table->file_unreadable = true; + + if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { + sql_print_error("InnoDB: Failed to load tablespace " + ULINTPF " for table %s", + table->space_id, table->name); + } } ut_free(filepath); diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 40969335..f11187b9 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -752,16 +752,9 @@ dict_stats_empty_index( } } -/*********************************************************************//** -Write all zeros (or 1 where it makes sense) into a table and its indexes' -statistics members. The resulting stats correspond to an empty table. 
*/ -static -void -dict_stats_empty_table( -/*===================*/ - dict_table_t* table, /*!< in/out: table */ +void dict_stats_empty_table( + dict_table_t* table, bool empty_defrag_stats) - /*!< in: whether to empty defrag stats */ { /* Initialize table/index level stats is now protected by table level lock_mutex.*/ diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index a66aac22..b0c34dc6 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -69,6 +69,8 @@ static recalc_pool_t recalc_pool; /** Whether the global data structures have been initialized */ static bool stats_initialised; +static THD *dict_stats_thd; + /*****************************************************************//** Free the resources occupied by the recalc pool, called once during thread de-initialization. */ @@ -90,6 +92,9 @@ static void dict_stats_recalc_pool_deinit() defrag_pool_t defrag_empty_pool; recalc_pool.swap(recalc_empty_pool); defrag_pool.swap(defrag_empty_pool); + + if (dict_stats_thd) + destroy_background_thd(dict_stats_thd); } /*****************************************************************//** @@ -361,52 +366,50 @@ done: { ut_ad(i->state == recalc::IN_PROGRESS); recalc_pool.erase(i); - const bool reschedule= !update_now && recalc_pool.empty(); if (err == DB_SUCCESS_LOCKED_REC) recalc_pool.emplace_back(recalc{table_id, recalc::IDLE}); mysql_mutex_unlock(&recalc_pool_mutex); - if (reschedule) - dict_stats_schedule(MIN_RECALC_INTERVAL * 1000); } return update_now; } -static tpool::timer* dict_stats_timer; -static std::mutex dict_stats_mutex; +/** Check if the recalc pool is empty. 
*/ +static bool is_recalc_pool_empty() +{ + mysql_mutex_lock(&recalc_pool_mutex); + bool empty= recalc_pool.empty(); + mysql_mutex_unlock(&recalc_pool_mutex); + return empty; +} +static tpool::timer* dict_stats_timer; static void dict_stats_func(void*) { - THD *thd= innobase_create_background_thd("InnoDB statistics"); - set_current_thd(thd); - while (dict_stats_process_entry_from_recalc_pool(thd)) {} - dict_defrag_process_entries_from_defrag_pool(thd); + if (!dict_stats_thd) + dict_stats_thd= innobase_create_background_thd("InnoDB statistics"); + set_current_thd(dict_stats_thd); + + while (dict_stats_process_entry_from_recalc_pool(dict_stats_thd)) {} + dict_defrag_process_entries_from_defrag_pool(dict_stats_thd); + + innobase_reset_background_thd(dict_stats_thd); set_current_thd(nullptr); - destroy_background_thd(thd); + if (!is_recalc_pool_empty()) + dict_stats_schedule(MIN_RECALC_INTERVAL * 1000); } void dict_stats_start() { - std::lock_guard<std::mutex> lk(dict_stats_mutex); - if (!dict_stats_timer) - dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func); + DBUG_ASSERT(!dict_stats_timer); + dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func); } static void dict_stats_schedule(int ms) { - std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock); - /* - Use try_lock() to avoid deadlock in dict_stats_shutdown(), which - uses dict_stats_mutex too. If there is simultaneous timer reschedule, - the first one will win, which is fine. - */ - if (!lk.try_lock()) - { - return; - } - if (dict_stats_timer) + if(dict_stats_timer) dict_stats_timer->set_time(ms,0); } @@ -418,7 +421,6 @@ void dict_stats_schedule_now() /** Shut down the dict_stats_thread. 
*/ void dict_stats_shutdown() { - std::lock_guard<std::mutex> lk(dict_stats_mutex); delete dict_stats_timer; dict_stats_timer= 0; } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 8a88f4e2..bd0ace7c 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -341,8 +341,9 @@ static bool fil_node_open_file_low(fil_node_t *node) ut_ad(!node->is_open()); ut_ad(node->space->is_closing()); mysql_mutex_assert_owner(&fil_system.mutex); - ulint type; static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); +#if defined _WIN32 || defined HAVE_FCNTL_DIRECT + ulint type; switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) { case 1: case 2: @@ -351,6 +352,9 @@ static bool fil_node_open_file_low(fil_node_t *node) default: type= OS_DATA_FILE; } +#else + constexpr auto type= OS_DATA_FILE; +#endif for (;;) { @@ -560,7 +564,7 @@ fil_space_extend_must_retry( ut_ad(UT_LIST_GET_LAST(space->chain) == node); ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE); ut_ad(node->space == space); - ut_ad(space->referenced() || space->is_being_truncated); + ut_ad(space->referenced()); *success = space->size >= size; @@ -649,8 +653,7 @@ fil_space_extend_must_retry( default: ut_ad(space->purpose == FIL_TYPE_TABLESPACE || space->purpose == FIL_TYPE_IMPORT); - if (space->purpose == FIL_TYPE_TABLESPACE - && !space->is_being_truncated) { + if (space->purpose == FIL_TYPE_TABLESPACE) { goto do_flush; } break; @@ -735,12 +738,10 @@ bool fil_space_extend(fil_space_t *space, uint32_t size) bool success= false; const bool acquired= space->acquire(); mysql_mutex_lock(&fil_system.mutex); - if (acquired || space->is_being_truncated) - { + if (acquired) while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain), size, &success)) mysql_mutex_lock(&fil_system.mutex); - } mysql_mutex_unlock(&fil_system.mutex); if (acquired) space->release(); @@ -1903,9 +1904,10 @@ fil_ibd_create( mtr.flag_wr_unlock(); log_write_up_to(lsn, true); - 
ulint type; static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); +#if defined _WIN32 || defined HAVE_FCNTL_DIRECT + ulint type; switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { case 1: case 2: @@ -1914,6 +1916,9 @@ fil_ibd_create( default: type = OS_DATA_FILE; } +#else + constexpr auto type = OS_DATA_FILE; +#endif file = os_file_create( innodb_data_file_key, path, @@ -2184,8 +2189,6 @@ func_exit: goto corrupted; } - os_file_get_last_error(operation_not_for_export, - !operation_not_for_export); if (!operation_not_for_export) { goto corrupted; } @@ -2448,21 +2451,15 @@ fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) mysql_mutex_unlock(&fil_system.mutex); if (space) { - /* Compare the filename we are trying to open with the - filename from the first node of the tablespace we opened - previously. Fail if it is different. */ - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - if (0 != strcmp(innobase_basename(filename), - innobase_basename(node->name))) { - ib::info() - << "Ignoring data file '" << filename - << "' with space ID " << space->id - << ". Another data file called " << node->name - << " exists with the same space ID."; - space = NULL; - return(FIL_LOAD_ID_CHANGED); - } - return(FIL_LOAD_OK); + sql_print_information("InnoDB: Ignoring data file '%s'" + " with space ID " ULINTPF + ". Another data file called %s" + " exists" + " with the same space ID.", + filename, space->id, + UT_LIST_GET_FIRST(space->chain)->name); + space = NULL; + return FIL_LOAD_ID_CHANGED; } if (srv_operation == SRV_OPERATION_RESTORE) { @@ -3027,11 +3024,9 @@ fil_space_validate_for_mtr_commit( ut_ad(!is_predefined_tablespace(space->id)); /* We are serving mtr_commit(). While there is an active - mini-transaction, we should have !space->stop_new_ops. This is + mini-transaction, we should have !space->is_stopping(). This is guaranteed by meta-data locks or transactional locks. 
*/ - ut_ad(!space->is_stopping() - || space->is_being_truncated /* fil_truncate_prepare() */ - || space->referenced()); + ut_ad(!space->is_stopping() || space->referenced()); } #endif /* UNIV_DEBUG */ @@ -3080,7 +3075,7 @@ ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write() and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. @param lsn checkpoint LSN @return current LSN */ -lsn_t fil_names_clear(lsn_t lsn) +ATTRIBUTE_COLD lsn_t fil_names_clear(lsn_t lsn) { mtr_t mtr; diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 16aea2a7..eaf4e04a 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -49,11 +49,6 @@ Updated 14/02/2015 #include "buf0lru.h" #include "ibuf0ibuf.h" #include "zlib.h" -#ifdef __linux__ -#include <linux/fs.h> -#include <sys/ioctl.h> -#include <fcntl.h> -#endif #include "row0mysql.h" #include "lz4.h" #include "lzo/lzo1x.h" diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc index cafff419..1c20efcd 100644 --- a/storage/innobase/fsp/fsp0file.cc +++ b/storage/innobase/fsp/fsp0file.cc @@ -435,12 +435,22 @@ Datafile::validate_for_recovery() return(err); } + if (!m_space_id) { + m_space_id = recv_sys.dblwr.find_first_page( + m_filepath, m_handle); + if (m_space_id) { + m_defer= false; + goto free_first_page; + } else return err; + } + if (!m_defer) { err = find_space_id(); if (err != DB_SUCCESS || m_space_id == 0) { - ib::error() << "Datafile '" << m_filepath - << "' is corrupted. Cannot determine " - "the space ID from the first 64 pages."; + sql_print_error( + "InnoDB: Datafile '%s' is corrupted." + " Cannot determine the space ID from" + " the first 64 pages.", m_filepath); return(err); } } @@ -453,7 +463,7 @@ Datafile::validate_for_recovery() m_space_id, m_filepath, m_handle)) { return m_defer ? err : DB_CORRUPTION; } - +free_first_page: /* Free the previously read first page and then re-validate. 
*/ free_first_page(); m_defer = false; @@ -492,11 +502,11 @@ err_exit: return DB_SUCCESS; } - ib::info() << error_txt << " in datafile: " << m_filepath - << ", Space ID:" << m_space_id << ", Flags: " - << m_flags; + sql_print_error("InnoDB: %s in datafile: %s, Space ID: " + UINT32PF ", " "Flags: " UINT32PF, + error_txt, m_filepath, m_space_id, m_flags); m_is_valid = false; - return(DB_CORRUPTION); + return DB_CORRUPTION; } /* Check if the whole page is blank. */ diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 6c5c354e..87672a82 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -42,8 +42,6 @@ Created 11/29/1995 Heikki Tuuri #include "fsp0types.h" #include "log.h" -typedef uint32_t page_no_t; - /** Returns the first extent descriptor for a segment. We think of the extent lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. @@ -332,7 +330,7 @@ xdes_t* xdes_get_descriptor_with_space_hdr( buf_block_t* header, const fil_space_t* space, - page_no_t offset, + uint32_t offset, mtr_t* mtr, dberr_t* err = nullptr, buf_block_t** desc_block = nullptr, @@ -396,7 +394,7 @@ try to add new extents to the space free list @param[out] err error code @param[out] xdes extent descriptor page @return the extent descriptor */ -static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset, +static xdes_t *xdes_get_descriptor(const fil_space_t *space, uint32_t offset, mtr_t *mtr, dberr_t *err= nullptr, buf_block_t **xdes= nullptr) { @@ -842,8 +840,7 @@ fsp_fill_free_list( if (i) { buf_block_t *f= buf_LRU_get_free_block(false); - buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i), - zip_size, mtr, f); + buf_block_t *block= buf_page_create(space, i, zip_size, mtr, f); if (UNIV_UNLIKELY(block != f)) buf_pool.free_block(f); fsp_init_file_page(space, block, mtr); @@ -855,9 +852,7 @@ fsp_fill_free_list( { buf_block_t *f= buf_LRU_get_free_block(false); 
buf_block_t *block= - buf_page_create(space, - static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET), - zip_size, mtr, f); + buf_page_create(space, i + FSP_IBUF_BITMAP_OFFSET, zip_size, mtr, f); if (UNIV_UNLIKELY(block != f)) buf_pool.free_block(f); fsp_init_file_page(space, block, mtr); @@ -1028,40 +1023,13 @@ fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr, @param[in] offset page number of the allocated page @param[in,out] mtr mini-transaction @return block, initialized */ -static -buf_block_t* -fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr) +static buf_block_t* fsp_page_create(fil_space_t *space, uint32_t offset, + mtr_t *mtr) { - buf_block_t *block, *free_block; - - if (UNIV_UNLIKELY(space->is_being_truncated)) - { - const page_id_t page_id{space->id, offset}; - buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); - mysql_mutex_lock(&buf_pool.mutex); - block= reinterpret_cast<buf_block_t*> - (buf_pool.page_hash.get(page_id, chain)); - if (block && block->page.oldest_modification() <= 1) - block= nullptr; - mysql_mutex_unlock(&buf_pool.mutex); - - if (block) - { - ut_ad(block->page.buf_fix_count() >= 1); - ut_ad(block->page.lock.x_lock_count() == 1); - ut_ad(mtr->have_x_latch(*block)); - free_block= block; - goto got_free_block; - } - } - - free_block= buf_LRU_get_free_block(false); -got_free_block: - block= buf_page_create(space, static_cast<uint32_t>(offset), - space->zip_size(), mtr, free_block); + buf_block_t *free_block= buf_LRU_get_free_block(false), + *block= buf_page_create(space, offset, space->zip_size(), mtr, free_block); if (UNIV_UNLIKELY(block != free_block)) buf_pool.free_block(free_block); - fsp_init_file_page(space, block, mtr); return block; } @@ -1179,7 +1147,7 @@ MY_ATTRIBUTE((nonnull, warn_unused_result)) @param[in] offset page number in the extent @param[in,out] mtr mini-transaction @return error code */ -static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset, 
+static dberr_t fsp_free_extent(fil_space_t* space, uint32_t offset, mtr_t* mtr) { ut_ad(space->is_owner()); @@ -1216,7 +1184,7 @@ The page is marked as free and clean. @param[in] offset page number @param[in,out] mtr mini-transaction @return error code */ -static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr) +static dberr_t fsp_free_page(fil_space_t *space, uint32_t offset, mtr_t *mtr) { xdes_t* descr; ulint frag_n_used; @@ -1756,7 +1724,6 @@ page_alloc: ut_d(const auto x = block->page.lock.x_lock_count()); ut_ad(x || block->page.lock.not_recursive()); - ut_ad(x == 1 || space->is_being_truncated); ut_ad(x <= 2); ut_ad(!fil_page_get_type(block->page.frame)); mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, @@ -2493,7 +2460,7 @@ fseg_free_page_low( fseg_inode_t* seg_inode, buf_block_t* iblock, fil_space_t* space, - page_no_t offset, + uint32_t offset, mtr_t* mtr #ifdef BTR_CUR_HASH_ADAPT ,bool ahi=false @@ -2859,7 +2826,7 @@ fseg_free_step( return true; } - page_no_t page_no = fseg_get_nth_frag_page_no(inode, n); + uint32_t page_no = fseg_get_nth_frag_page_no(inode, n); if (fseg_free_page_low(inode, iblock, space, page_no, mtr #ifdef BTR_CUR_HASH_ADAPT diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc index e4a43e48..4ac9da50 100644 --- a/storage/innobase/fsp/fsp0sysspace.cc +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -33,6 +33,7 @@ Refactored 2013-7-26 by Kevin Lewis #include "os0file.h" #include "row0mysql.h" #include "buf0dblwr.h" +#include "log.h" /** The server header file is included to access opt_initialize global variable. If server passes the option for create/open DB to SE, we should remove such @@ -568,7 +569,7 @@ inline dberr_t SysTablespace::read_lsn_and_check_flags() } err = it->read_first_page( - m_ignore_read_only ? 
false : srv_read_only_mode); + m_ignore_read_only && srv_read_only_mode); if (err != DB_SUCCESS) { return(err); @@ -582,47 +583,62 @@ inline dberr_t SysTablespace::read_lsn_and_check_flags() /* Check the contents of the first page of the first datafile. */ - for (int retry = 0; retry < 2; ++retry) { + err = it->validate_first_page(); - err = it->validate_first_page(); - - if (err != DB_SUCCESS - && (retry == 1 - || recv_sys.dblwr.restore_first_page( + if (err != DB_SUCCESS) { + if (recv_sys.dblwr.restore_first_page( it->m_space_id, it->m_filepath, - it->handle()))) { - + it->handle())) { it->close(); - return(err); } + err = it->read_first_page( + m_ignore_read_only && srv_read_only_mode); } /* Make sure the tablespace space ID matches the space ID on the first page of the first datafile. */ - if (space_id() != it->m_space_id) { - - ib::error() - << "The data file '" << it->filepath() - << "' has the wrong space ID. It should be " - << space_id() << ", but " << it->m_space_id - << " was found"; - + if (err != DB_SUCCESS || space_id() != it->m_space_id) { + sql_print_error("InnoDB: The data file '%s'" + " has the wrong space ID." + " It should be " UINT32PF ", but " UINT32PF + " was found", it->filepath(), + space_id(), it->m_space_id); it->close(); - - return(err); + return err; } - if (srv_operation == SRV_OPERATION_NORMAL) { + if (srv_force_recovery != 6 + && srv_operation == SRV_OPERATION_NORMAL + && !log_sys.next_checkpoint_lsn + && log_sys.format == log_t::FORMAT_3_23) { + + log_sys.latch.wr_lock(SRW_LOCK_CALL); /* Prepare for possible upgrade from 0-sized ib_logfile0. */ - ut_ad(!log_sys.next_checkpoint_lsn); log_sys.next_checkpoint_lsn = mach_read_from_8( it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/); + if (log_sys.next_checkpoint_lsn < 8204) { + /* Before MDEV-14425, InnoDB had a minimum LSN + of 8192+12=8204. Likewise, mariadb-backup + --prepare would create an empty ib_logfile0 + after applying the log. 
We will allow an + upgrade from such an empty log. */ + sql_print_error("InnoDB: ib_logfile0 is " + "empty, and LSN is unknown."); + err = DB_CORRUPTION; + } else { + log_sys.last_checkpoint_lsn = + recv_sys.lsn = recv_sys.file_checkpoint = + log_sys.next_checkpoint_lsn; + log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); + log_sys.next_checkpoint_no = 0; + } + + log_sys.latch.wr_unlock(); } it->close(); - - return(DB_SUCCESS); + return err; } /** Check if a file can be opened in the correct mode. diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 8ca8681b..0df9a7de 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -304,6 +304,8 @@ rtr_pcur_getnext_from_path( break; } + buf_page_make_young_if_needed(&block->page); + page = buf_block_get_frame(block); page_ssn = page_get_ssn_id(page); @@ -683,6 +685,8 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, return err; } + buf_page_make_young_if_needed(&block->page); + const page_t *page= buf_block_get_frame(block); #ifdef UNIV_ZIP_DEBUG if (rw_latch != RW_NO_LATCH) { @@ -1703,6 +1707,8 @@ corrupted: goto func_exit; } + buf_page_make_young_if_needed(&page_cursor->block->page); + /* Get the page SSN */ page = buf_block_get_frame(page_cursor->block); page_ssn = page_get_ssn_id(page); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 21bf10a1..407834f2 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1575,7 +1575,8 @@ static void innodb_drop_database(handlerton*, char *path) ibuf_delete_for_discarded_space(id); /* Any changes must be persisted before we return. 
*/ - log_write_up_to(mtr.commit_lsn(), true); + if (mtr.commit_lsn()) + log_write_up_to(mtr.commit_lsn(), true); } my_free(namebuf); @@ -2080,7 +2081,7 @@ all_fail: ut_d(purge_sys.resume_FTS()); } -static void innodb_ddl_recovery_done(handlerton*) +static int innodb_ddl_recovery_done(handlerton*) { ut_ad(!ddl_recovery_done); ut_d(ddl_recovery_done= true); @@ -2091,6 +2092,7 @@ static void innodb_ddl_recovery_done(handlerton*) drop_garbage_tables_after_restore(); srv_init_purge_tasks(); } + return 0; } /********************************************************************//** @@ -4001,7 +4003,7 @@ static int innodb_init_params() data_mysql_default_charset_coll = (ulint) default_charset_info->number; -#ifndef _WIN32 +#ifdef HAVE_FCNTL_DIRECT if (srv_use_atomic_writes && my_may_have_atomic_write) { /* Force O_DIRECT on Unixes (on Windows writes are always @@ -4026,11 +4028,6 @@ static int innodb_init_params() } #endif - if (srv_read_only_mode) { - ib::info() << "Started in read only mode"; - srv_use_doublewrite_buf = FALSE; - } - #if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32 /* Currently native AIO is supported only on windows and linux and that also when the support is compiled in. In all other @@ -4046,9 +4043,7 @@ static int innodb_init_params() } #endif -#ifndef _WIN32 - ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); -#else +#ifdef _WIN32 switch (srv_file_flush_method) { case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC; @@ -4059,6 +4054,8 @@ static int innodb_init_params() default: ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); } +#else + ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); #endif innodb_buffer_pool_size_init(); @@ -7817,20 +7814,6 @@ ha_innobase::write_row( #endif if ((error_result = update_auto_increment())) { - /* We don't want to mask autoinc overflow errors. 
*/ - - /* Handle the case where the AUTOINC sub-system - failed during initialization. */ - if (m_prebuilt->autoinc_error == DB_UNSUPPORTED) { - error_result = ER_AUTOINC_READ_FAILED; - /* Set the error message to report too. */ - my_error(ER_AUTOINC_READ_FAILED, MYF(0)); - goto func_exit; - } else if (m_prebuilt->autoinc_error != DB_SUCCESS) { - error = m_prebuilt->autoinc_error; - goto report_error; - } - /* MySQL errors are passed straight back. */ goto func_exit; } @@ -7968,7 +7951,6 @@ set_max_autoinc: } } -report_error: /* Cleanup and exit. */ if (error == DB_TABLESPACE_DELETED) { ib_senderrf( @@ -11809,8 +11791,6 @@ index_bad: /* Set the flags2 when create table or alter tables */ m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - m_flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); DBUG_RETURN(true); } @@ -14710,12 +14690,7 @@ ha_innobase::info_low( DBUG_ASSERT(ib_table->get_ref_count() > 0); if (!ib_table->is_readable()) { - ib_table->stats_mutex_lock(); - ib_table->stat_initialized = true; - ib_table->stat_n_rows = 0; - ib_table->stat_clustered_index_size = 0; - ib_table->stat_sum_of_other_index_sizes = 0; - ib_table->stats_mutex_unlock(); + dict_stats_empty_table(ib_table, true); } if (flag & HA_STATUS_TIME) { @@ -15674,15 +15649,17 @@ ha_innobase::extra( { /* Warning: since it is not sure that MariaDB calls external_lock() before calling this function, m_prebuilt->trx can be obsolete! 
*/ - trx_t* trx = check_trx_exists(ha_thd()); + trx_t* trx; switch (operation) { case HA_EXTRA_FLUSH: + (void)check_trx_exists(ha_thd()); if (m_prebuilt->blob_heap) { row_mysql_prebuilt_free_blob_heap(m_prebuilt); } break; case HA_EXTRA_RESET_STATE: + trx = check_trx_exists(ha_thd()); reset_template(); trx->duplicates = 0; stmt_boundary: @@ -15691,18 +15668,23 @@ ha_innobase::extra( trx->bulk_insert = false; break; case HA_EXTRA_NO_KEYREAD: + (void)check_trx_exists(ha_thd()); m_prebuilt->read_just_key = 0; break; case HA_EXTRA_KEYREAD: + (void)check_trx_exists(ha_thd()); m_prebuilt->read_just_key = 1; break; case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + (void)check_trx_exists(ha_thd()); m_prebuilt->keep_other_fields_on_keyread = 1; break; case HA_EXTRA_INSERT_WITH_UPDATE: + trx = check_trx_exists(ha_thd()); trx->duplicates |= TRX_DUP_IGNORE; goto stmt_boundary; case HA_EXTRA_NO_IGNORE_DUP_KEY: + trx = check_trx_exists(ha_thd()); trx->duplicates &= ~TRX_DUP_IGNORE; if (trx->is_bulk_insert()) { /* Allow a subsequent INSERT into an empty table @@ -15714,9 +15696,11 @@ ha_innobase::extra( } goto stmt_boundary; case HA_EXTRA_WRITE_CAN_REPLACE: + trx = check_trx_exists(ha_thd()); trx->duplicates |= TRX_DUP_REPLACE; goto stmt_boundary; case HA_EXTRA_WRITE_CANNOT_REPLACE: + trx = check_trx_exists(ha_thd()); trx->duplicates &= ~TRX_DUP_REPLACE; if (trx->is_bulk_insert()) { /* Allow a subsequent INSERT into an empty table @@ -15725,6 +15709,7 @@ ha_innobase::extra( } goto stmt_boundary; case HA_EXTRA_BEGIN_ALTER_COPY: + trx = check_trx_exists(ha_thd()); m_prebuilt->table->skip_alter_undo = 1; if (m_prebuilt->table->is_temporary() || !m_prebuilt->table->versioned_by_id()) { @@ -15737,8 +15722,10 @@ ha_innobase::extra( .first->second.set_versioned(0); break; case HA_EXTRA_END_ALTER_COPY: + trx = check_trx_exists(ha_thd()); m_prebuilt->table->skip_alter_undo = 0; - if (!m_prebuilt->table->is_temporary()) { + if (!m_prebuilt->table->is_temporary() + && !high_level_read_only) { 
log_buffer_flush_to_disk(); } break; @@ -18270,11 +18257,18 @@ static void buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save) { - if (*(my_bool*) save) { - mysql_mutex_unlock(&LOCK_global_system_variables); - buf_flush_sync(); - mysql_mutex_lock(&LOCK_global_system_variables); - } + if (!*(my_bool*) save) + return; + const uint s= srv_fil_make_page_dirty_debug; + mysql_mutex_unlock(&LOCK_global_system_variables); + if (s) + buf_flush_sync(); + else + { + while (buf_flush_list_space(fil_system.sys_space, nullptr)); + os_aio_wait_until_no_pending_writes(true); + } + mysql_mutex_lock(&LOCK_global_system_variables); } /** Override current MERGE_THRESHOLD setting for all indexes at dictionary @@ -19368,8 +19362,10 @@ static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size, 10 << 20, 10 << 20, 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0); +static ulong innodb_purge_rseg_truncate_frequency; + static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency, - srv_purge_rseg_truncate_frequency, + innodb_purge_rseg_truncate_frequency, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, "Deprecated parameter with no effect", NULL, NULL, 128, 1, 128, 0); @@ -20606,6 +20602,10 @@ Compare_keys ha_innobase::compare_key_parts( if (old_part.length >= new_part.length) return Compare_keys::NotEqual; + if (old_part.length == old_field.key_length() && + new_part.length != new_field.length) + return Compare_keys::NotEqual; + return Compare_keys::EqualButKeyPartLength; } diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 40370ac5..1401136f 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -2317,12 +2317,16 @@ innodb_instant_alter_column_allowed_reason: } } + bool need_rebuild = false; + switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) { case ALTER_OPTIONS: - if (alter_options_need_rebuild(ha_alter_info, table)) { + if ((srv_file_per_table && 
!m_prebuilt->table->space_id) + || alter_options_need_rebuild(ha_alter_info, table)) { reason_rebuild = my_get_err_msg( ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD); ha_alter_info->unsupported_reason = reason_rebuild; + need_rebuild= true; break; } /* fall through */ @@ -2434,7 +2438,7 @@ innodb_instant_alter_column_allowed_reason: /* We should be able to do the operation in-place. See if we can do it online (LOCK=NONE) or without rebuild. */ - bool online = true, need_rebuild = false; + bool online = true; const uint fulltext_indexes = innobase_fulltext_exist(altered_table); /* Fix the key parts. */ @@ -4338,7 +4342,8 @@ static void unlock_and_close_files(const std::vector<pfs_os_file_t> &deleted, row_mysql_unlock_data_dictionary(trx); for (pfs_os_file_t d : deleted) os_file_close(d); - log_write_up_to(trx->commit_lsn, true); + if (trx->commit_lsn) + log_write_up_to(trx->commit_lsn, true); } /** Commit a DDL transaction and unlink any deleted files. */ @@ -4681,11 +4686,13 @@ innobase_build_col_map( col_map[old_i - num_old_v] = i; if (!old_table->versioned() || !altered_table->versioned()) { - } else if (old_i == old_table->vers_start) { - new_table->vers_start = (i + num_v) + } else if (old_i - num_old_v == old_table->vers_start) { + ut_ad(field->vers_sys_start()); + new_table->vers_start = i & dict_index_t::MAX_N_FIELDS; - } else if (old_i == old_table->vers_end) { - new_table->vers_end = (i + num_v) + } else if (old_i - num_old_v == old_table->vers_end) { + ut_ad(field->vers_sys_end()); + new_table->vers_end = i & dict_index_t::MAX_N_FIELDS; } goto found_col; @@ -6217,24 +6224,20 @@ empty_table: /* Convert the table to the instant ALTER TABLE format. 
*/ mtr.commit(); mtr.start(); - index->set_modified(mtr); - if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, + if (buf_block_t* root = btr_root_block_get(index, RW_S_LATCH, &mtr, &err)) { if (fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX) { DBUG_ASSERT("wrong page type" == 0); err = DB_CORRUPTION; goto func_exit; } - - btr_set_instant(root, *index, &mtr); - mtr.commit(); - mtr.start(); - index->set_modified(mtr); - err = row_ins_clust_index_entry_low( - BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, - index->n_uniq, entry, 0, thr); } + mtr.commit(); + mtr.start(); + err = row_ins_clust_index_entry_low( + BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, + index->n_uniq, entry, 0, thr); goto func_exit; } @@ -7775,6 +7778,7 @@ bool check_col_is_in_fk_indexes( for (const auto &a : add_fk) { + if (!a->foreign_index) continue; for (ulint i= 0; i < a->n_fields; i++) { if (a->foreign_index->fields[i].col == col) @@ -11666,7 +11670,6 @@ foreign_fail: } unlock_and_close_files(deleted, trx); - log_write_up_to(trx->commit_lsn, true); DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); @@ -11723,7 +11726,6 @@ foreign_fail: } unlock_and_close_files(deleted, trx); - log_write_up_to(trx->commit_lsn, true); DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index b00308d7..711144e3 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -4539,6 +4539,15 @@ i_s_dict_fill_sys_tables( DBUG_RETURN(0); } +/** Handle the error for information schema query +@param err error value +@param thd thread +@return 0 if query is interrupted or error */ +static int i_s_sys_error_handling(int err, THD *thd) +{ + return thd_kill_level(thd) ? 0 : err; +} + /** Convert one SYS_TABLES record to dict_table_t. 
@param pcur persistent cursor position on SYS_TABLES record @param mtr mini-transaction (nullptr=use the dict_sys cache) @@ -4587,6 +4596,7 @@ i_s_sys_tables_fill_table( { btr_pcur_t pcur; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_tables_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -4616,8 +4626,15 @@ i_s_sys_tables_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_tables(thd, table_rec, - tables->table); + err = i_s_dict_fill_sys_tables( + thd, table_rec, tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + if (table_rec) { + dict_mem_table_free(table_rec); + } + goto func_exit; + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -4635,8 +4652,10 @@ i_s_sys_tables_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** @@ -4807,6 +4826,7 @@ i_s_sys_tables_fill_table_stats( btr_pcur_t pcur; const rec_t* rec; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_tables_fill_table_stats"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -4832,8 +4852,12 @@ i_s_sys_tables_fill_table_stats( &table_rec); if (UNIV_LIKELY(!err_msg)) { - i_s_dict_fill_sys_tablestats(thd, table_rec, + err = i_s_dict_fill_sys_tablestats(thd, table_rec, tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } } else { ut_ad(!table_rec); dict_sys.unlock(); @@ -4851,8 +4875,9 @@ i_s_sys_tables_fill_table_stats( mtr.commit(); dict_sys.unlock(); - - DBUG_RETURN(0); +func_exit: + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); } /*******************************************************************//** @@ -5024,6 +5049,7 @@ i_s_sys_indexes_fill_table( const rec_t* rec; mem_heap_t* heap; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_indexes_fill_table"); 
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -5059,11 +5085,13 @@ i_s_sys_indexes_fill_table( dict_sys.unlock(); if (!err_msg) { - if (int err = i_s_dict_fill_sys_indexes( - thd, table_id, space_id, &index_rec, - tables->table)) { - mem_heap_free(heap); - DBUG_RETURN(err); + err = i_s_dict_fill_sys_indexes( + thd, table_id, space_id, + &index_rec, + tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -5081,9 +5109,11 @@ i_s_sys_indexes_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: mem_heap_free(heap); + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes @@ -5232,6 +5262,7 @@ i_s_sys_columns_fill_table( const char* col_name; mem_heap_t* heap; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_columns_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -5263,9 +5294,14 @@ i_s_sys_columns_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_columns(thd, table_id, col_name, - &column_rec, nth_v_col, - tables->table); + err = i_s_dict_fill_sys_columns( + thd, table_id, col_name, + &column_rec, nth_v_col, + tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -5282,9 +5318,11 @@ i_s_sys_columns_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: mem_heap_free(heap); + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** @@ -5416,6 +5454,7 @@ i_s_sys_virtual_fill_table( ulint pos; ulint base_pos; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_virtual_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ 
-5444,8 +5483,13 @@ i_s_sys_virtual_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos, - tables->table); + err = i_s_dict_fill_sys_virtual( + thd, table_id, pos, base_pos, + tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -5462,6 +5506,9 @@ i_s_sys_virtual_fill_table( dict_sys.unlock(); DBUG_RETURN(0); +func_exit: + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); } /** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_virtual @@ -5589,6 +5636,7 @@ i_s_sys_fields_fill_table( mem_heap_t* heap; index_id_t last_id; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_fields_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -5624,8 +5672,13 @@ i_s_sys_fields_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_fields(thd, index_id, &field_rec, - pos, tables->table); + err = i_s_dict_fill_sys_fields( + thd, index_id, &field_rec, + pos, tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } last_id = index_id; } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -5643,9 +5696,11 @@ i_s_sys_fields_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: mem_heap_free(heap); + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields @@ -5782,6 +5837,7 @@ i_s_sys_foreign_fill_table( const rec_t* rec; mem_heap_t* heap; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_foreign_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -5809,8 +5865,12 @@ i_s_sys_foreign_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_foreign(thd, &foreign_rec, - tables->table); + err = i_s_dict_fill_sys_foreign( + thd, &foreign_rec, 
tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -5827,9 +5887,11 @@ i_s_sys_foreign_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: mem_heap_free(heap); + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** @@ -5963,6 +6025,7 @@ i_s_sys_foreign_cols_fill_table( const rec_t* rec; mem_heap_t* heap; mtr_t mtr; + int err = 0; DBUG_ENTER("i_s_sys_foreign_cols_fill_table"); RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); @@ -5994,9 +6057,13 @@ i_s_sys_foreign_cols_fill_table( dict_sys.unlock(); if (!err_msg) { - i_s_dict_fill_sys_foreign_cols( - thd, name, for_col_name, ref_col_name, pos, - tables->table); + err = i_s_dict_fill_sys_foreign_cols( + thd, name, for_col_name, + ref_col_name, pos, tables->table); + if (err) { + err = i_s_sys_error_handling(err, thd); + goto func_exit; + } } else { push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_CANT_FIND_SYSTEM_REC, "%s", @@ -6013,9 +6080,11 @@ i_s_sys_foreign_cols_fill_table( mtr.commit(); dict_sys.unlock(); +func_exit: mem_heap_free(heap); + ut_free(pcur.old_rec_buf); - DBUG_RETURN(0); + DBUG_RETURN(err); } /*******************************************************************//** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols @@ -6218,6 +6287,8 @@ static int i_s_sys_tablespaces_fill_table(THD *thd, TABLE_LIST *tables, Item*) mysql_mutex_unlock(&fil_system.mutex); if (err == DB_SUCCESS) err= i_s_sys_tablespaces_fill(thd, *fil_system.temp_space, tables->table); + else + err = i_s_sys_error_handling(err, thd); DBUG_RETURN(err); } diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index b9e94a67..4ec07b81 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -309,8 +309,13 @@ 
ibuf_header_page_get( buf_block_t* block = buf_page_get( page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO), 0, RW_X_LATCH, mtr); + if (UNIV_UNLIKELY(!block)) { + return nullptr; + } + + buf_page_make_young_if_needed(&block->page); - return block ? block->page.frame : nullptr; + return block->page.frame; } /** Acquire the change buffer root page. @@ -326,7 +331,12 @@ static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr) buf_block_t *block= buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO}, 0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err); - ut_ad(!block || ibuf.empty == page_is_empty(block->page.frame)); + if (block) + { + ut_ad(ibuf.empty == page_is_empty(block->page.frame)); + buf_page_make_young_if_needed(&block->page); + } + return block; } @@ -408,7 +418,8 @@ err_exit: + header_page->page.frame, &ibuf.seg_size, &mtr); do { - DBUG_EXECUTE_IF("intermittent_read_failure", continue;); + IF_DBUG(if (_db_keyword_(nullptr, "intermittent_read_failure", + 1)) continue,); ut_ad(ibuf.seg_size >= 2); } while (0); diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 5a0401fa..b42c543c 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -89,10 +89,12 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index); @param[in] merge whether change buffer merge should be attempted @param[in,out] mtr mini-transaction @param[out] err error code +@param[out] first set if this is a first-time access to the page @return block */ buf_block_t *btr_block_get(const dict_index_t &index, uint32_t page, rw_lock_type_t mode, bool merge, - mtr_t *mtr, dberr_t *err= nullptr); + mtr_t *mtr, dberr_t *err= nullptr, + bool *first= nullptr); /**************************************************************//** Gets the index id field of a page. 
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 332b2039..cd7cc294 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -262,8 +262,6 @@ buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr, buf_block_t *free_block); -/** Move a block to the start of the LRU list. */ -void buf_page_make_young(buf_page_t *bpage); /** Mark the page status as FREED for the given tablespace and page number. @param[in,out] space tablespace @param[in] page page number @@ -285,15 +283,6 @@ there is danger of dropping from the buffer pool. @return true if bpage should be made younger */ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage); -/** Move a page to the start of the buffer pool LRU list if it is too old. -@param[in,out] bpage buffer pool page */ -inline void buf_page_make_young_if_needed(buf_page_t *bpage) -{ - if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) { - buf_page_make_young(bpage); - } -} - /********************************************************************//** Increments the modify clock of a frame by 1. The caller must (1) own the buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock @@ -656,12 +645,9 @@ public: access_time= 0; } - void set_os_unused() + void set_os_unused() const { MEM_NOACCESS(frame, srv_page_size); -#ifdef MADV_FREE - madvise(frame, srv_page_size, MADV_FREE); -#endif } void set_os_used() const @@ -1301,6 +1287,11 @@ public: /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. 
*/ inline void resize(); +#ifdef __linux__ + /** Collect garbage (release pages from the LRU list) */ + inline void garbage_collect(); +#endif + /** @return whether resize() is in progress */ bool resize_in_progress() const { @@ -1507,10 +1498,8 @@ public: n_chunks_new / 4 * chunks->size; } - /** @return whether the buffer pool has run out */ - TPOOL_SUPPRESS_TSAN - bool ran_out() const - { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); } + /** @return whether the buffer pool is running low */ + bool need_LRU_eviction() const; /** @return whether the buffer pool is shrinking */ inline bool is_shrinking() const @@ -1836,6 +1825,9 @@ public: Set whenever the free list grows, along with a broadcast of done_free. Protected by buf_pool.mutex. */ Atomic_relaxed<bool> try_LRU_scan; + /** Whether we have warned to be running out of buffer pool */ + std::atomic_flag LRU_warned; + /* @} */ /** @name LRU replacement algorithm fields */ @@ -1898,7 +1890,8 @@ public: a delete-buffering operation is pending. Protected by mutex. */ buf_page_t watch[innodb_purge_threads_MAX + 1]; /** Reserve a buffer. */ - buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); } + buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads) + { return io_buf.reserve(wait_for_reads); } /** Remove a block from flush_list. @param bpage buffer pool page */ @@ -1933,7 +1926,7 @@ private: void close(); /** Reserve a buffer */ - buf_tmp_buffer_t *reserve(); + buf_tmp_buffer_t *reserve(bool wait_for_reads); } io_buf; /** whether resize() is in the critical path */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 9932b0e5..6e7662d9 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -105,7 +105,8 @@ public: If we are upgrading from a version before MySQL 4.1, then this function performs the necessary update operations to support innodb_file_per_table. 
If we are in a crash recovery, this function - loads the pages from double write buffer into memory. + loads the pages from double write buffer which are not older than + the checkpoint into memory. @param file File handle @param path Path name of file @return DB_SUCCESS or error code */ diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index aec08e77..28410276 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -108,6 +108,16 @@ buf_LRU_add_block( blocks in the LRU list, else put to the start; if the LRU list is very short, added to the start regardless of this parameter */ + +/** Move a block to the start of the buf_pool.LRU list. +@param bpage buffer pool page */ +void buf_page_make_young(buf_page_t *bpage); +/** Flag a page accessed in buf_pool and move it to the start of buf_pool.LRU +if it is too old. +@param bpage buffer pool page +@return whether this is not the first access */ +bool buf_page_make_young_if_needed(buf_page_t *bpage); + /******************************************************************//** Adds a block to the LRU list of decompressed zip pages. */ void diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h index f7d33d5b..3143aafd 100644 --- a/storage/innobase/include/dict0load.h +++ b/storage/innobase/include/dict0load.h @@ -35,22 +35,16 @@ Created 4/24/1996 Heikki Tuuri #include "btr0types.h" #include <deque> +#include <set> /** A stack of table names related through foreign key constraints */ typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t; -/** Check each tablespace found in the data dictionary. -Then look at each table defined in SYS_TABLES that has a space_id > 0 -to find all the file-per-table tablespaces. +/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system. +Open each data file if an encryption plugin has been loaded. 
-In a crash recovery we already have some tablespace objects created from -processing the REDO log. We will compare the -space_id information in the data dictionary to what we find in the -tablespace file. In addition, more validation will be done if recovery -was needed and force_recovery is not set. - -We also scan the biggest space id, and store it to fil_system. */ -void dict_check_tablespaces_and_store_max_id(); +@param spaces set of tablespace files to open */ +void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces); /** Make sure the data_file_name is saved in dict_table_t if needed. @param[in,out] table Table object */ diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 0dc1b984..3b006daf 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -235,4 +235,13 @@ dict_stats_report_error(dict_table_t* table, bool defragment = false) void test_dict_stats_all(); #endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ +/** Write all zeros (or 1 where it makes sense) into a table +and its indexes'statistics members. The resulting stats +correspond to an empty table. 
+@param table table stats to be emptied +@param empty_defrag_stats empty the defrag stats */ +void +dict_stats_empty_table( + dict_table_t* table, + bool empty_defrag_stats); #endif /* dict0stats_h */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 6f58e3c1..cdc32515 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -359,8 +359,6 @@ struct fil_space_t final lsn_t max_lsn; /** tablespace identifier */ uint32_t id; - /** whether undo tablespace truncation is in progress */ - bool is_being_truncated; fil_type_t purpose;/*!< purpose */ UT_LIST_BASE_NODE_T(fil_node_t) chain; /*!< base node for the file chain */ @@ -440,6 +438,8 @@ private: /** LSN of freeing last page; protected by freed_range_mutex */ lsn_t last_freed_lsn; + /** LSN of undo tablespace creation or 0; protected by latch */ + lsn_t create_lsn; public: /** @return whether doublewrite buffering is needed */ inline bool use_doublewrite() const; @@ -447,6 +447,12 @@ public: /** @return whether a page has been freed */ inline bool is_freed(uint32_t page); + /** Set create_lsn. */ + inline void set_create_lsn(lsn_t lsn); + + /** @return the latest tablespace rebuild LSN, or 0 */ + lsn_t get_create_lsn() const { return create_lsn; } + /** Apply freed_ranges to the file. @param writable whether the file is writable @return number of pages written or hole-punched */ @@ -524,9 +530,6 @@ public: /** Note that operations on the tablespace must stop. 
*/ inline void set_stopping(); - /** Note that operations on the tablespace can resume after truncation */ - inline void clear_stopping(); - /** Drop the tablespace and wait for any pending operations to cease @param id tablespace identifier @param detached_handle pointer to file to be closed later, or nullptr @@ -1555,14 +1558,6 @@ inline void fil_space_t::set_stopping() #endif } -inline void fil_space_t::clear_stopping() -{ - mysql_mutex_assert_owner(&fil_system.mutex); - static_assert(STOPPING_WRITES == 1U << 30, "compatibility"); - ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed); - ut_ad((n & STOPPING) == STOPPING_WRITES); -} - /** Flush pending writes from the file system cache to the file. */ template<bool have_reference> inline void fil_space_t::flush() { @@ -1802,7 +1797,7 @@ bool fil_comp_algo_loaded(ulint comp_algo); and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. @param lsn checkpoint LSN @return current LSN */ -lsn_t fil_names_clear(lsn_t lsn); +ATTRIBUTE_COLD lsn_t fil_names_clear(lsn_t lsn); #ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH void test_make_filepath(); diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl index da14cfcb..3cb09c92 100644 --- a/storage/innobase/include/fts0priv.inl +++ b/storage/innobase/include/fts0priv.inl @@ -34,29 +34,6 @@ fts_write_object_id( ib_id_t id, /* in: a table/index id */ char* str) /* in: buffer to write the id to */ { - -#ifdef _WIN32 - - DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name", - return(sprintf(str, UINT64PFx, id));); - - /* Use this to construct old(5.6.14 and 5.7.3) windows - ambiguous aux table names */ - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - return(sprintf(str, "%016llu", (ulonglong) id));); - -#else /* _WIN32 */ - - /* Use this to construct old(5.6.14 and 5.7.3) windows - ambiguous aux table names */ - DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name", - 
return(sprintf(str, "%016llu", (ulonglong) id));); - - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - return(sprintf(str, "%016llx", (ulonglong) id));); - -#endif /* _WIN32 */ - return(sprintf(str, "%016llx", (ulonglong) id)); } diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index f873eabf..54851ca0 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -79,13 +79,6 @@ ATTRIBUTE_COLD void log_make_checkpoint(); /** Make a checkpoint at the latest lsn on shutdown. */ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); -/** -Checks that there is enough free space in the log to start a new query step. -Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this -function may only be called if the calling thread owns no synchronization -objects! */ -ATTRIBUTE_COLD void log_check_margins(); - /******************************************************//** Prints info of the log. */ void @@ -179,24 +172,33 @@ private: std::atomic<lsn_t> flushed_to_disk_lsn; /** log sequence number when log resizing was initiated, or 0 */ std::atomic<lsn_t> resize_lsn; - /** set when there may be need to flush the log buffer, or - preflush buffer pool pages, or initiate a log checkpoint. + /** set when there may be need to initiate a log checkpoint. This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. 
*/ - std::atomic<bool> check_flush_or_checkpoint_; - + std::atomic<bool> need_checkpoint; #if defined(__aarch64__) -/* On ARM, we do more spinning */ -typedef srw_spin_lock log_rwlock_t; -#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST + /* On ARM, we do more spinning */ + typedef srw_spin_lock log_rwlock; + typedef pthread_mutex_wrapper<true> log_lsn_lock; #else -typedef srw_lock log_rwlock_t; -#define LSN_LOCK_ATTR nullptr + typedef srw_lock log_rwlock; + typedef srw_mutex log_lsn_lock; #endif public: - /** rw-lock protecting buf */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch; + /** rw-lock protecting writes to buf; normal mtr_t::commit() + outside any log checkpoint is covered by a shared latch */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch; +private: + /** mutex protecting buf_free et al, together with latch */ + log_lsn_lock lsn_lock; +public: + /** first free offset within buf use; protected by lsn_lock */ + Atomic_relaxed<size_t> buf_free; + /** number of write requests (to buf); protected by lsn_lock */ + size_t write_to_buf; + /** number of append_prepare_wait(); protected by lsn_lock */ + size_t waits; private: /** Last written LSN */ lsn_t write_lsn; @@ -227,20 +229,12 @@ private: /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; - /** spin lock protecting lsn, buf_free in append_prepare() */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock; - void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); } - void lock_lsn() { pthread_mutex_lock(&lsn_lock); } - void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); } - void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); } + void init_lsn_lock() {lsn_lock.init(); } + void lock_lsn() { lsn_lock.wr_lock(); } + void unlock_lsn() {lsn_lock.wr_unlock(); } + void destroy_lsn_lock() { lsn_lock.destroy(); } public: - /** first free offset within buf use; protected by lsn_lock */ - Atomic_relaxed<size_t> buf_free; - /** number of write 
requests (to buf); protected by exclusive lsn_lock */ - ulint write_to_buf; - /** number of waits in append_prepare(); protected by lsn_lock */ - ulint waits; /** recommended maximum size of buf, after which the buffer is flushed */ size_t max_buf_free; @@ -308,6 +302,9 @@ public: bool is_opened() const noexcept { return log.is_opened(); } + /** @return target write LSN to react on buf_free >= max_buf_free */ + inline lsn_t get_write_target() const; + /** @return LSN at which log resizing was started and is still in progress @retval 0 if no log resizing is in progress */ lsn_t resize_in_progress() const noexcept @@ -419,13 +416,14 @@ public: inline void persist(lsn_t lsn) noexcept; #endif - bool check_flush_or_checkpoint() const + bool check_for_checkpoint() const + { + return UNIV_UNLIKELY(need_checkpoint.load(std::memory_order_relaxed)); + } + void set_check_for_checkpoint(bool need= true) { - return UNIV_UNLIKELY - (check_flush_or_checkpoint_.load(std::memory_order_relaxed)); + need_checkpoint.store(need, std::memory_order_relaxed); } - void set_check_flush_or_checkpoint(bool flag= true) - { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); } /** Make previous write_buf() durable and update flushed_to_disk_lsn. */ bool flush(lsn_t lsn) noexcept; @@ -446,8 +444,9 @@ public: private: /** Wait in append_prepare() for buffer to become available + @param lsn log sequence number to write up to @param ex whether log_sys.latch is exclusively locked */ - ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept; + ATTRIBUTE_COLD void append_prepare_wait(lsn_t lsn, bool ex) noexcept; public: /** Reserve space in the log buffer for appending data. 
@tparam pmem log_sys.is_pmem() diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 6d75e15a..a73b7279 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -44,6 +44,11 @@ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); +/** Read the latest checkpoint information from log file +and store it in log_sys.next_checkpoint and recv_sys.file_checkpoint +@return error code or DB_SUCCESS */ +dberr_t recv_recovery_read_checkpoint(); + /** Start recovering from a redo log checkpoint. of first system tablespace page @return error code or DB_SUCCESS */ @@ -114,7 +119,19 @@ struct recv_dblwr_t @param name tablespace filepath @param file tablespace file handle @return whether the operation failed */ - bool restore_first_page(uint32_t space_id, const char *name, os_file_t file); + bool restore_first_page(uint32_t space_id, const char *name, + pfs_os_file_t file); + + /** Restore the first page of the given tablespace from + doublewrite buffer. + 1) Find the page which has page_no as 0 + 2) Read first 3 pages from tablespace file + 3) Compare the space_ids from the pages with page0 which + was retrieved from doublewrite buffer + @param name tablespace filepath + @param file tablespace file handle + @return space_id or 0 in case of error */ + uint32_t find_first_page(const char *name, pfs_os_file_t file); typedef std::deque<byte*, ut_allocator<byte*> > list; diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 841cfab1..c916edc9 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -89,8 +89,9 @@ struct mtr_t { { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); } /** Commit a mini-transaction that is shrinking a tablespace. 
- @param space tablespace that is being shrunk */ - ATTRIBUTE_COLD void commit_shrink(fil_space_t &space); + @param space tablespace that is being shrunk + @param size new size in pages */ + ATTRIBUTE_COLD void commit_shrink(fil_space_t &space, uint32_t size); /** Commit a mini-transaction that is deleting or renaming a file. @param space tablespace that is being renamed or deleted @@ -105,7 +106,7 @@ struct mtr_t { This is to be used at log_checkpoint(). @param checkpoint_lsn the log sequence number of a checkpoint, or 0 @return current LSN */ - lsn_t commit_files(lsn_t checkpoint_lsn= 0); + ATTRIBUTE_COLD lsn_t commit_files(lsn_t checkpoint_lsn= 0); /** @return mini-transaction savepoint (current size of m_memo) */ ulint get_savepoint() const diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index c9db6a1f..c8374515 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -142,9 +142,11 @@ static const ulint OS_FILE_NORMAL = 62; /* @} */ /** Types for file create @{ */ -static const ulint OS_DATA_FILE = 100; -static const ulint OS_LOG_FILE = 101; -static const ulint OS_DATA_FILE_NO_O_DIRECT = 103; +static constexpr ulint OS_DATA_FILE = 100; +static constexpr ulint OS_LOG_FILE = 101; +#if defined _WIN32 || defined HAVE_FCNTL_DIRECT +static constexpr ulint OS_DATA_FILE_NO_O_DIRECT = 103; +#endif /* @} */ /** Error codes from os_file_get_last_error @{ */ @@ -373,7 +375,7 @@ os_file_create_simple_no_error_handling_func( bool* success) MY_ATTRIBUTE((warn_unused_result)); -#ifdef _WIN32 +#ifndef HAVE_FCNTL_DIRECT #define os_file_set_nocache(fd, file_name, operation_name) do{}while(0) #else /** Tries to disable OS caching on an opened file descriptor. 
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index db846795..457d9ab5 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -210,14 +210,11 @@ extern unsigned long long srv_max_undo_log_size; extern uint srv_n_fil_crypt_threads; extern uint srv_n_fil_crypt_threads_started; -/** Rate at which UNDO records should be purged. */ -extern ulong srv_purge_rseg_truncate_frequency; - /** Enable or Disable Truncate of UNDO tablespace. */ extern my_bool srv_undo_log_truncate; /** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */ -constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) / +constexpr uint32_t SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) / UNIV_PAGE_SIZE_DEF; extern char* srv_log_group_home_dir; diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h index 1dca0cc1..01067322 100644 --- a/storage/innobase/include/srw_lock.h +++ b/storage/innobase/include/srw_lock.h @@ -34,7 +34,6 @@ this program; if not, write to the Free Software Foundation, Inc., # define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */ #endif -#ifdef SUX_LOCK_GENERIC /** An exclusive-only variant of srw_lock */ template<bool spinloop> class pthread_mutex_wrapper final @@ -70,7 +69,6 @@ template<> inline void pthread_mutex_wrapper<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); } # endif -#endif /** Futex-based mutex */ template<bool spinloop> @@ -541,7 +539,7 @@ public: /** @return whether any lock may be held by any thread */ bool is_locked_or_waiting() const noexcept { return lock.is_locked_or_waiting(); } - /** @return whether an exclusive lock may be held by any thread */ + /** @return whether a shared or exclusive lock may be held by any thread */ bool is_locked() const noexcept { return lock.is_locked(); } /** @return whether an exclusive lock may be held by any thread */ bool is_write_locked() const noexcept { return 
lock.is_write_locked(); } diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 3ddd2e98..0f4f8afa 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -140,6 +140,15 @@ private: bool m_initialized{false}; /** whether purge is enabled; protected by latch and std::atomic */ std::atomic<bool> m_enabled{false}; + /** The primary candidate for iterator::free_history() is + rseg=trx_sys.rseg_array[skipped_rseg]. This field may be changed + after invoking rseg.set_skip_allocation() and rseg.clear_skip_allocation() + and while holding the exclusive rseg.latch. + + This may only be 0 if innodb_undo_tablespaces=0, because rollback segment + 0 always resides in the system tablespace and would never be used when + dedicated undo tablespaces are in use. */ + Atomic_relaxed<uint8_t> skipped_rseg; public: /** whether purge is active (may hold table handles) */ std::atomic<bool> m_active{false}; @@ -197,6 +206,11 @@ public: return undo_no <= other.undo_no; } + /** Remove unnecessary history data from a rollback segment. + @param rseg rollback segment + @return error code */ + inline dberr_t free_history_rseg(trx_rseg_t &rseg) const; + /** Free the undo pages up to this. 
*/ dberr_t free_history() const; @@ -240,14 +254,15 @@ public: by the pq_mutex */ mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */ - /** Undo tablespace file truncation (only accessed by the - srv_purge_coordinator_thread) */ - struct { - /** The undo tablespace that is currently being truncated */ - fil_space_t* current; - /** The undo tablespace that was last truncated */ - fil_space_t* last; - } truncate; + /** innodb_undo_log_truncate=ON state; + only modified by purge_coordinator_callback() */ + struct { + /** The undo tablespace that is currently being truncated */ + Atomic_relaxed<fil_space_t*> current; + /** The number of the undo tablespace that was last truncated, + relative from srv_undo_space_id_start */ + uint32_t last; + } truncate_undo_space; /** Create the instance */ void create(); @@ -357,6 +372,26 @@ public: typically via purge_sys_t::view_guard. */ return view.sees(id); } + +private: + /** Enable the use of a rollback segment and advance skipped_rseg, + after iterator::free_history_rseg() had invoked + rseg.set_skip_allocation(). */ + inline void rseg_enable(trx_rseg_t &rseg); + + /** Try to start truncating a tablespace. + @param id undo tablespace identifier + @param size the maximum desired undo tablespace size, in pages + @return undo tablespace whose truncation was started + @retval nullptr if truncation is not currently possible */ + inline fil_space_t *undo_truncate_try(uint32_t id, uint32_t size); +public: + /** Check if innodb_undo_log_truncate=ON needs to be handled. + This is only to be called by purge_coordinator_callback(). + @return undo tablespace chosen by innodb_undo_log_truncate=ON + @retval nullptr if truncation is not currently possible */ + fil_space_t *truncating_tablespace(); + /** A wrapper around trx_sys_t::clone_oldest_view(). 
*/ template<bool also_end_view= false> void clone_oldest_view() diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index 43e0c290..7fa43047 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -73,14 +73,15 @@ private: /** Reference counter to track is_persistent() transactions, with SKIP flag. */ std::atomic<uint32_t> ref; - +public: /** Whether undo tablespace truncation is pending */ static constexpr uint32_t SKIP= 1; /** Transaction reference count multiplier */ static constexpr uint32_t REF= 2; + /** @return the reference count and flags */ uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); } - +private: /** Set the SKIP bit */ void ref_set_skip() { diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 5dd0169f..3fa41fdf 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -902,8 +902,8 @@ public: uint64_t recovered_binlog_offset; /** Latest recovered binlog file name */ char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN]; - /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */ - lsn_t recovered_binlog_lsn; + /** Set when latest position is from pre-version 10.3.5 TRX_SYS. 
*/ + bool recovered_binlog_is_legacy_pos; /** @@ -1191,6 +1191,11 @@ public: return count; } + /** Disable further allocation of transactions in a rollback segment + that are subject to innodb_undo_log_truncate=ON + @param space undo tablespace that will be truncated */ + inline void undo_truncate_start(fil_space_t &space); + /** Set the undo log empty value */ void set_undo_non_empty(bool val) { diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 3cfbe331..0a3e0d62 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1108,6 +1108,7 @@ public: { ut_ad(state == TRX_STATE_NOT_STARTED); ut_ad(!id); + ut_ad(!*detailed_error); ut_ad(!mutex_is_owner()); ut_ad(!has_logged()); ut_ad(!is_referenced()); diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 91999c81..9f39b303 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -100,6 +100,7 @@ bool log_t::create() /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */ lsn.store(FIRST_LSN, std::memory_order_relaxed); flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed); + need_checkpoint.store(true, std::memory_order_relaxed); write_lsn= FIRST_LSN; #ifndef HAVE_PMEM @@ -124,18 +125,17 @@ bool log_t::create() TRASH_ALLOC(flush_buf, buf_size); checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096)); memset_aligned<4096>(checkpoint_buf, 0, 4096); + max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; #else ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); + max_buf_free= 1; #endif latch.SRW_LOCK_INIT(log_latch_key); init_lsn_lock(); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; - set_check_flush_or_checkpoint(); - last_checkpoint_lsn= FIRST_LSN; log_capacity= 0; max_modified_age_async= 0; @@ -236,6 +236,7 @@ void log_t::attach_low(log_file_t file, os_offset_t size) log.close(); mprotect(ptr, size_t(size), 
PROT_READ); buf= static_cast<byte*>(ptr); + max_buf_free= size; # if defined __linux__ || defined _WIN32 set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); # endif @@ -264,6 +265,7 @@ void log_t::attach_low(log_file_t file, os_offset_t size) TRASH_ALLOC(buf, buf_size); TRASH_ALLOC(flush_buf, buf_size); + max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; #endif #if defined __linux__ || defined _WIN32 @@ -813,8 +815,8 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept #ifndef SUX_LOCK_GENERIC ut_ad(latch.is_write_locked()); #endif - ut_ad(!srv_read_only_mode); ut_ad(!is_pmem()); + ut_ad(!srv_read_only_mode); const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; @@ -849,7 +851,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept ... /* TODO: Update the LSN and adjust other code. */ #else /* The rest of the block will be written as garbage. - (We want to avoid memset() while holding mutex.) + (We want to avoid memset() while holding exclusive log_sys.latch) This block will be overwritten later, once records beyond the current LSN are generated. */ # ifdef HAVE_valgrind @@ -886,6 +888,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept write_lsn= lsn; } + set_check_for_checkpoint(false); return lsn; } @@ -927,8 +930,9 @@ wait and check if an already running write is covering the request. 
void log_write_up_to(lsn_t lsn, bool durable, const completion_callback *callback) { - ut_ad(!srv_read_only_mode); + ut_ad(!srv_read_only_mode || (log_sys.buf_free < log_sys.max_buf_free)); ut_ad(lsn != LSN_MAX); + ut_ad(lsn != 0); if (UNIV_UNLIKELY(recv_no_ibuf_operations)) { @@ -985,7 +989,6 @@ repeat: @param durable whether to wait for a durable write to complete */ void log_buffer_flush_to_disk(bool durable) { - ut_ad(!srv_read_only_mode); log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); } @@ -1017,16 +1020,6 @@ ATTRIBUTE_COLD void log_write_and_flush() #endif } -/******************************************************************** - -Tries to establish a big enough margin of free space in the log buffer, such -that a new log entry can be catenated without an immediate need for a flush. */ -ATTRIBUTE_COLD static void log_flush_margin() -{ - if (log_sys.buf_free > log_sys.max_buf_free) - log_buffer_flush_to_disk(false); -} - /****************************************************************//** Tries to establish a big enough margin of free space in the log, such that a new log entry can be catenated without an immediate need for a @@ -1034,12 +1027,12 @@ checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects! */ ATTRIBUTE_COLD static void log_checkpoint_margin() { - while (log_sys.check_flush_or_checkpoint()) + while (log_sys.check_for_checkpoint()) { log_sys.latch.rd_lock(SRW_LOCK_CALL); ut_ad(!recv_no_log_write); - if (!log_sys.check_flush_or_checkpoint()) + if (!log_sys.check_for_checkpoint()) { func_exit: log_sys.latch.rd_unlock(); @@ -1055,7 +1048,7 @@ func_exit: #ifndef DBUG_OFF skip_checkpoint: #endif - log_sys.set_check_flush_or_checkpoint(false); + log_sys.set_check_for_checkpoint(false); goto func_exit; } @@ -1069,30 +1062,17 @@ func_exit: } } -/** -Checks that there is enough free space in the log to start a new query step. 
-Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this -function may only be called if the calling thread owns no synchronization -objects! */ -ATTRIBUTE_COLD void log_check_margins() -{ - do - { - log_flush_margin(); - log_checkpoint_margin(); - ut_ad(!recv_no_log_write); - } - while (log_sys.check_flush_or_checkpoint()); -} - /** Wait for a log checkpoint if needed. NOTE that this function may only be called while not holding any synchronization objects except dict_sys.latch. */ void log_free_check() { ut_ad(!lock_sys.is_writer()); - if (log_sys.check_flush_or_checkpoint()) - log_check_margins(); + if (log_sys.check_for_checkpoint()) + { + ut_ad(!recv_no_log_write); + log_checkpoint_margin(); + } } extern void buf_resize_shutdown(); diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 3c3fe41e..e72f842f 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -833,7 +833,22 @@ processed: filename= tbl_name + 1; } } - space->add(filename, OS_FILE_CLOSED, size, false, false); + pfs_os_file_t handle= OS_FILE_CLOSED; + if (srv_operation == SRV_OPERATION_RESTORE) + { + /* During mariadb-backup --backup, a table could be renamed, + created and dropped, and we may be missing the file at this + point of --prepare. Try to create the file if it does not exist + already. If the file exists, we'll pass handle=OS_FILE_CLOSED + and the file will be opened normally in fil_space_t::acquire() + inside recv_sys_t::recover_deferred(). 
*/ + bool success; + handle= os_file_create(innodb_data_file_key, filename, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT | + OS_FILE_ON_ERROR_SILENT, + OS_FILE_AIO, OS_DATA_FILE, false, &success); + } + space->add(filename, handle, size, false, false); space->recv_size= it->second.size; space->size_in_header= size; return space; @@ -1238,7 +1253,8 @@ static void fil_name_process(const char *name, ulint len, uint32_t space_id, file_name_t& f = p.first->second; - if (auto d = deferred_spaces.find(space_id)) { + auto d = deferred_spaces.find(space_id); + if (d) { if (deleted) { d->deleted = true; goto got_deleted; @@ -1311,7 +1327,16 @@ same_space: FILE_* record. */ ut_ad(space == NULL); - if (srv_force_recovery) { + if (srv_operation == SRV_OPERATION_RESTORE && d + && ftype == FILE_RENAME) { +rename: + d->file_name = fname.name; + f.name = fname.name; + break; + } + + if (srv_force_recovery + || srv_operation == SRV_OPERATION_RESTORE) { /* Without innodb_force_recovery, missing tablespaces will only be reported in @@ -1330,7 +1355,11 @@ same_space: break; case FIL_LOAD_DEFER: - /** Skip the deferred spaces + if (d && ftype == FILE_RENAME + && srv_operation == SRV_OPERATION_RESTORE) { + goto rename; + } + /* Skip the deferred spaces when lsn is already processed */ if (!if_exists) { deferred_spaces.add( @@ -1735,20 +1764,6 @@ dberr_t recv_sys_t::find_checkpoint() { if (wrong_size) return DB_CORRUPTION; - if (log_sys.next_checkpoint_lsn < 8204) - { - /* Before MDEV-14425, InnoDB had a minimum LSN of 8192+12=8204. - Likewise, mariadb-backup --prepare would create an empty - ib_logfile0 after applying the log. We will allow an upgrade - from such an empty log. - - If a user replaces the redo log with an empty file and the - FIL_PAGE_FILE_FLUSH_LSN field was zero in the system - tablespace (see SysTablespace::read_lsn_and_check_flags()) we - must refuse to start up. 
*/ - sql_print_error("InnoDB: ib_logfile0 is empty, and LSN is unknown."); - return DB_CORRUPTION; - } lsn= log_sys.next_checkpoint_lsn; log_sys.format= log_t::FORMAT_3_23; goto upgrade; @@ -2409,7 +2424,7 @@ struct recv_ring : public recv_buf { const size_t s(*this - start); ut_ad(s + len <= srv_page_size); - if (!log_sys.is_encrypted()) + if (!len || !log_sys.is_encrypted()) { if (start.ptr + s == ptr && ptr + len <= end()) return ptr; @@ -3205,7 +3220,7 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr, skipped_after_init = false; ut_ad(end_lsn == page_lsn); if (end_lsn != page_lsn) { - sql_print_warning( + sql_print_information( "InnoDB: The last skipped log record" " LSN " LSN_PF " is not equal to page LSN " LSN_PF, @@ -4012,7 +4027,6 @@ static bool recv_scan_log(bool last_phase) const size_t block_size_1{log_sys.get_block_size() - 1}; mysql_mutex_lock(&recv_sys.mutex); - ut_d(recv_sys.after_apply= last_phase); if (!last_phase) recv_sys.clear(); else @@ -4221,6 +4235,7 @@ static bool recv_scan_log(bool last_phase) recv_sys.lsn= rewound_lsn; } func_exit: + ut_d(recv_sys.after_apply= last_phase); mysql_mutex_unlock(&recv_sys.mutex); DBUG_RETURN(!store); } @@ -4507,12 +4522,36 @@ done: return err; } +dberr_t recv_recovery_read_checkpoint() +{ + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_d(mysql_mutex_lock(&buf_pool.mutex)); + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + ut_d(mysql_mutex_unlock(&buf_pool.mutex)); + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) + { + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + return DB_SUCCESS; + } + + log_sys.latch.wr_lock(SRW_LOCK_CALL); + dberr_t err= recv_sys.find_checkpoint(); + log_sys.latch.wr_unlock(); + return err; +} + /** Start recovering from a redo log checkpoint. 
of first system tablespace page @return error code or DB_SUCCESS */ dberr_t recv_recovery_from_checkpoint_start() { - bool rescan = false; + bool rescan = false; + dberr_t err = DB_SUCCESS; ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || srv_operation == SRV_OPERATION_RESTORE @@ -4525,20 +4564,12 @@ dberr_t recv_recovery_from_checkpoint_start() if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { sql_print_information("InnoDB: innodb_force_recovery=6" " skips redo log apply"); - return(DB_SUCCESS); + return err; } recv_sys.recovery_on = true; log_sys.latch.wr_lock(SRW_LOCK_CALL); - - dberr_t err = recv_sys.find_checkpoint(); - if (err != DB_SUCCESS) { -early_exit: - log_sys.latch.wr_unlock(); - return err; - } - log_sys.set_capacity(); /* Start reading the log from the checkpoint lsn. The variable @@ -4548,7 +4579,9 @@ early_exit: ut_ad(recv_sys.pages.empty()); if (log_sys.format == log_t::FORMAT_3_23) { - goto early_exit; +early_exit: + log_sys.latch.wr_unlock(); + return err; } if (log_sys.is_latest()) { @@ -4843,7 +4876,7 @@ byte *recv_dblwr_t::find_page(const page_id_t page_id, } bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name, - os_file_t file) + pfs_os_file_t file) { const page_id_t page_id(space_id, 0); const byte* page= find_page(page_id); @@ -4851,10 +4884,10 @@ bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name, { /* If the first page of the given user tablespace is not there in the doublewrite buffer, then the recovery is going to fail - now. Hence this is treated as error. */ - ib::error() - << "Corrupted page " << page_id << " of datafile '" - << name <<"' could not be found in the doublewrite buffer."; + now. 
Report error only when doublewrite buffer is not empty */ + if (pages.size()) + ib::error() << "Corrupted page " << page_id << " of datafile '" + << name << "' could not be found in the doublewrite buffer."; return true; } @@ -4868,3 +4901,58 @@ bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name, IORequestWrite, name, file, page, 0, physical_size) != DB_SUCCESS; } + +uint32_t recv_dblwr_t::find_first_page(const char *name, pfs_os_file_t file) +{ + os_offset_t file_size= os_file_get_size(file); + if (file_size != (os_offset_t) -1) + { + for (const page_t *page : pages) + { + uint32_t space_id= page_get_space_id(page); + byte *read_page= nullptr; + if (page_get_page_no(page) > 0 || space_id == 0) + { +next_page: + aligned_free(read_page); + continue; + } + uint32_t flags= mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + page_id_t page_id(space_id, 0); + size_t page_size= fil_space_t::physical_size(flags); + if (file_size < 4 * page_size) + goto next_page; + read_page= + static_cast<byte*>(aligned_malloc(3 * page_size, page_size)); + /* Read 3 pages from the file and match the space id + with the space id which is stored in + doublewrite buffer page. 
*/ + if (os_file_read(IORequestRead, file, read_page, page_size, + 3 * page_size, nullptr) != DB_SUCCESS) + goto next_page; + for (ulint j= 0; j <= 2; j++) + { + byte *cur_page= read_page + j * page_size; + if (buf_is_zeroes(span<const byte>(cur_page, page_size))) + { + space_id= 0; + goto early_exit; + } + if (mach_read_from_4(cur_page + FIL_PAGE_OFFSET) != j + 1 || + memcmp(cur_page + FIL_PAGE_SPACE_ID, + page + FIL_PAGE_SPACE_ID, 4) || + buf_page_is_corrupted(false, cur_page, flags)) + goto next_page; + } + if (!restore_first_page(space_id, name, file)) + { +early_exit: + aligned_free(read_page); + return space_id; + } + break; + } + } + return 0; +} diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 1834a164..01641f74 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -308,6 +308,22 @@ void mtr_t::release() m_memo.clear(); } +inline lsn_t log_t::get_write_target() const +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_locked()); +#endif + if (UNIV_LIKELY(buf_free < max_buf_free)) + return 0; + ut_ad(!is_pmem()); + /* The LSN corresponding to the end of buf is + write_lsn - (first_lsn & 4095) + buf_free, + but we use simpler arithmetics to return a smaller write target in + order to minimize waiting in log_write_up_to(). */ + ut_ad(max_buf_free >= 4096 * 4); + return write_lsn + max_buf_free / 2; +} + /** Commit a mini-transaction. 
*/ void mtr_t::commit() { @@ -331,6 +347,7 @@ void mtr_t::commit() std::pair<lsn_t,page_flush_ahead> lsns{do_write()}; process_freed_pages(); size_t modified= 0; + const lsn_t write_lsn= log_sys.get_write_target(); if (m_made_dirty) { @@ -408,7 +425,8 @@ void mtr_t::commit() break; default: buf_page_t *bpage= static_cast<buf_page_t*>(slot.object); - const auto s= bpage->unfix(); + ut_d(const auto s=) + bpage->unfix(); if (slot.type & MTR_MEMO_MODIFY) { ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || @@ -420,13 +438,10 @@ void mtr_t::commit() ut_ad(s < buf_page_t::READ_FIX); ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <= m_commit_lsn); - if (s >= buf_page_t::UNFIXED) - { - mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn); - if (UNIV_LIKELY_NULL(bpage->zip.data)) - memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data, - FIL_PAGE_LSN + bpage->frame, 8); - } + mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn); + if (UNIV_LIKELY_NULL(bpage->zip.data)) + memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data, + FIL_PAGE_LSN + bpage->frame, 8); modified++; } switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) { @@ -451,6 +466,9 @@ void mtr_t::commit() if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); + + if (UNIV_UNLIKELY(write_lsn != 0)) + log_write_up_to(write_lsn, false); } else { @@ -492,9 +510,20 @@ void mtr_t::rollback_to_savepoint(ulint begin, ulint end) m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end); } +/** Set create_lsn. */ +inline void fil_space_t::set_create_lsn(lsn_t lsn) +{ +#ifndef SUX_LOCK_GENERIC + /* Concurrent log_checkpoint_low() must be impossible. */ + ut_ad(latch.is_write_locked()); +#endif + create_lsn= lsn; +} + /** Commit a mini-transaction that is shrinking a tablespace. 
-@param space tablespace that is being shrunk */ -void mtr_t::commit_shrink(fil_space_t &space) +@param space tablespace that is being shrunk +@param size new size in pages */ +void mtr_t::commit_shrink(fil_space_t &space, uint32_t size) { ut_ad(is_active()); ut_ad(!is_inside_ibuf()); @@ -514,6 +543,15 @@ void mtr_t::commit_shrink(fil_space_t &space) const lsn_t start_lsn= do_write().first; ut_d(m_log.erase()); + fil_node_t *file= UT_LIST_GET_LAST(space.chain); + mysql_mutex_lock(&fil_system.mutex); + ut_ad(file->is_open()); + space.size= file->size= size; + space.set_create_lsn(m_commit_lsn); + mysql_mutex_unlock(&fil_system.mutex); + + space.clear_freed_ranges(); + /* Durably write the reduced FSP_SIZE before truncating the data file. */ log_write_and_flush(); #ifndef SUX_LOCK_GENERIC @@ -521,11 +559,11 @@ void mtr_t::commit_shrink(fil_space_t &space) #endif os_file_truncate(space.chain.start->name, space.chain.start->handle, - os_offset_t{space.size} << srv_page_size_shift, true); + os_offset_t{size} << srv_page_size_shift, true); space.clear_freed_ranges(); - const page_id_t high{space.id, space.size}; + const page_id_t high{space.id, size}; size_t modified= 0; auto it= m_memo.rbegin(); mysql_mutex_lock(&buf_pool.flush_list_mutex); @@ -586,13 +624,6 @@ void mtr_t::commit_shrink(fil_space_t &space) log_sys.latch.wr_unlock(); m_latch_ex= false; - mysql_mutex_lock(&fil_system.mutex); - ut_ad(space.is_being_truncated); - ut_ad(space.is_stopping_writes()); - space.clear_stopping(); - space.is_being_truncated= false; - mysql_mutex_unlock(&fil_system.mutex); - release(); release_resources(); } @@ -680,7 +711,7 @@ The caller must hold exclusive log_sys.latch. This is to be used at log_checkpoint(). 
@param checkpoint_lsn the log sequence number of a checkpoint, or 0 @return current LSN */ -lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn) +ATTRIBUTE_COLD lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn) { #ifndef SUX_LOCK_GENERIC ut_ad(log_sys.latch.is_write_locked()); @@ -840,26 +871,26 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) } /** Wait in append_prepare() for buffer to become available +@param lsn log sequence number to write up to @param ex whether log_sys.latch is exclusively locked */ -ATTRIBUTE_COLD void log_t::append_prepare_wait(bool ex) noexcept +ATTRIBUTE_COLD void log_t::append_prepare_wait(lsn_t lsn, bool ex) noexcept { - log_sys.waits++; - log_sys.unlock_lsn(); + waits++; + unlock_lsn(); if (ex) - log_sys.latch.wr_unlock(); + latch.wr_unlock(); else - log_sys.latch.rd_unlock(); + latch.rd_unlock(); - DEBUG_SYNC_C("log_buf_size_exceeded"); - log_buffer_flush_to_disk(log_sys.is_pmem()); + log_write_up_to(lsn, is_pmem()); if (ex) - log_sys.latch.wr_lock(SRW_LOCK_CALL); + latch.wr_lock(SRW_LOCK_CALL); else - log_sys.latch.rd_lock(SRW_LOCK_CALL); + latch.rd_lock(SRW_LOCK_CALL); - log_sys.lock_lsn(); + lock_lsn(); } /** Reserve space in the log buffer for appending data. @@ -878,34 +909,30 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept # endif #endif ut_ad(pmem == is_pmem()); - const lsn_t checkpoint_margin{last_checkpoint_lsn + log_capacity - size}; - const size_t avail{(pmem ? size_t(capacity()) : buf_size) - size}; lock_lsn(); write_to_buf++; - for (ut_d(int count= 50); - UNIV_UNLIKELY((pmem - ? size_t(get_lsn() - - get_flushed_lsn(std::memory_order_relaxed)) - : size_t{buf_free}) > avail); ) + const lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; + size_t b{buf_free}; + + if (UNIV_UNLIKELY(pmem + ? 
(end_lsn - + get_flushed_lsn(std::memory_order_relaxed)) > capacity() + : b + size >= buf_size)) { - append_prepare_wait(ex); - ut_ad(count--); + append_prepare_wait(l, ex); + b= buf_free; } - const lsn_t l{lsn.load(std::memory_order_relaxed)}; - lsn.store(l + size, std::memory_order_relaxed); - const size_t b{buf_free}; - size_t new_buf_free{b}; - new_buf_free+= size; + lsn.store(end_lsn, std::memory_order_relaxed); + size_t new_buf_free= b + size; if (pmem && new_buf_free >= file_size) new_buf_free-= size_t(capacity()); buf_free= new_buf_free; unlock_lsn(); - if (UNIV_UNLIKELY(l > checkpoint_margin) || - (!pmem && b >= max_buf_free)) - set_check_flush_or_checkpoint(); + if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity)) + set_check_for_checkpoint(); return {l, &buf[b]}; } @@ -930,7 +957,7 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age)) return mtr_t::PAGE_FLUSH_ASYNC; - log_sys.set_check_flush_or_checkpoint(); + log_sys.set_check_for_checkpoint(); return mtr_t::PAGE_FLUSH_SYNC; } @@ -989,10 +1016,9 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write() #ifndef DBUG_OFF do { - if (m_log_mode != MTR_LOG_ALL) + if (m_log_mode != MTR_LOG_ALL || + _db_keyword_(nullptr, "skip_page_checksum", 1)) continue; - DBUG_EXECUTE_IF("skip_page_checksum", continue;); - for (const mtr_memo_slot_t& slot : m_memo) if (slot.type & MTR_MEMO_MODIFY) { @@ -1150,9 +1176,6 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, } } -/** Write the mini-transaction log to the redo log buffer. 
-@param len number of bytes to write -@return {start_lsn,flush_ahead} */ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(size_t len) { diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 5e674806..31bec346 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -975,7 +975,7 @@ os_file_create_simple_func( *success = false; int create_flag; - const char* mode_str = NULL; + const char* mode_str __attribute__((unused)); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); @@ -1051,6 +1051,7 @@ os_file_create_simple_func( } while (retry); +#ifdef HAVE_FCNTL_DIRECT /* This function is always called for data files, we should disable OS caching (O_DIRECT) here as we do in os_file_create_func(), so we open the same file in the same mode, see man page of open(2). */ @@ -1065,6 +1066,7 @@ os_file_create_simple_func( break; } } +#endif #ifndef _WIN32 if (!read_only @@ -1150,7 +1152,7 @@ os_file_create_func( ); int create_flag; - const char* mode_str = NULL; + const char* mode_str __attribute__((unused)); on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? 
true : false; @@ -1192,10 +1194,13 @@ os_file_create_func( return(OS_FILE_CLOSED); } +#ifdef HAVE_FCNTL_DIRECT ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE || type == OS_DATA_FILE_NO_O_DIRECT); - +#else + ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); +#endif ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); /* We let O_DSYNC only affect log files */ @@ -1241,7 +1246,7 @@ os_file_create_func( return file; } -#if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT +#ifdef HAVE_FCNTL_DIRECT if (type == OS_DATA_FILE) { switch (srv_file_flush_method) { case SRV_O_DSYNC: @@ -2175,10 +2180,8 @@ os_file_create_func( if (srv_file_flush_method == SRV_O_DSYNC) attributes|= FILE_FLAG_WRITE_THROUGH; } - else if (type == OS_DATA_FILE) - { - switch (srv_file_flush_method) - { + else if (type == OS_DATA_FILE) { + switch (srv_file_flush_method) { case SRV_FSYNC: case SRV_LITTLESYNC: case SRV_NOSYNC: @@ -3042,30 +3045,15 @@ os_file_handle_error_cond_exit( return(false); } -#ifndef _WIN32 +#ifdef HAVE_FCNTL_DIRECT /** Tries to disable OS caching on an opened file descriptor. 
@param[in] fd file descriptor to alter @param[in] file_name file name, used in the diagnostic message @param[in] name "open" or "create"; used in the diagnostic message */ void -os_file_set_nocache( - int fd MY_ATTRIBUTE((unused)), - const char* file_name MY_ATTRIBUTE((unused)), - const char* operation_name MY_ATTRIBUTE((unused))) +os_file_set_nocache(int fd, const char *file_name, const char *operation_name) { - /* some versions of Solaris may not have DIRECTIO_ON */ -#if defined(__sun__) && defined(DIRECTIO_ON) - if (directio(fd, DIRECTIO_ON) == -1) { - int errno_save = errno; - - ib::error() - << "Failed to set DIRECTIO_ON on file " - << file_name << "; " << operation_name << ": " - << strerror(errno_save) << "," - " continuing anyway."; - } -#elif defined(O_DIRECT) if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { int errno_save = errno; static bool warning_message_printed = false; @@ -3084,10 +3072,8 @@ os_file_set_nocache( << ", continuing anyway."; } } -#endif /* defined(__sun__) && defined(DIRECTIO_ON) */ } - -#endif /* _WIN32 */ +#endif /* HAVE_FCNTL_DIRECT */ /** Check if the file system supports sparse files. 
@param fh file handle @@ -3177,8 +3163,18 @@ fallback: return true; } current_size &= ~4095ULL; +# ifdef __linux__ + if (!fallocate(file, 0, current_size, + size - current_size)) { + err = 0; + break; + } + + err = errno; +# else err = posix_fallocate(file, current_size, size - current_size); +# endif } } while (err == EINTR && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED); @@ -3457,7 +3453,7 @@ static void write_io_callback(void *c) if (UNIV_UNLIKELY(cb->m_err != 0)) ib::info () << "IO Error: " << cb->m_err - << "during write of " + << " during write of " << cb->m_len << " bytes, for file " << request.node->name << "(" << cb->m_fh << "), returned " << cb->m_ret_len; @@ -4194,7 +4190,6 @@ bool fil_node_t::read_page0() != DB_SUCCESS) { sql_print_error("InnoDB: Unable to read first page of file %s", name); -corrupted: aligned_free(page); return false; } @@ -4211,25 +4206,35 @@ corrupted: if (!fil_space_t::is_valid_flags(flags, space->id)) { uint32_t cflags= fsp_flags_convert_from_101(flags); - if (cflags == UINT32_MAX) + if (cflags != UINT32_MAX) { -invalid: - ib::error() << "Expected tablespace flags " - << ib::hex(space->flags) - << " but found " << ib::hex(flags) - << " in the file " << name; - goto corrupted; + uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK; + uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK; + + if (fil_space_t::is_flags_equal(cf, sf) || + fil_space_t::is_flags_equal(sf, cf)) + { + flags= cflags; + goto flags_ok; + } } - uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK; - uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK; + aligned_free(page); + goto invalid; + } - if (!fil_space_t::is_flags_equal(cf, sf) && - !fil_space_t::is_flags_equal(sf, cf)) - goto invalid; - flags= cflags; + if (!fil_space_t::is_flags_equal((flags & ~FSP_FLAGS_MEM_MASK), + (space->flags & ~FSP_FLAGS_MEM_MASK)) && + !fil_space_t::is_flags_equal((space->flags & ~FSP_FLAGS_MEM_MASK), + (flags & ~FSP_FLAGS_MEM_MASK))) + { +invalid: + sql_print_error("InnoDB: Expected tablespace flags 0x%zx 
but found 0x%zx" + " in the file %s", space->flags, flags, name); + return false; } + flags_ok: ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); /* Try to read crypt_data from page 0 if it is not yet read. */ diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index 61614007..51bcc954 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1778,9 +1778,6 @@ pars_create_table( ulint flags = 0; ulint flags2 = DICT_TF2_FTS_AUX_HEX_NAME; - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); - n_cols = que_node_list_get_len(column_defs); table = dict_table_t::create( diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 17a2f034..9d85e2b1 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -1630,9 +1630,6 @@ row_fts_merge_insert( /* We should set the flags2 with aux_table_name here, in order to get the correct aux table names. 
*/ index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; - DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", - index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME - & ((1U << DICT_TF2_BITS) - 1);); fts_table.type = FTS_INDEX_TABLE; fts_table.index_id = index->id; fts_table.table_id = table->id; diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index d2609fdb..2516e24e 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -117,7 +117,6 @@ struct row_import { row_import() UNIV_NOTHROW : m_table(NULL), - m_version(0), m_hostname(NULL), m_table_name(NULL), m_autoinc(0), @@ -196,8 +195,6 @@ struct row_import { dict_table_t* m_table; /*!< Table instance */ - ulint m_version; /*!< Version of config file */ - byte* m_hostname; /*!< Hostname where the tablespace was exported */ byte* m_table_name; /*!< Exporting instance table @@ -2992,17 +2989,13 @@ row_import_read_meta_data( return(DB_IO_ERROR); } - cfg.m_version = mach_read_from_4(row); - /* Check the version number. 
*/ - switch (cfg.m_version) { + switch (mach_read_from_4(row)) { case IB_EXPORT_CFG_VERSION_V1: - return(row_import_read_v1(file, thd, &cfg)); default: - ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, - "Unsupported meta-data version number (" ULINTPF "), " - "file ignored", cfg.m_version); + ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_NOT_SUPPORTED_YET, + "meta-data version"); } return(DB_ERROR); diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index bdee0ed1..9c3c5d22 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -2638,14 +2638,17 @@ row_ins_clust_index_entry_low( ut_ad(!dict_index_is_online_ddl(index)); ut_ad(!index->table->persistent_autoinc); ut_ad(!index->is_instant()); + ut_ad(!entry->info_bits); mtr.set_log_mode(MTR_LOG_NO_REDO); } else { index->set_modified(mtr); - if (UNIV_UNLIKELY(entry->is_metadata())) { + if (UNIV_UNLIKELY(entry->info_bits != 0)) { + ut_ad(entry->is_metadata()); ut_ad(index->is_instant()); ut_ad(!dict_index_is_online_ddl(index)); ut_ad(mode == BTR_MODIFY_TREE); + ut_ad(flags == BTR_NO_LOCKING_FLAG); } else { if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { @@ -2787,11 +2790,6 @@ avoid_bulk: skip_bulk_insert: if (UNIV_UNLIKELY(entry->info_bits != 0)) { - ut_ad(entry->is_metadata()); - ut_ad(flags == BTR_NO_LOCKING_FLAG); - ut_ad(index->is_instant()); - ut_ad(!dict_index_is_online_ddl(index)); - const rec_t* rec = btr_pcur_get_rec(&pcur); if (rec_get_info_bits(rec, page_rec_is_comp(rec)) @@ -2895,9 +2893,20 @@ do_insert: } } + if (err == DB_SUCCESS && entry->info_bits) { + if (buf_block_t* root + = btr_root_block_get(index, RW_X_LATCH, &mtr, + &err)) { + btr_set_instant(root, *index, &mtr); + } else { + ut_ad("cannot find root page" == 0); + } + } + mtr.commit(); if (big_rec) { + ut_ad(err == DB_SUCCESS); /* Online table rebuild could read (and ignore) the incomplete record at this point. 
If online rebuild is in progress, the diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 5df93fe6..188d8ba5 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -120,7 +120,7 @@ public: ut_ad(mtr_started == scan_mtr->is_active()); DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush", - log_sys.set_check_flush_or_checkpoint();); + log_sys.set_check_for_checkpoint();); for (idx_tuple_vec::iterator it = m_dtuple_vec.begin(); it != m_dtuple_vec.end(); @@ -128,7 +128,7 @@ public: dtuple = *it; ut_ad(dtuple); - if (log_sys.check_flush_or_checkpoint()) { + if (log_sys.check_for_checkpoint()) { if (mtr_started) { if (!btr_pcur_move_to_prev_on_page(pcur)) { error = DB_CORRUPTION; @@ -2235,6 +2235,8 @@ end_of_index: goto err_exit; } + buf_page_make_young_if_needed(&block->page); + page_cur_set_before_first(block, cur); if (!page_cur_move_to_next(cur) || page_cur_is_after_last(cur)) { @@ -3545,17 +3547,6 @@ row_merge_sort( of file marker). Thus, it must be at least one block. */ ut_ad(file->offset > 0); - /* These thd_progress* calls will crash on sol10-64 when innodb_plugin - is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on - sol10-64 in buildbot. - */ -#ifndef __sun__ - /* Progress report only for "normal" indexes. */ - if (dup && !(dup->index->type & DICT_FTS)) { - thd_progress_init(trx->mysql_thd, 1); - } -#endif /* __sun__ */ - if (global_system_variables.log_warnings > 2) { sql_print_information("InnoDB: Online DDL : merge-sorting" " has estimated " ULINTPF " runs", @@ -3564,15 +3555,6 @@ row_merge_sort( /* Merge the runs until we have one big run */ do { - /* Report progress of merge sort to MySQL for - show processlist progress field */ - /* Progress report only for "normal" indexes. 
*/ -#ifndef __sun__ - if (dup && !(dup->index->type & DICT_FTS)) { - thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset); - } -#endif /* __sun__ */ - error = row_merge(trx, dup, file, block, tmpfd, &num_runs, run_offset, stage, crypt_block, space); @@ -3596,13 +3578,6 @@ row_merge_sort( ut_free(run_offset); - /* Progress report only for "normal" indexes. */ -#ifndef __sun__ - if (dup && !(dup->index->type & DICT_FTS)) { - thd_progress_end(trx->mysql_thd); - } -#endif /* __sun__ */ - DBUG_RETURN(error); } @@ -4436,13 +4411,14 @@ row_merge_file_create( merge_file->fd = row_merge_file_create_low(path); merge_file->offset = 0; merge_file->n_rec = 0; - +#ifdef HAVE_FCNTL_DIRECT if (merge_file->fd != OS_FILE_CLOSED) { if (srv_disable_sort_file_cache) { os_file_set_nocache(merge_file->fd, "row0merge.cc", "sort"); } } +#endif return(merge_file->fd); } diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 4756cc37..d83ab861 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -822,7 +822,6 @@ skip_secondaries: buf_page_get(page_id_t(rseg.space->id, page_no), 0, RW_X_LATCH, &mtr)) { - block->page.set_accessed(); buf_page_make_young_if_needed(&block->page); byte* data_field = block->page.frame diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 6c76dd91..33f4d81f 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -1222,6 +1222,7 @@ re_scan: if (!cur_block) { goto func_end; } + buf_page_make_young_if_needed(&cur_block->page); } else { mtr->start(); goto func_end; diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index 8a1041c8..f14673c1 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -318,6 +318,8 @@ static buf_block_t* row_undo_rec_get(undo_node_t* node) return nullptr; } + buf_page_make_young_if_needed(&undo_page->page); + uint16_t offset = 
undo->top_offset; buf_block_t* prev_page = undo_page; diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index bec53841..a39574d2 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -2158,6 +2158,25 @@ row_upd_clust_rec_by_insert_inherit_func( return(inherit); } +/** Mark 'disowned' BLOBs as 'owned' and 'inherited' again, +after resuming from a lock wait. +@param entry clustered index entry */ +static ATTRIBUTE_COLD void row_upd_reown_inherited_fields(dtuple_t *entry) +{ + for (ulint i= 0; i < entry->n_fields; i++) + { + const dfield_t *dfield= dtuple_get_nth_field(entry, i); + if (dfield_is_ext(dfield)) + { + byte *blob_len= static_cast<byte*>(dfield->data) + + dfield->len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_LEN); + ut_ad(*blob_len & BTR_EXTERN_OWNER_FLAG); + *blob_len= byte((*blob_len & ~BTR_EXTERN_OWNER_FLAG) | + BTR_EXTERN_INHERITED_FLAG); + } + } +} + /***********************************************************//** Marks the clustered index record deleted and inserts the updated version of the record to the index. This function should be used when the ordering @@ -2236,12 +2255,16 @@ row_upd_clust_rec_by_insert( /* If the clustered index record is already delete marked, then we are here after a DB_LOCK_WAIT. Skip delete marking clustered index and disowning - its blobs. */ + its blobs. Mark the BLOBs in the index entry + (which we copied from the already "disowned" rec) + as "owned", like it was on the previous call of + row_upd_clust_rec_by_insert(). */ ut_ad(row_get_rec_trx_id(rec, index, offsets) == trx->id); ut_ad(!trx_undo_roll_ptr_is_insert( row_get_rec_roll_ptr(rec, index, offsets))); + row_upd_reown_inherited_fields(entry); goto check_fk; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index bf9755fb..7c0c4b92 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -106,9 +106,6 @@ segment). 
It is quite possible that some of the tablespaces doesn't host any of the rollback-segment based on configuration used. */ uint32_t srv_undo_tablespaces_active; -/** Rate at which UNDO records should be purged. */ -ulong srv_purge_rseg_truncate_frequency; - /** Enable or Disable Truncate of UNDO tablespace. Note: If enabled then UNDO tablespace will be selected for truncate. While Server waits for undo-tablespace to truncate if user disables @@ -901,6 +898,9 @@ srv_export_innodb_status(void) export_vars.innodb_data_written = srv_stats.data_written + (dblwr << srv_page_size_shift); + export_vars.innodb_buffer_pool_read_requests + = buf_pool.stat.n_page_gets; + export_vars.innodb_buffer_pool_bytes_data = buf_pool.stat.LRU_bytes + (UT_LIST_GET_LEN(buf_pool.unzip_LRU) @@ -1503,7 +1503,8 @@ inline void purge_coordinator_state::do_purge() ulint n_pages_handled= trx_purge(n_threads, history_size); if (!trx_sys.history_exists()) goto no_history; - if (purge_sys.truncate.current || srv_shutdown_state != SRV_SHUTDOWN_NONE) + if (purge_sys.truncating_tablespace() || + srv_shutdown_state != SRV_SHUTDOWN_NONE) { purge_truncation_task.wait(); trx_purge_truncate_history(); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index ef5bcb67..738e0a7e 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -468,7 +468,7 @@ ATTRIBUTE_COLD static dberr_t srv_undo_tablespaces_reinit() rseg->init(nullptr, FIL_NULL); } - if (trx_sys.recovered_binlog_lsn + if (*trx_sys.recovered_binlog_filename #ifdef WITH_WSREP || !trx_sys.recovered_wsrep_xid.is_null() #endif /* WITH_WSREP */ @@ -476,7 +476,7 @@ ATTRIBUTE_COLD static dberr_t srv_undo_tablespaces_reinit() { /* Update binlog offset, binlog file name & wsrep xid in system tablespace rollback segment */ - if (trx_sys.recovered_binlog_lsn) + if (*trx_sys.recovered_binlog_filename) { ut_d(const size_t len = strlen(trx_sys.recovered_binlog_filename) + 1); ut_ad(len > 1); @@ 
-1122,10 +1122,14 @@ dberr_t srv_start(bool create_new_db) if (srv_force_recovery) { ib::info() << "!!! innodb_force_recovery is set to " << srv_force_recovery << " !!!"; + if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) { + srv_read_only_mode = true; + } } - if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) { - srv_read_only_mode = true; + if (srv_read_only_mode) { + sql_print_information("InnoDB: Started in read only mode"); + srv_use_doublewrite_buf = false; } high_level_read_only = srv_read_only_mode @@ -1302,6 +1306,10 @@ dberr_t srv_start(bool create_new_db) ut_ad(buf_page_cleaner_is_active); } + if (innodb_encrypt_temporary_tables && !log_crypt_init()) { + return srv_init_abort(DB_ERROR); + } + /* Check if undo tablespaces and redo log files exist before creating a new system tablespace */ if (create_new_db) { @@ -1310,6 +1318,11 @@ dberr_t srv_start(bool create_new_db) return(srv_init_abort(DB_ERROR)); } recv_sys.debug_free(); + } else { + err = recv_recovery_read_checkpoint(); + if (err != DB_SUCCESS) { + return srv_init_abort(err); + } } /* Open or create the data files. 
*/ @@ -1334,12 +1347,9 @@ dberr_t srv_start(bool create_new_db) " old data files which contain your precious data!"; /* fall through */ default: - /* Other errors might come from Datafile::validate_first_page() */ - return(srv_init_abort(err)); - } - - if (innodb_encrypt_temporary_tables && !log_crypt_init()) { - return srv_init_abort(DB_ERROR); + /* Other errors might be flagged by + Datafile::validate_first_page() */ + return srv_init_abort(err); } if (create_new_db) { @@ -1355,10 +1365,10 @@ dberr_t srv_start(bool create_new_db) return srv_init_abort(err); } - srv_undo_space_id_start= 1; + srv_undo_space_id_start = 1; } - /* Open log file and data files in the systemtablespace: we keep + /* Open data files in the system tablespace: we keep them open until database shutdown */ ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug); @@ -1771,21 +1781,13 @@ dberr_t srv_start(bool create_new_db) } if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { - /* The following call is necessary for the insert + /* The following call is necessary for the change buffer to work with multiple tablespaces. We must know the mapping between space id's and .ibd file names. - In a crash recovery, we check that the info in data - dictionary is consistent with what we already know - about space id's from the calls to fil_ibd_load(). - - In a normal startup, we create the space objects for - every table in the InnoDB data dictionary that has - an .ibd file. - We also determine the maximum tablespace id used. */ - dict_check_tablespaces_and_store_max_id(); + dict_check_tablespaces_and_store_max_id(nullptr); } if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO @@ -1933,7 +1935,7 @@ void innodb_preshutdown() better prevent any further changes from being buffered. 
*/ innodb_change_buffering= 0; - if (trx_sys.is_initialised()) + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO && srv_was_started) while (trx_sys.any_active_transactions()) std::this_thread::sleep_for(std::chrono::milliseconds(1)); } diff --git a/storage/innobase/sync/srw_lock.cc b/storage/innobase/sync/srw_lock.cc index e41451d8..5afb79f2 100644 --- a/storage/innobase/sync/srw_lock.cc +++ b/storage/innobase/sync/srw_lock.cc @@ -143,8 +143,7 @@ static inline void srw_pause(unsigned delay) HMT_medium(); } -#ifdef SUX_LOCK_GENERIC -# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +#ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP template<> void pthread_mutex_wrapper<true>::wr_wait() { const unsigned delay= srw_pause_delay(); @@ -158,8 +157,9 @@ template<> void pthread_mutex_wrapper<true>::wr_wait() pthread_mutex_lock(&lock); } -# endif +#endif +#ifdef SUX_LOCK_GENERIC template void ssux_lock_impl<false>::init(); template void ssux_lock_impl<true>::init(); template void ssux_lock_impl<false>::destroy(); diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 1f31ceda..cff16d9c 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -41,6 +41,7 @@ Created 3/26/1996 Heikki Tuuri #include "dict0load.h" #include <mysql/service_thd_mdl.h> #include <mysql/service_wsrep.h> +#include "log.h" /** Maximum allowable purge history length. <=0 means 'infinite'. 
*/ ulong srv_max_purge_lag = 0; @@ -168,10 +169,15 @@ void purge_sys_t::create() ut_ad(this == &purge_sys); ut_ad(!m_initialized); ut_ad(!enabled()); + ut_ad(!m_active); + /* If innodb_undo_tablespaces>0, the rollback segment 0 + (which always resides in the system tablespace) will + never be used; @see trx_assign_rseg_low() */ + skipped_rseg= srv_undo_tablespaces > 0; m_paused= 0; query= purge_graph_build(); next_stored= false; - rseg= NULL; + rseg= nullptr; page_no= 0; offset= 0; hdr_page_no= 0; @@ -179,8 +185,8 @@ void purge_sys_t::create() latch.SRW_LOCK_INIT(trx_purge_latch_key); end_latch.init(); mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr); - truncate.current= NULL; - truncate.last= NULL; + truncate_undo_space.current= nullptr; + truncate_undo_space.last= 0; m_initialized= true; } @@ -350,14 +356,21 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) } /** Free an undo log segment. -@param block rollback segment header page +@param rseg_hdr rollback segment header page +@param block undo segment header page @param mtr mini-transaction */ -static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr) +static void trx_purge_free_segment(buf_block_t *rseg_hdr, buf_block_t *block, + mtr_t &mtr) { + ut_ad(mtr.memo_contains_flagged(rseg_hdr, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr.memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + block->page.frame, &mtr)) { + rseg_hdr->fix(); block->fix(); + ut_d(const page_id_t rseg_hdr_id{rseg_hdr->page.id()}); ut_d(const page_id_t id{block->page.id()}); mtr.commit(); /* NOTE: If the server is killed after the log that was produced @@ -368,26 +381,62 @@ static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr) This does not matter when using multiple innodb_undo_tablespaces; innodb_undo_log_truncate=ON will be able to reclaim the space. 
*/ mtr.start(); + rseg_hdr->page.lock.x_lock(); + ut_ad(rseg_hdr->page.id() == rseg_hdr_id); block->page.lock.x_lock(); ut_ad(block->page.id() == id); - mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY); + mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX); + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + block->page.frame, &mtr)); } +void purge_sys_t::rseg_enable(trx_rseg_t &rseg) +{ + ut_ad(this == &purge_sys); +#ifndef SUX_LOCK_GENERIC + ut_ad(rseg.latch.is_write_locked()); +#endif + uint8_t skipped= skipped_rseg; + ut_ad(skipped < TRX_SYS_N_RSEGS); + if (&rseg == &trx_sys.rseg_array[skipped]) + { + /* If this rollback segment is subject to innodb_undo_log_truncate=ON, + we must not clear the flag. But we will advance purge_sys.skipped_rseg + to be able to choose another candidate for this soft truncation, and + to prevent the following scenario: + + (1) purge_sys_t::iterator::free_history_rseg() had invoked + rseg.set_skip_allocation() + (2) undo log truncation had completed on this rollback segment + (3) SET GLOBAL innodb_undo_log_truncate=OFF + (4) purge_sys_t::iterator::free_history_rseg() would not be able to + invoke rseg.set_skip_allocation() on any other rollback segment + before this rseg has grown enough */ + if (truncate_undo_space.current != rseg.space) + rseg.clear_skip_allocation(); + skipped++; + /* If innodb_undo_tablespaces>0, the rollback segment 0 + (which always resides in the system tablespace) will + never be used; @see trx_assign_rseg_low() */ + if (!(skipped&= (TRX_SYS_N_RSEGS - 1)) && srv_undo_tablespaces) + skipped++; + skipped_rseg= skipped; + } +} + /** Remove unnecessary history data from a rollback segment. 
@param rseg rollback segment @param limit truncate anything before this -@param all whether everything can be truncated @return error code */ -static dberr_t -trx_purge_truncate_rseg_history(trx_rseg_t &rseg, - const purge_sys_t::iterator &limit, bool all) +inline dberr_t purge_sys_t::iterator::free_history_rseg(trx_rseg_t &rseg) const { fil_addr_t hdr_addr; mtr_t mtr; + bool freed= false; + uint32_t rseg_ref= 0; mtr.start(); @@ -397,6 +446,8 @@ trx_purge_truncate_rseg_history(trx_rseg_t &rseg, { func_exit: mtr.commit(); + if (freed && (rseg.SKIP & rseg_ref)) + purge_sys.rseg_enable(rseg); return err; } @@ -418,16 +469,40 @@ loop: const trx_id_t undo_trx_no= mach_read_from_8(b->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO); - if (undo_trx_no >= limit.trx_no) + if (undo_trx_no >= trx_no) { - if (undo_trx_no == limit.trx_no) - err = trx_undo_truncate_start(&rseg, hdr_addr.page, - hdr_addr.boffset, limit.undo_no); + if (undo_trx_no == trx_no) + err= trx_undo_truncate_start(&rseg, hdr_addr.page, + hdr_addr.boffset, undo_no); goto func_exit; } - - if (!all) - goto func_exit; + else + { + rseg_ref= rseg.ref_load(); + if (rseg_ref >= rseg.REF || !purge_sys.sees(rseg.needs_purge)) + { + /* We cannot clear this entire rseg because trx_assign_rseg_low() + has already chosen it for a future trx_undo_assign(), or + because some recently started transaction needs purging. + + If this invocation could not reduce rseg.history_size at all + (!freed), we will try to ensure progress and prevent our + starvation by disabling one rollback segment for future + trx_assign_rseg_low() invocations until a future invocation has + made progress and invoked purge_sys_t::rseg_enable(rseg) on that + rollback segment. */ + + if (!(rseg.SKIP & rseg_ref) && !freed && + ut_d(!trx_rseg_n_slots_debug &&) + &rseg == &trx_sys.rseg_array[purge_sys.skipped_rseg]) + /* If rseg.space == purge_sys.truncate_undo_space.current + the following will be a no-op. 
A possible conflict + with innodb_undo_log_truncate=ON will be handled in + purge_sys_t::rseg_enable(). */ + rseg.set_skip_allocation(); + goto func_exit; + } + } fil_addr_t prev_hdr_addr= flst_get_prev_addr(b->page.frame + hdr_addr.boffset + @@ -459,7 +534,7 @@ loop: free_segment: ut_ad(rseg.curr_size >= seg_size); rseg.curr_size-= seg_size; - trx_purge_free_segment(b, mtr); + trx_purge_free_segment(rseg_hdr, b, mtr); break; case TRX_UNDO_CACHED: /* rseg.undo_cached must point to this page */ @@ -490,10 +565,11 @@ loop: mtr.commit(); ut_ad(rseg.history_size > 0); rseg.history_size--; + freed= true; mtr.start(); rseg_hdr->page.lock.x_lock(); ut_ad(rseg_hdr->page.id() == rseg.page_id()); - mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_MODIFY); + mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX); goto loop; } @@ -544,9 +620,7 @@ dberr_t purge_sys_t::iterator::free_history() const ut_ad(rseg.is_persistent()); log_free_check(); rseg.latch.wr_lock(SRW_LOCK_CALL); - dberr_t err= - trx_purge_truncate_rseg_history(rseg, *this, !rseg.is_referenced() && - purge_sys.sees(rseg.needs_purge)); + dberr_t err= free_history_rseg(rseg); rseg.latch.wr_unlock(); if (err) return err; @@ -554,6 +628,62 @@ dberr_t purge_sys_t::iterator::free_history() const return DB_SUCCESS; } +inline void trx_sys_t::undo_truncate_start(fil_space_t &space) +{ + ut_ad(this == &trx_sys); + /* Undo tablespace always are a single file. */ + ut_a(UT_LIST_GET_LEN(space.chain) == 1); + fil_node_t *file= UT_LIST_GET_FIRST(space.chain); + /* The undo tablespace files are never closed. */ + ut_ad(file->is_open()); + sql_print_information("InnoDB: Starting to truncate %s", file->name); + + for (auto &rseg : rseg_array) + if (rseg.space == &space) + { + /* Prevent a race with purge_sys_t::iterator::free_history_rseg() */ + rseg.latch.rd_lock(SRW_LOCK_CALL); + /* Once set, this rseg will not be allocated to subsequent + transactions, but we will wait for existing active + transactions to finish. 
*/ + rseg.set_skip_allocation(); + rseg.latch.rd_unlock(); + } +} + +inline fil_space_t *purge_sys_t::undo_truncate_try(uint32_t id, uint32_t size) +{ + ut_ad(srv_is_undo_tablespace(id)); + fil_space_t *space= fil_space_get(id); + if (space && space->get_size() > size) + { + truncate_undo_space.current= space; + trx_sys.undo_truncate_start(*space); + return space; + } + return nullptr; +} + +fil_space_t *purge_sys_t::truncating_tablespace() +{ + ut_ad(this == &purge_sys); + + fil_space_t *space= truncate_undo_space.current; + if (space || srv_undo_tablespaces_active < 2 || !srv_undo_log_truncate) + return space; + + const uint32_t size= uint32_t(srv_max_undo_log_size >> srv_page_size_shift); + for (uint32_t i= truncate_undo_space.last, j= i;; ) + { + if (fil_space_t *s= undo_truncate_try(srv_undo_space_id_start + i, size)) + return s; + ++i; + i%= srv_undo_tablespaces_active; + if (i == j) + return nullptr; + } +} + #if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__ # if defined __arm__ || defined __aarch64__ /* Work around an internal compiler error in GCC 4.8.5 */ @@ -579,55 +709,14 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history() head.undo_no= 0; } - if (head.free_history() != DB_SUCCESS || srv_undo_tablespaces_active < 2) + if (head.free_history() != DB_SUCCESS) return; - while (srv_undo_log_truncate) + while (fil_space_t *space= purge_sys.truncating_tablespace()) { - if (!purge_sys.truncate.current) - { - const ulint threshold= - ulint(srv_max_undo_log_size >> srv_page_size_shift); - for (uint32_t i= purge_sys.truncate.last - ? 
purge_sys.truncate.last->id - srv_undo_space_id_start : 0, - j= i;; ) - { - const uint32_t space_id= srv_undo_space_id_start + i; - ut_ad(srv_is_undo_tablespace(space_id)); - fil_space_t *space= fil_space_get(space_id); - ut_a(UT_LIST_GET_LEN(space->chain) == 1); - - if (space && space->get_size() > threshold) - { - purge_sys.truncate.current= space; - break; - } - - ++i; - i %= srv_undo_tablespaces_active; - if (i == j) - return; - } - } - - fil_space_t &space= *purge_sys.truncate.current; - /* Undo tablespace always are a single file. */ - fil_node_t *file= UT_LIST_GET_FIRST(space.chain); - /* The undo tablespace files are never closed. */ - ut_ad(file->is_open()); - - DBUG_LOG("undo", "marking for truncate: " << file->name); - - for (auto &rseg : trx_sys.rseg_array) - if (rseg.space == &space) - /* Once set, this rseg will not be allocated to subsequent - transactions, but we will wait for existing active - transactions to finish. */ - rseg.set_skip_allocation(); - for (auto &rseg : trx_sys.rseg_array) { - if (rseg.space != &space) + if (rseg.space != space) continue; rseg.latch.rd_lock(SRW_LOCK_CALL); @@ -660,15 +749,9 @@ not_free: rseg.latch.rd_unlock(); } - ib::info() << "Truncating " << file->name; - trx_purge_cleanse_purge_queue(space); - - log_free_check(); - - mtr_t mtr; - mtr.start(); - mtr.x_lock_space(&space); - const auto space_id= space.id; + const char *file_name= UT_LIST_GET_FIRST(space->chain)->name; + sql_print_information("InnoDB: Truncating %s", file_name); + trx_purge_cleanse_purge_queue(*space); /* Lock all modified pages of the tablespace. @@ -678,104 +761,41 @@ not_free: mini-transaction commit and the server was killed, then discarding the to-be-trimmed pages without flushing would break crash recovery. 
*/ - rescan: - mysql_mutex_lock(&buf_pool.flush_list_mutex); - for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) - { - ut_ad(bpage->oldest_modification()); - ut_ad(bpage->in_file()); - - buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - - if (bpage->oldest_modification() > 2 && bpage->id().space() == space_id) - { - ut_ad(bpage->frame); - bpage->fix(); - { - /* Try to acquire an exclusive latch while the cache line is - fresh after fix(). */ - const bool got_lock{bpage->lock.x_lock_try()}; - buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (!got_lock) - bpage->lock.x_lock(); - } - -#ifdef BTR_CUR_HASH_ADAPT - /* There is no AHI on undo tablespaces. */ - ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index); -#endif - ut_ad(!bpage->is_io_fixed()); - ut_ad(bpage->id().space() == space_id); - - if (bpage->oldest_modification() > 2) - { - mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage), - MTR_MEMO_PAGE_X_FIX); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - ut_ad(bpage->oldest_modification() > 2); - bpage->reset_oldest_modification(); - } - else - { - bpage->unfix(); - bpage->lock.x_unlock(); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - } - - if (prev != buf_pool.flush_hp.get()) - { - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - goto rescan; - } - } - bpage= prev; - } - - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - - /* Re-initialize tablespace, in a single mini-transaction. */ - const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; + if (UNIV_UNLIKELY(srv_shutdown_state != SRV_SHUTDOWN_NONE) && + srv_fast_shutdown) + return; /* Adjust the tablespace metadata. 
*/ mysql_mutex_lock(&fil_system.mutex); - space.set_stopping(); - space.is_being_truncated= true; - if (space.crypt_data) + if (space->crypt_data) { - space.reacquire(); + space->reacquire(); mysql_mutex_unlock(&fil_system.mutex); - fil_space_crypt_close_tablespace(&space); - space.release(); + fil_space_crypt_close_tablespace(space); + space->release(); } else mysql_mutex_unlock(&fil_system.mutex); - for (auto i= 6000; space.referenced(); - std::this_thread::sleep_for(std::chrono::milliseconds(10))) - { - if (!--i) - { - mtr.commit(); - ib::error() << "Failed to freeze UNDO tablespace " << file->name; - return; - } - } + /* Re-initialize tablespace, in a single mini-transaction. */ + const uint32_t size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; + + log_free_check(); + mtr_t mtr; + mtr.start(); + mtr.x_lock_space(space); /* Associate the undo tablespace with mtr. During mtr::commit_shrink(), InnoDB can use the undo tablespace object to clear all freed ranges */ - mtr.set_named_space(&space); - mtr.trim_pages(page_id_t(space.id, size)); - ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS); - mysql_mutex_lock(&fil_system.mutex); - space.size= file->size= size; - mysql_mutex_unlock(&fil_system.mutex); + mtr.set_named_space(space); + mtr.trim_pages(page_id_t(space->id, size)); + ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS); for (auto &rseg : trx_sys.rseg_array) { - if (rseg.space != &space) + if (rseg.space != space) continue; ut_ad(!rseg.is_referenced()); @@ -784,7 +804,7 @@ not_free: possibly before this server had been started up. */ dberr_t err; - buf_block_t *rblock= trx_rseg_header_create(&space, + buf_block_t *rblock= trx_rseg_header_create(space, &rseg - trx_sys.rseg_array, trx_sys.get_max_trx_id(), &mtr, &err); @@ -797,7 +817,7 @@ not_free: rseg.reinit(rblock->page.id().page_no()); } - mtr.commit_shrink(space); + mtr.commit_shrink(*space, size); /* No mutex; this is only updated by the purge coordinator. 
*/ export_vars.innodb_undo_truncations++; @@ -814,14 +834,15 @@ not_free: purge_sys.next_stored= false; } - DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc"; + DBUG_EXECUTE_IF("ib_undo_trunc", + sql_print_information("InnoDB: ib_undo_trunc"); log_buffer_flush_to_disk(); DBUG_SUICIDE();); - ib::info() << "Truncated " << file->name; - purge_sys.truncate.last= purge_sys.truncate.current; - ut_ad(&space == purge_sys.truncate.current); - purge_sys.truncate.current= nullptr; + sql_print_information("InnoDB: Truncated %s", file_name); + ut_ad(space == purge_sys.truncate_undo_space.current); + purge_sys.truncate_undo_space.current= nullptr; + purge_sys.truncate_undo_space.last= space->id - srv_undo_space_id_start; } } @@ -853,7 +874,9 @@ void purge_sys_t::rseg_get_next_history_log() { fil_addr_t prev_log_addr; +#ifndef SUX_LOCK_GENERIC ut_ad(rseg->latch.is_write_locked()); +#endif ut_a(rseg->last_page_no != FIL_NULL); tail.trx_no= rseg->last_trx_no() + 1; @@ -969,7 +992,9 @@ inline trx_purge_rec_t purge_sys_t::get_next_rec(roll_ptr_t roll_ptr) { ut_ad(next_stored); ut_ad(tail.trx_no < low_limit_no()); +#ifndef SUX_LOCK_GENERIC ut_ad(rseg->latch.is_write_locked()); +#endif if (!offset) { diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index b381c9de..2923dc64 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2069,9 +2069,10 @@ trx_undo_get_undo_rec_low( mtr.start(); trx_undo_rec_t *undo_rec= nullptr; - if (const buf_block_t* undo_page= + if (buf_block_t* undo_page= buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr)) { + buf_page_make_young_if_needed(&undo_page->page); undo_rec= undo_page->page.frame + offset; const size_t end= mach_read_from_2(undo_rec); if (UNIV_UNLIKELY(end <= offset || diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 8d1a381c..87a2ac7b 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ 
b/storage/innobase/trx/trx0rseg.cc @@ -296,8 +296,13 @@ buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const if (err) *err= DB_TABLESPACE_NOT_FOUND; return nullptr; } - return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr, - BUF_GET, mtr, err); + + buf_block_t *block= buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr, + BUF_GET, mtr, err); + if (UNIV_LIKELY(block != nullptr)) + buf_page_make_young_if_needed(&block->page); + + return block; } /** Upgrade a rollback segment header page to MariaDB 10.3 format. @@ -462,20 +467,32 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr) TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame; if (*binlog_name) { - lsn_t lsn= mach_read_from_8(my_assume_aligned<8> - (FIL_PAGE_LSN + rseg_hdr->page.frame)); static_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof trx_sys.recovered_binlog_filename, "compatibility"); - if (lsn > trx_sys.recovered_binlog_lsn) - { - trx_sys.recovered_binlog_lsn= lsn; - trx_sys.recovered_binlog_offset= + + /* Always prefer a position from rollback segment over + a legacy position from before version 10.3.5. */ + int cmp= *trx_sys.recovered_binlog_filename && + !trx_sys.recovered_binlog_is_legacy_pos + ? 
strncmp(reinterpret_cast<const char*>(binlog_name), + trx_sys.recovered_binlog_filename, + TRX_RSEG_BINLOG_NAME_LEN) + : 1; + + if (cmp >= 0) { + uint64_t binlog_offset = mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET + rseg_hdr->page.frame); - memcpy(trx_sys.recovered_binlog_filename, binlog_name, - TRX_RSEG_BINLOG_NAME_LEN); + if (cmp) + { + memcpy(trx_sys.recovered_binlog_filename, binlog_name, + TRX_RSEG_BINLOG_NAME_LEN); + trx_sys.recovered_binlog_offset= binlog_offset; + } + else if (binlog_offset > trx_sys.recovered_binlog_offset) + trx_sys.recovered_binlog_offset= binlog_offset; + trx_sys.recovered_binlog_is_legacy_pos= false; } - #ifdef WITH_WSREP trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid); #endif @@ -548,6 +565,7 @@ static void trx_rseg_init_binlog_info(const page_t* page) trx_sys.recovered_binlog_offset = mach_read_from_8( TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET + TRX_SYS + page); + trx_sys.recovered_binlog_is_legacy_pos= true; } #ifdef WITH_WSREP @@ -562,6 +580,7 @@ dberr_t trx_rseg_array_init() *trx_sys.recovered_binlog_filename = '\0'; trx_sys.recovered_binlog_offset = 0; + trx_sys.recovered_binlog_is_legacy_pos= false; #ifdef WITH_WSREP trx_sys.recovered_wsrep_xid.null(); XID wsrep_sys_xid; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index e5e2ef9e..942b8bd4 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -582,6 +582,7 @@ static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo) undo.top_page_no), 0, RW_S_LATCH, nullptr, BUF_GET, &mtr, &err)) { + buf_page_make_young_if_needed(&block->page); buf_block_t *undo_block= block; const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset; @@ -980,7 +981,13 @@ void trx_t::commit_empty(mtr_t *mtr) trx_undo_t *&undo= rsegs.m_redo.undo; ut_ad(undo->state == TRX_UNDO_ACTIVE || undo->state == TRX_UNDO_PREPARED); - ut_ad(undo->size == 1); + + if (UNIV_UNLIKELY(undo->size != 
1)) + { + sql_print_error("InnoDB: Undo log for transaction " TRX_ID_FMT + " is corrupted (" UINT32PF "!=1)", id, undo->size); + ut_ad("corrupted undo log" == 0); + } if (buf_block_t *u= buf_page_get(page_id_t(rseg->space->id, undo->hdr_page_no), 0, @@ -1504,6 +1511,7 @@ void trx_t::commit_cleanup() mutex.wr_lock(); state= TRX_STATE_NOT_STARTED; + *detailed_error= '\0'; mod_tables.clear(); check_foreigns= true; diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 203edd9f..ccc68dfe 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -25,8 +25,8 @@ Created 3/26/1996 Heikki Tuuri *******************************************************/ #include "trx0undo.h" +#include "buf0rea.h" #include "fsp0fsp.h" -#include "mach0data.h" #include "mtr0log.h" #include "srv0mon.h" #include "srv0srv.h" @@ -178,8 +178,12 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec, block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no), 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr); + if (UNIV_UNLIKELY(!block)) + return nullptr; - return block ? trx_undo_page_get_last_rec(block, page_no, offset) : nullptr; + if (!buf_page_make_young_if_needed(&block->page)) + buf_read_ahead_linear(block->page.id(), 0, false); + return trx_undo_page_get_last_rec(block, page_no, offset); } /** Get the previous undo log record. 
@@ -268,12 +272,16 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no, uint16_t offset, ulint mode, const buf_block_t*& block, mtr_t *mtr, dberr_t *err) { - block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode, - nullptr, BUF_GET, mtr, err); + buf_block_t *b= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode, + nullptr, BUF_GET, mtr, err); + block= b; if (!block) return nullptr; - if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset)) + if (!buf_page_make_young_if_needed(&b->page)) + buf_read_ahead_linear(b->page.id(), 0, false); + + if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset)) return rec; return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode, @@ -663,6 +671,8 @@ buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err) 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); if (!header_block) goto func_exit; + buf_page_make_young_if_needed(&header_block->page); + *err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr); if (UNIV_UNLIKELY(*err != DB_SUCCESS)) @@ -732,6 +742,8 @@ trx_undo_free_page( return FIL_NULL; } + buf_page_make_young_if_needed(&header_block->page); + *err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); @@ -740,6 +752,14 @@ trx_undo_free_page( return FIL_NULL; } + const fil_addr_t last_addr = flst_get_last( + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + + header_block->page.frame); + if (UNIV_UNLIKELY(last_addr.page == page_no)) { + *err = DB_CORRUPTION; + return FIL_NULL; + } + *err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + header_block->page.frame, rseg->space, page_no, mtr); @@ -748,9 +768,6 @@ trx_undo_free_page( } buf_page_free(rseg->space, page_no, mtr); - const fil_addr_t last_addr = flst_get_last( - TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST - + header_block->page.frame); rseg->curr_size--; if (!in_history) { @@ -794,6 +811,9 @@ 
static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit, { ut_ad(is_temp == !undo.rseg->is_persistent()); + if (UNIV_UNLIKELY(undo.last_page_no == FIL_NULL)) + return DB_CORRUPTION; + for (mtr_t mtr;;) { mtr.start(); @@ -887,15 +907,13 @@ trx_undo_truncate_start( trx_undo_rec_t* last_rec; mtr_t mtr; + ut_ad(rseg->is_persistent()); + if (!limit) { return DB_SUCCESS; } loop: - mtr_start(&mtr); - - if (!rseg->is_persistent()) { - mtr.set_log_mode(MTR_LOG_NO_REDO); - } + mtr.start(); dberr_t err; const buf_block_t* undo_page; @@ -1263,6 +1281,8 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, return NULL; } + buf_page_make_young_if_needed(&block->page); + UT_LIST_REMOVE(rseg->undo_cached, undo); *pundo = undo; @@ -1297,19 +1317,24 @@ trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr) ut_ad(mtr->get_log_mode() == MTR_LOG_ALL); trx_undo_t* undo = trx->rsegs.m_redo.undo; + buf_block_t* block; if (undo) { - return buf_page_get_gen( + block = buf_page_get_gen( page_id_t(undo->rseg->space->id, undo->last_page_no), 0, RW_X_LATCH, undo->guess_block, BUF_GET, mtr, err); + if (UNIV_LIKELY(block != nullptr)) { + buf_page_make_young_if_needed(&block->page); + } + return block; } *err = DB_SUCCESS; trx_rseg_t* rseg = trx->rsegs.m_redo.rseg; rseg->latch.wr_lock(SRW_LOCK_CALL); - buf_block_t* block = trx_undo_reuse_cached( + block = trx_undo_reuse_cached( trx, rseg, &trx->rsegs.m_redo.undo, mtr, err); if (!block) { @@ -1350,12 +1375,17 @@ trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo, : &trx->rsegs.m_redo.undo)); ut_ad(mtr->get_log_mode() == (is_temp ? 
MTR_LOG_NO_REDO : MTR_LOG_ALL)); + buf_block_t* block; if (*undo) { - return buf_page_get_gen( + block = buf_page_get_gen( page_id_t(rseg->space->id, (*undo)->last_page_no), 0, RW_X_LATCH, (*undo)->guess_block, BUF_GET, mtr, err); + if (UNIV_LIKELY(block != nullptr)) { + buf_page_make_young_if_needed(&block->page); + } + return block; } DBUG_EXECUTE_IF( @@ -1365,7 +1395,6 @@ trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo, *err = DB_SUCCESS; rseg->latch.wr_lock(SRW_LOCK_CALL); - buf_block_t* block; if (is_temp) { ut_ad(!UT_LIST_GET_LEN(rseg->undo_cached)); } else { |