summaryrefslogtreecommitdiffstats
path: root/storage/innobase
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:33:02 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:33:02 +0000
commit4fa488fb0159c629483b7994aa84e73926b132b9 (patch)
tree182a19db69cdcb92be54cc6a5b0b9bfab28f80fd /storage/innobase
parentAdding debian version 1:10.11.6-2. (diff)
downloadmariadb-4fa488fb0159c629483b7994aa84e73926b132b9.tar.xz
mariadb-4fa488fb0159c629483b7994aa84e73926b132b9.zip
Merging upstream version 1:10.11.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase')
-rw-r--r--storage/innobase/btr/btr0btr.cc36
-rw-r--r--storage/innobase/btr/btr0bulk.cc24
-rw-r--r--storage/innobase/btr/btr0cur.cc77
-rw-r--r--storage/innobase/btr/btr0pcur.cc17
-rw-r--r--storage/innobase/btr/btr0sea.cc1
-rw-r--r--storage/innobase/buf/buf0buf.cc360
-rw-r--r--storage/innobase/buf/buf0dblwr.cc11
-rw-r--r--storage/innobase/buf/buf0dump.cc22
-rw-r--r--storage/innobase/buf/buf0flu.cc144
-rw-r--r--storage/innobase/buf/buf0lru.cc39
-rw-r--r--storage/innobase/buf/buf0rea.cc6
-rw-r--r--storage/innobase/dict/dict0boot.cc5
-rw-r--r--storage/innobase/dict/dict0crea.cc3
-rw-r--r--storage/innobase/dict/dict0dict.cc22
-rw-r--r--storage/innobase/dict/dict0load.cc82
-rw-r--r--storage/innobase/dict/dict0stats.cc11
-rw-r--r--storage/innobase/dict/dict0stats_bg.cc52
-rw-r--r--storage/innobase/fil/fil0fil.cc55
-rw-r--r--storage/innobase/fil/fil0pagecompress.cc5
-rw-r--r--storage/innobase/fsp/fsp0file.cc26
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc57
-rw-r--r--storage/innobase/fsp/fsp0sysspace.cc64
-rw-r--r--storage/innobase/gis/gis0sea.cc6
-rw-r--r--storage/innobase/handler/ha_innodb.cc84
-rw-r--r--storage/innobase/handler/handler0alter.cc40
-rw-r--r--storage/innobase/handler/i_s.cc127
-rw-r--r--storage/innobase/ibuf/ibuf0ibuf.cc17
-rw-r--r--storage/innobase/include/btr0btr.h4
-rw-r--r--storage/innobase/include/buf0buf.h35
-rw-r--r--storage/innobase/include/buf0dblwr.h3
-rw-r--r--storage/innobase/include/buf0lru.h10
-rw-r--r--storage/innobase/include/dict0load.h16
-rw-r--r--storage/innobase/include/dict0stats.h9
-rw-r--r--storage/innobase/include/fil0fil.h23
-rw-r--r--storage/innobase/include/fts0priv.inl23
-rw-r--r--storage/innobase/include/log0log.h71
-rw-r--r--storage/innobase/include/log0recv.h19
-rw-r--r--storage/innobase/include/mtr0mtr.h7
-rw-r--r--storage/innobase/include/os0file.h10
-rw-r--r--storage/innobase/include/srv0srv.h5
-rw-r--r--storage/innobase/include/srw_lock.h4
-rw-r--r--storage/innobase/include/trx0purge.h51
-rw-r--r--storage/innobase/include/trx0rseg.h5
-rw-r--r--storage/innobase/include/trx0sys.h9
-rw-r--r--storage/innobase/include/trx0trx.h1
-rw-r--r--storage/innobase/log/log0log.cc56
-rw-r--r--storage/innobase/log/log0recv.cc162
-rw-r--r--storage/innobase/mtr/mtr0mtr.cc131
-rw-r--r--storage/innobase/os/os0file.cc91
-rw-r--r--storage/innobase/pars/pars0pars.cc3
-rw-r--r--storage/innobase/row/row0ftsort.cc3
-rw-r--r--storage/innobase/row/row0import.cc13
-rw-r--r--storage/innobase/row/row0ins.cc21
-rw-r--r--storage/innobase/row/row0merge.cc36
-rw-r--r--storage/innobase/row/row0purge.cc1
-rw-r--r--storage/innobase/row/row0sel.cc1
-rw-r--r--storage/innobase/row/row0undo.cc2
-rw-r--r--storage/innobase/row/row0upd.cc25
-rw-r--r--storage/innobase/srv/srv0srv.cc9
-rw-r--r--storage/innobase/srv/srv0start.cc48
-rw-r--r--storage/innobase/sync/srw_lock.cc6
-rw-r--r--storage/innobase/trx/trx0purge.cc351
-rw-r--r--storage/innobase/trx/trx0rec.cc3
-rw-r--r--storage/innobase/trx/trx0rseg.cc41
-rw-r--r--storage/innobase/trx/trx0trx.cc10
-rw-r--r--storage/innobase/trx/trx0undo.cc63
66 files changed, 1769 insertions, 1005 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 08be1991..705ff035 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -216,10 +216,11 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
+@param[out] first set if this is a first-time access to the page
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, rw_lock_type_t mode, bool merge,
- mtr_t *mtr, dberr_t *err)
+ mtr_t *mtr, dberr_t *err, bool *first)
{
ut_ad(mode != RW_NO_LATCH);
dberr_t local_err;
@@ -242,6 +243,8 @@ buf_block_t *btr_block_get(const dict_index_t &index,
*err= DB_PAGE_CORRUPTED;
block= nullptr;
}
+ else if (!buf_page_make_young_if_needed(&block->page) && first)
+ *first= true;
}
else if (*err == DB_DECRYPTION_FAILED)
btr_decryption_failed(index);
@@ -302,6 +305,8 @@ btr_root_block_get(
*err= DB_CORRUPTION;
block= nullptr;
}
+ else
+ buf_page_make_young_if_needed(&block->page);
}
else if (*err == DB_DECRYPTION_FAILED)
btr_decryption_failed(*index);
@@ -553,8 +558,11 @@ btr_page_alloc_for_ibuf(
root->page.frame)),
0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
if (new_block)
+ {
+ buf_page_make_young_if_needed(&new_block->page);
*err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+ }
ut_d(if (*err == DB_SUCCESS)
flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
return new_block;
@@ -873,7 +881,8 @@ static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap,
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
-@return rec_get_offsets() of the node pointer record */
+@return rec_get_offsets() of the node pointer record
+@retval nullptr on corruption */
static
rec_offs*
btr_page_get_father_block(
@@ -1351,6 +1360,7 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page),
space->zip_size(), RW_SX_LATCH, &mtr))
{
+ buf_page_make_young_if_needed(&root->page);
mtr.set_named_space(space);
page_set_autoinc(root, autoinc, &mtr, reset);
}
@@ -2542,6 +2552,11 @@ btr_attach_half_pages(
offsets = btr_page_get_father_block(nullptr, heap, mtr,
&cursor);
+ if (UNIV_UNLIKELY(!offsets)) {
+ mem_heap_free(heap);
+ return DB_CORRUPTION;
+ }
+
/* Replace the address of the old child node (= page) with the
address of the new lower half */
@@ -3478,6 +3493,14 @@ btr_lift_page_up(
offsets = btr_page_get_father_block(offsets, heap,
mtr, &cursor);
}
+
+ if (UNIV_UNLIKELY(!offsets)) {
+parent_corrupted:
+ mem_heap_free(heap);
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
father_block = btr_cur_get_block(&cursor);
father_page_zip = buf_block_get_page_zip(father_block);
@@ -3502,6 +3525,10 @@ btr_lift_page_up(
&cursor);
}
+ if (UNIV_UNLIKELY(!offsets)) {
+ goto parent_corrupted;
+ }
+
blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
}
@@ -3717,6 +3744,11 @@ btr_compress(
NULL, heap, mtr, &father_cursor);
}
+ if (UNIV_UNLIKELY(!offsets)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
if (adjust) {
nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) {
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index 013cd131..5bf68c58 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -52,6 +52,7 @@ PageBulk::init()
if (m_page_no == FIL_NULL) {
mtr_t alloc_mtr;
+ dberr_t err= DB_SUCCESS;
/* We commit redo log for allocation by a separate mtr,
because we don't guarantee pages are committed following
@@ -60,28 +61,15 @@ PageBulk::init()
alloc_mtr.start();
m_index->set_modified(alloc_mtr);
- uint32_t n_reserved;
- dberr_t err = fsp_reserve_free_extents(
- &n_reserved, m_index->table->space, 1, FSP_NORMAL,
- &alloc_mtr);
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
-oom:
- alloc_mtr.commit();
- m_mtr.commit();
- return err;
- }
-
/* Allocate a new page. */
new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level,
&alloc_mtr, &m_mtr, &err);
+ alloc_mtr.commit();
if (!new_block) {
- goto oom;
+ m_mtr.commit();
+ return err;
}
- m_index->table->space->release_free_extents(n_reserved);
-
- alloc_mtr.commit();
-
new_page = buf_block_get_frame(new_block);
m_page_no = new_block->page.id().page_no();
@@ -969,10 +957,10 @@ BtrBulk::pageCommit(
/** Log free check */
inline void BtrBulk::logFreeCheck()
{
- if (log_sys.check_flush_or_checkpoint()) {
+ if (log_sys.check_for_checkpoint()) {
release();
- log_check_margins();
+ log_free_check();
latch();
}
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index e736f338..46afb73b 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -1156,6 +1156,19 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
mtr_s_lock_index(index(), mtr);
}
+ dberr_t err;
+
+ if (!index()->table->space)
+ {
+ corrupted:
+ ut_ad("corrupted" == 0); // FIXME: remove this
+ err= DB_CORRUPTION;
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
+
const ulint zip_size= index()->table->space->zip_size();
/* Start with the root page. */
@@ -1169,7 +1182,6 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
low_bytes= 0;
ulint buf_mode= BUF_GET;
search_loop:
- dberr_t err;
auto block_savepoint= mtr->get_savepoint();
buf_block_t *block=
buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr,
@@ -1181,10 +1193,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
btr_decryption_failed(*index());
/* fall through */
default:
- func_exit:
- if (UNIV_LIKELY_NULL(heap))
- mem_heap_free(heap);
- return err;
+ goto func_exit;
case DB_SUCCESS:
/* This must be a search to perform an insert, delete mark, or delete;
try using the change buffer */
@@ -1251,16 +1260,11 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
btr_page_get_index_id(block->page.frame) != index()->id ||
fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
!fil_page_index_page_check(block->page.frame))
- {
- corrupted:
- ut_ad("corrupted" == 0); // FIXME: remove this
- err= DB_CORRUPTION;
- goto func_exit;
- }
+ goto corrupted;
page_cur.block= block;
ut_ad(block == mtr->at_savepoint(block_savepoint));
- ut_ad(rw_latch != RW_NO_LATCH);
+ const bool not_first_access{buf_page_make_young_if_needed(&block->page)};
#ifdef UNIV_ZIP_DEBUG
if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
ut_a(page_zip_validate(page_zip, block->page.frame, index()));
@@ -1539,6 +1543,9 @@ release_tree:
case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+ if (!not_first_access)
+ buf_read_ahead_linear(page_id, zip_size, false);
+
if (page_has_prev(block->page.frame) &&
page_rec_is_first(page_cur.rec, block->page.frame))
{
@@ -1578,6 +1585,8 @@ release_tree:
buf_mode= btr_op == BTR_DELETE_OP
? BUF_GET_IF_IN_POOL_OR_WATCH
: BUF_GET_IF_IN_POOL;
+ else if (!not_first_access)
+ buf_read_ahead_linear(page_id, zip_size, false);
break;
case BTR_MODIFY_TREE:
ut_ad(rw_latch == RW_X_LATCH);
@@ -1611,6 +1620,14 @@ ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
slot.type= MTR_MEMO_X_LOCK;
}
+/** Mark a non-leaf page "least recently used", but avoid invoking
+buf_page_t::set_accessed(), because we do not want linear read-ahead.
+NOTE(review): the code calls buf_page_make_young(), which by its name
+presumably moves the page towards the most-recently-used end of the LRU
+list; the wording "least recently used" above looks inverted - confirm.
+@param bpage  buffer page descriptor of the non-leaf page; it is only
+              touched when buf_page_peek_if_too_old() reports it as too old */
+static void btr_cur_nonleaf_make_young(buf_page_t *bpage)
+{
+ if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage)))
+ buf_page_make_young(bpage);
+}
+
ATTRIBUTE_COLD
dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
page_cur_mode_t mode, mtr_t *mtr)
@@ -1713,6 +1730,8 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
if (height != btr_page_get_level(block->page.frame))
goto corrupted;
+ btr_cur_nonleaf_make_young(&block->page);
+
#ifdef UNIV_ZIP_DEBUG
const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index()));
@@ -1799,6 +1818,8 @@ search_loop:
btr_decryption_failed(*index);
goto func_exit;
}
+ else
+ btr_cur_nonleaf_make_young(&block->page);
#ifdef UNIV_ZIP_DEBUG
if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
@@ -1934,18 +1955,15 @@ index_locked:
ut_ad(n_blocks < BTR_MAX_LEVELS);
ut_ad(savepoint + n_blocks == mtr->get_savepoint());
+ bool first_access= false;
buf_block_t* block=
btr_block_get(*index, page,
height ? upper_rw_latch : root_leaf_rw_latch,
- !height, mtr, &err);
+ !height, mtr, &err, &first_access);
ut_ad(!block == (err != DB_SUCCESS));
if (!block)
- {
- if (err == DB_DECRYPTION_FAILED)
- btr_decryption_failed(*index);
break;
- }
if (first)
page_cur_set_before_first(block, &page_cur);
@@ -2029,10 +2047,16 @@ index_locked:
offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED,
&heap);
+ page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH);
- if (latch_mode != BTR_MODIFY_TREE);
+ if (latch_mode != BTR_MODIFY_TREE)
+ {
+ if (!height && first && first_access)
+ buf_read_ahead_linear(page_id_t(block->page.id().space(), page),
+ block->page.zip_size(), false);
+ }
else if (btr_cur_need_opposite_intention(block->page, index->is_clust(),
lock_intention,
node_ptr_max_size, compress_limit,
@@ -2070,7 +2094,6 @@ index_locked:
}
/* Go to the child node */
- page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
n_blocks++;
}
@@ -3837,22 +3860,14 @@ btr_cur_pess_upd_restore_supremum(
const page_id_t block_id{block->page.id()};
const page_id_t prev_id(block_id.space(), prev_page_no);
- dberr_t err;
buf_block_t* prev_block
- = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr,
- BUF_PEEK_IF_IN_POOL, mtr, &err);
- /* Since we already held an x-latch on prev_block, it must
- be available and not be corrupted unless the buffer pool got
- corrupted somehow. */
+ = mtr->get_already_latched(prev_id, MTR_MEMO_PAGE_X_FIX);
if (UNIV_UNLIKELY(!prev_block)) {
- return err;
+ return DB_CORRUPTION;
}
ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT,
block->page.frame + FIL_PAGE_OFFSET, 4));
- /* We must already have an x-latch on prev_block! */
- ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
-
lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id,
PAGE_HEAP_NO_SUPREMUM,
page_rec_get_heap_no(rec));
@@ -6660,6 +6675,10 @@ btr_copy_blob_prefix(
mtr.commit();
return copied_len;
}
+ if (!buf_page_make_young_if_needed(&block->page)) {
+ buf_read_ahead_linear(id, 0, false);
+ }
+
page = buf_block_get_frame(block);
blob_header = page + offset;
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index 54dd15ac..2131fb94 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -25,9 +25,10 @@ Created 2/23/1996 Heikki Tuuri
*******************************************************/
#include "btr0pcur.h"
-#include "ut0byte.h"
+#include "buf0rea.h"
#include "rem0cmp.h"
#include "trx0trx.h"
+#include "ibuf0ibuf.h"
/**************************************************************//**
Resets a persistent cursor object, freeing ::old_rec_buf if it is
@@ -261,13 +262,15 @@ static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
- if (left_block &&
- btr_page_get_next(left_block->page.frame) != id.page_no())
+ if (!left_block);
+ else if (btr_page_get_next(left_block->page.frame) != id.page_no())
{
release_left_block:
mtr->release_last_page();
return false;
}
+ else
+ buf_page_make_young_if_needed(&left_block->page);
}
if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
@@ -539,10 +542,11 @@ btr_pcur_move_to_next_page(
}
dberr_t err;
+ bool first_access = false;
buf_block_t* next_block = btr_block_get(
*cursor->index(), next_page_no,
rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)),
- page_is_leaf(page), mtr, &err);
+ page_is_leaf(page), mtr, &err, &first_access);
if (UNIV_UNLIKELY(!next_block)) {
return err;
@@ -561,6 +565,11 @@ btr_pcur_move_to_next_page(
const auto s = mtr->get_savepoint();
mtr->rollback_to_savepoint(s - 2, s - 1);
+ if (first_access) {
+ buf_read_ahead_linear(next_block->page.id(),
+ next_block->zip_size(),
+ ibuf_inside(mtr));
+ }
return DB_SUCCESS;
}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index 8435047c..1c5928c4 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1143,7 +1143,6 @@ block_and_ahi_release_and_fail:
}
block->page.fix();
- block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);
static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF},
"");
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 8ef18ee0..23b5b776 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -404,7 +404,7 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
if (id.space() == SRV_TMP_SPACE_ID
&& innodb_encrypt_temporary_tables) {
- slot = buf_pool.io_buf_reserve();
+ slot = buf_pool.io_buf_reserve(false);
slot->allocate();
bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame);
slot->release();
@@ -426,7 +426,7 @@ decompress:
return false;
}
- slot = buf_pool.io_buf_reserve();
+ slot = buf_pool.io_buf_reserve(false);
slot->allocate();
decompress_with_slot:
@@ -449,7 +449,7 @@ decrypt_failed:
return false;
}
- slot = buf_pool.io_buf_reserve();
+ slot = buf_pool.io_buf_reserve(false);
slot->allocate();
/* decrypt using crypt_buf to dst_frame */
@@ -742,6 +742,205 @@ bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
#ifndef UNIV_INNOCHECKSUM
+#ifdef __linux__
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <fstream>
+
+/** Memory Pressure
+
+Listener for Linux memory pressure notifications. setup() registers the
+triggers listed in m_triggers on the "memory.pressure" file of the cgroup
+of this process and starts a thread running pressure_routine(); join()
+stops that thread.
+
+based off https://www.kernel.org/doc/html/latest/accounting/psi.html#pressure-interface
+and https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory */
+class mem_pressure
+{
+ /* triggers + eventfd */
+ struct pollfd m_fds[3];
+ /* number of valid entries at the start of m_fds */
+ nfds_t m_num_fds;
+ /* eventfd used by quit() and trigger_collection() to wake up the thread;
+ -1 until setup() has created it */
+ int m_event_fd= -1;
+ /* set by quit() to ask pressure_routine() to terminate */
+ Atomic_relaxed<bool> m_abort= false;
+
+ /* thread executing pressure_routine(); started at the end of setup() */
+ std::thread m_thd;
+ /* mem pressure garbage collection restricted to interval */
+ static constexpr ulonglong max_interval_us= 60*1000000;
+
+public:
+ mem_pressure() : m_num_fds(0) {}
+
+ /** Open the memory.pressure file once per trigger, register the
+ triggers, create the eventfd and start the listener thread.
+ @return whether the listener thread was started
+ @retval false if the pressure file could not be opened, if no trigger
+ could be registered, or if the eventfd could not be created */
+ bool setup()
+ {
+ static_assert(array_elements(m_fds) == (array_elements(m_triggers) + 1),
+ "insufficient fds");
+ std::string memcgroup{"/sys/fs/cgroup"};
+ std::string cgroup;
+ {
+ /* Only the first line of /proc/self/cgroup is consulted. */
+ std::ifstream selfcgroup("/proc/self/cgroup");
+ std::getline(selfcgroup, cgroup, '\n');
+ }
+
+ cgroup.erase(0, 3); // Remove "0::"
+ memcgroup+= cgroup + "/memory.pressure";
+
+ m_num_fds= 0;
+ for (auto trig= std::begin(m_triggers); trig!= std::end(m_triggers); ++trig)
+ {
+ if ((m_fds[m_num_fds].fd=
+ open(memcgroup.c_str(), O_RDWR | O_NONBLOCK | O_CLOEXEC)) < 0)
+ {
+ /* User can't do anything about it, no point giving warning */
+ shutdown();
+ return false;
+ }
+ my_register_filename(m_fds[m_num_fds].fd, memcgroup.c_str(), FILE_BY_OPEN, 0, MYF(0));
+ /* Writing the trigger string to the descriptor registers the trigger. */
+ ssize_t slen= strlen(*trig);
+ if (write(m_fds[m_num_fds].fd, *trig, slen) < slen)
+ {
+ /* we may fail this one, but continue to the next */
+ my_close(m_fds[m_num_fds].fd, MYF(MY_WME));
+ continue;
+ }
+ m_fds[m_num_fds].events= POLLPRI;
+ m_num_fds++;
+ }
+ /* None of the triggers could be registered: nothing to listen to. */
+ if (m_num_fds < 1)
+ return false;
+
+ if ((m_event_fd= eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK)) == -1)
+ {
+ /* User can't do anything about it, no point giving warning */
+ shutdown();
+ return false;
+ }
+ my_register_filename(m_event_fd, "mem_pressure_eventfd", FILE_BY_DUP, 0, MYF(0));
+ /* The eventfd occupies the last slot of m_fds and is polled for POLLIN. */
+ m_fds[m_num_fds].fd= m_event_fd;
+ m_fds[m_num_fds].events= POLLIN;
+ m_num_fds++;
+ m_thd= std::thread(pressure_routine, this);
+ sql_print_information("InnoDB: Initialized memory pressure event listener");
+ return true;
+ }
+
+ /** Close every descriptor currently registered in m_fds and reset
+ m_num_fds to 0. */
+ void shutdown()
+ {
+ /* m_event_fd is in this list */
+ while (m_num_fds)
+ {
+ m_num_fds--;
+ my_close(m_fds[m_num_fds].fd, MYF(MY_WME));
+ m_fds[m_num_fds].fd= -1;
+ }
+ }
+
+ /** Thread body; defined after the class.
+ @param m the listener object */
+ static void pressure_routine(mem_pressure *m);
+
+#ifdef UNIV_DEBUG
+ /** Debug hook: request a garbage collection.
+ If the eventfd exists and writing to it fails, only a message is logged.
+ Otherwise buf_pool.garbage_collect() is invoked directly.
+ NOTE(review): the direct call is also made when the write succeeded,
+ that is, in addition to waking up the listener thread - confirm that
+ this is intended. */
+ void trigger_collection()
+ {
+ uint64_t u= 1;
+ if (m_event_fd >=0 && write(m_event_fd, &u, sizeof(uint64_t)) != sizeof(uint64_t))
+ sql_print_information("InnoDB: (Debug) Failed to trigger memory pressure");
+ else /* assumed failed to meet initialization criteria, so trigger directly */
+ buf_pool.garbage_collect();
+ }
+#endif
+
+ /** Set m_abort and write to the eventfd so that a thread blocked in
+ poll() wakes up and notices the request. */
+ void quit()
+ {
+ uint64_t u= 1;
+ m_abort= true;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-result"
+ /* return result ignored, cannot do anything with it */
+ write(m_event_fd, &u, sizeof(uint64_t));
+#pragma GCC diagnostic pop
+ }
+
+ /** Stop the listener thread, if one was started, and wait for it. */
+ void join()
+ {
+ if (m_thd.joinable())
+ {
+ quit();
+ m_thd.join();
+ }
+ }
+
+ /* trigger strings written to memory.pressure; defined after the class */
+ static const char* const m_triggers[2];
+};
+
+
+/*
+ ref: https://docs.kernel.org/accounting/psi.html
+ maximum window size (second number) 10 seconds.
+ window size in multiples of 2 second interval required (for Unprivileged)
+ Time is in usec.
+*/
+const char* const mem_pressure::m_triggers[]=
+ {"some 5000000 10000000", /* 5s out of 10s */
+ "full 10000 2000000"}; /* 10ms out of 2s */
+
+static mem_pressure mem_pressure_obj;
+
+/** Body of the memory pressure listener thread.
+
+Waits in poll() on the descriptors prepared by mem_pressure::setup() and
+invokes buf_pool.garbage_collect() when a pressure trigger fires (POLLPRI),
+at most once per max_interval_us. In UNIV_DEBUG builds, POLLIN (requested
+only for the eventfd) is treated as a debug trigger. The loop ends when
+m_abort has been set (see quit()) or when poll() fails with an error other
+than EINTR; all descriptors are closed before the thread exits.
+@param m  the listener object; asserted to be mem_pressure_obj */
+void mem_pressure::pressure_routine(mem_pressure *m)
+{
+  DBUG_ASSERT(m == &mem_pressure_obj);
+  if (my_thread_init())
+  {
+    m->shutdown();
+    return;
+  }
+
+  /* Backdate the timestamp so that early events are not suppressed by
+  the interval check below. */
+  ulonglong last= microsecond_interval_timer() - max_interval_us;
+  while (!m->m_abort)
+  {
+    if (poll(&m->m_fds[0], m->m_num_fds, -1) < 0)
+    {
+      if (errno == EINTR)
+        continue;
+      else
+        break;
+    }
+    /* quit() sets m_abort before it writes to the eventfd to wake us up.
+    Terminate only in that case; any other wakeup is an event that must
+    be handled below. */
+    if (m->m_abort)
+      break;
+
+    for (pollfd &p : st_::span<pollfd>(m->m_fds, m->m_num_fds))
+    {
+      if (p.revents & POLLPRI)
+      {
+        ulonglong now= microsecond_interval_timer();
+        if ((now - last) > max_interval_us)
+        {
+          last= now;
+          buf_pool.garbage_collect();
+        }
+      }
+
+#ifdef UNIV_DEBUG
+      if (p.revents & POLLIN)
+      {
+        uint64_t u;
+        /* we haven't aborted, so this must be a debug trigger */
+        if (read(p.fd, &u, sizeof(u)) >=0)
+          buf_pool.garbage_collect();
+      }
+#endif
+    }
+  }
+  m->shutdown();
+
+  my_thread_end();
+}
+
+/** Initialize mem pressure.
+Calls mem_pressure_obj.setup(); its result is ignored here, so a failure
+to set up the listener is not reported to the caller. */
+ATTRIBUTE_COLD void buf_mem_pressure_detect_init()
+{
+ mem_pressure_obj.setup();
+}
+
+/** Stop the memory pressure listener.
+Calls mem_pressure_obj.join(), which stops and joins the listener thread
+if one was started. */
+ATTRIBUTE_COLD void buf_mem_pressure_shutdown()
+{
+ mem_pressure_obj.join();
+}
+#endif /* __linux__ */
+
#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
/** Enable buffers to be dumped to core files
@@ -1099,6 +1298,11 @@ bool buf_pool_t::create()
chunk_t::map_ref= chunk_t::map_reg;
buf_LRU_old_ratio_update(100 * 3 / 8, false);
btr_search_sys_create();
+
+#ifdef __linux__
+ if (srv_operation == SRV_OPERATION_NORMAL)
+ buf_mem_pressure_detect_init();
+#endif
ut_ad(is_initialised());
return false;
}
@@ -1300,14 +1504,17 @@ void buf_pool_t::io_buf_t::close()
n_slots= 0;
}
-buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
+buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve(bool wait_for_reads)
{
for (;;)
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
+ buf_dblwr.flush_buffered_writes();
os_aio_wait_until_no_pending_writes(true);
+ if (!wait_for_reads)
+ continue;
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
@@ -1536,6 +1743,7 @@ struct find_interesting_trx
inline void buf_pool_t::resize()
{
ut_ad(this == &buf_pool);
+ ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
bool warning = false;
@@ -1878,6 +2086,100 @@ calc_buf_pool_size:
return;
}
+#ifdef __linux__
+/** Try to free buffer pool pages in response to a memory pressure event.
+
+Acquires buf_pool.mutex and walks the LRU list from its last element
+towards the first, trying to free every page that can be freed without
+writing out anything. Where MADV_FREE is defined, madvise(MADV_FREE) is
+then called on the frame of every block in the free list. After the
+mutex has been released, the number of freed pages is written to the
+log. */
+inline void buf_pool_t::garbage_collect()
+{
+ mysql_mutex_lock(&mutex);
+ size_t freed= 0;
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* buf_LRU_free_page() will temporarily release and reacquire
+ buf_pool.mutex for invoking btr_search_drop_page_hash_index(). Thus,
+ we must protect ourselves with the hazard pointer. */
+rescan:
+#else
+ lru_hp.set(nullptr);
+#endif
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev; bpage; bpage= prev)
+ {
+ prev= UT_LIST_GET_PREV(LRU, bpage);
+#ifdef BTR_CUR_HASH_ADAPT
+ lru_hp.set(prev);
+#endif
+ auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(bpage->in_LRU_list);
+
+ /* We try to free any pages that can be freed without writing out
+ anything. */
+ switch (bpage->oldest_modification()) {
+ case 0:
+ try_to_evict:
+ if (buf_LRU_free_page(bpage, true))
+ {
+ evicted:
+ freed++;
+#ifdef BTR_CUR_HASH_ADAPT
+ /* Re-read the predecessor from the hazard pointer. If it was reset
+ while a predecessor had been recorded, start over from the tail. */
+ bpage= prev;
+ prev= lru_hp.get();
+ if (!prev && bpage)
+ goto rescan;
+#endif
+ }
+ continue;
+ case 1:
+ break;
+ default:
+ /* Only pages in a state below UNFIXED are considered further. */
+ if (state >= buf_page_t::UNFIXED)
+ continue;
+ }
+
+ if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true))
+ {
+ ut_ad(!bpage->is_io_fixed());
+ lsn_t oldest_modification= bpage->oldest_modification();
+ switch (oldest_modification) {
+ case 1:
+ /* Re-check under flush_list_mutex before removing the page from
+ the flush list. */
+ mysql_mutex_lock(&flush_list_mutex);
+ oldest_modification= bpage->oldest_modification();
+ if (oldest_modification)
+ {
+ ut_ad(oldest_modification == 1);
+ delete_from_flush_list(bpage);
+ }
+ mysql_mutex_unlock(&flush_list_mutex);
+ /* fall through */
+ case 0:
+ bpage->lock.u_unlock(true);
+ goto try_to_evict;
+ default:
+ /* A page in a state below UNFIXED is released only if its oldest
+ modification is not newer than log_sys.get_flushed_lsn(). */
+ if (bpage->state() < buf_page_t::UNFIXED &&
+ oldest_modification <= log_sys.get_flushed_lsn())
+ {
+ release_freed_page(bpage);
+ goto evicted;
+ }
+ else
+ bpage->lock.u_unlock(true);
+ }
+ }
+ }
+
+#if defined MADV_FREE
+ /* FIXME: Issue fewer calls for larger contiguous blocks of
+ memory. For now, we assume that this is acceptable, because this
+ code should be executed rarely. */
+ for (buf_page_t *bpage= UT_LIST_GET_FIRST(free); bpage;
+ bpage= UT_LIST_GET_NEXT(list, bpage))
+ madvise(bpage->frame, srv_page_size, MADV_FREE);
+#endif
+ mysql_mutex_unlock(&mutex);
+ sql_print_information("InnoDB: Memory pressure event freed %zu pages",
+ freed);
+ return;
+}
+#endif /* __linux__ */
+
/** Thread pool task invoked by innodb_buffer_pool_size changes. */
static void buf_resize_callback(void *)
{
@@ -1906,12 +2208,23 @@ static tpool::waitable_task buf_resize_task(buf_resize_callback,
void buf_resize_start()
{
- srv_thread_pool->submit_task(&buf_resize_task);
+#if !defined(DBUG_OFF) && defined(__linux__)
+ DBUG_EXECUTE_IF("trigger_garbage_collection",
+ {
+ mem_pressure_obj.trigger_collection();
+ }
+ );
+#endif
+
+ srv_thread_pool->submit_task(&buf_resize_task);
}
void buf_resize_shutdown()
{
- buf_resize_task.wait();
+#ifdef __linux__
+ buf_mem_pressure_shutdown();
+#endif
+ buf_resize_task.wait();
}
@@ -2220,14 +2533,21 @@ lookup:
if (discard_attempted || !bpage->frame)
{
- /* Even when we are holding a hash_lock, it should be
- acceptable to wait for a page S-latch here, because
- buf_page_t::read_complete() will not wait for buf_pool.mutex,
- and because S-latch would not conflict with a U-latch
- that would be protecting buf_page_t::write_complete(). */
- bpage->lock.s_lock();
+ const bool got_s_latch= bpage->lock.s_lock_try();
hash_lock.unlock_shared();
- break;
+ if (UNIV_LIKELY(got_s_latch))
+ break;
+ /* We may fail to acquire bpage->lock because
+ buf_page_t::read_complete() may be invoking
+ buf_pool_t::corrupted_evict() on this block, which it would
+ hold an exclusive latch on.
+
+ Let us acquire and release buf_pool.mutex to ensure that any
+ buf_pool_t::corrupted_evict() will proceed before we reacquire
+ the hash_lock that it could be waiting for. */
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto lookup;
}
hash_lock.unlock_shared();
@@ -2246,7 +2566,6 @@ lookup:
ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
}
- bpage->set_accessed();
buf_page_make_young_if_needed(bpage);
#ifdef UNIV_DEBUG
@@ -2873,18 +3192,6 @@ get_latch_valid:
ut_ad(page_id_t(page_get_space_id(block->page.frame),
page_get_page_no(block->page.frame))
== page_id);
-
- if (mode == BUF_GET_POSSIBLY_FREED
- || mode == BUF_PEEK_IF_IN_POOL) {
- return block;
- }
-
- const bool not_first_access{block->page.set_accessed()};
- buf_page_make_young_if_needed(&block->page);
- if (!not_first_access) {
- buf_read_ahead_linear(page_id, block->zip_size(),
- ibuf_inside(mtr));
- }
}
return block;
@@ -3057,7 +3364,6 @@ bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
block->page.fix();
ut_ad(!block->page.is_read_fixed());
- block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);
mtr->memo_push(block, mtr_memo_type_t(rw_latch));
}
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index e9aea355..e2702adc 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -336,11 +336,14 @@ func_exit:
os_file_flush(file);
}
else
- for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
- if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
- /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+ {
+ alignas(8) char checkpoint[8];
+ mach_write_to_8(checkpoint, log_sys.next_checkpoint_lsn);
+ for (auto i= size * 2; i--; page += srv_page_size)
+ if (memcmp_aligned<8>(page + FIL_PAGE_LSN, checkpoint, 8) >= 0)
+ /* Valid pages are not older than the log checkpoint. */
recv_sys.dblwr.add(page);
-
+ }
err= DB_SUCCESS;
goto func_exit;
}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index 957632db..cc51f8c6 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -33,7 +33,7 @@ Created April 08, 2011 Vasil Dimov
#include "buf0rea.h"
#include "buf0dump.h"
-#include "dict0dict.h"
+#include "dict0load.h"
#include "os0file.h"
#include "srv0srv.h"
#include "srv0start.h"
@@ -180,7 +180,7 @@ static void buf_dump_generate_path(char *path, size_t path_size)
char buf[FN_REFLEN];
mysql_mutex_lock(&LOCK_global_system_variables);
- snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(),
+ snprintf(buf, sizeof buf, "%s" FN_ROOTDIR "%s", get_buf_dump_dir(),
srv_buf_dump_filename);
mysql_mutex_unlock(&LOCK_global_system_variables);
@@ -214,7 +214,7 @@ static void buf_dump_generate_path(char *path, size_t path_size)
format = "%s%s";
break;
default:
- format = "%s/%s";
+ format = "%s" FN_ROOTDIR "%s";
}
snprintf(path, path_size, format,
@@ -562,6 +562,22 @@ buf_load()
if (!SHUTTING_DOWN()) {
std::sort(dump, dump + dump_n);
+ std::set<uint32_t> missing;
+ for (const page_id_t id : st_::span<const page_id_t>
+ (dump, dump_n)) {
+ missing.emplace(id.space());
+ }
+ for (std::set<uint32_t>::iterator i = missing.begin();
+ i != missing.end(); ) {
+ auto j = i++;
+ if (fil_space_t* space = fil_space_t::get(*j)) {
+ space->release();
+ missing.erase(j);
+ }
+ }
+ if (!missing.empty()) {
+ dict_check_tablespaces_and_store_max_id(&missing);
+ }
}
/* Avoid calling the expensive fil_space_t::get() for each
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index b6357989..d4628985 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -655,7 +655,7 @@ static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
ut_ad(!bpage->zip_size() || !page_compressed);
/* Find free slot from temporary memory array */
- *slot= buf_pool.io_buf_reserve();
+ *slot= buf_pool.io_buf_reserve(true);
ut_a(*slot);
(*slot)->allocate();
@@ -754,16 +754,20 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
ut_ad(space->referenced());
const auto s= state();
- ut_a(s >= FREED);
+
+ const lsn_t lsn=
+ mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
+ ut_ad(lsn
+ ? lsn >= oldest_modification() || oldest_modification() == 2
+ : space->purpose != FIL_TYPE_TABLESPACE);
if (s < UNFIXED)
{
+ ut_a(s >= FREED);
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
- const lsn_t lsn=
- mach_read_from_8(my_assume_aligned<8>
- (FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
- ut_ad(lsn >= oldest_modification());
+ freed:
if (lsn > log_sys.get_flushed_lsn())
{
mysql_mutex_unlock(&buf_pool.mutex);
@@ -775,6 +779,12 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
return false;
}
+ if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
+ {
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+ goto freed;
+ }
+
ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
ut_ad(f >= UNFIXED);
ut_ad(f < READ_FIX);
@@ -869,15 +879,9 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
if ((s & LRU_MASK) == REINIT || !space->use_doublewrite())
{
- if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
- {
- const lsn_t lsn=
- mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN +
- (write_frame ? write_frame
- : frame)));
- ut_ad(lsn >= oldest_modification());
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE) &&
+ lsn > log_sys.get_flushed_lsn())
log_write_up_to(lsn, true);
- }
space->io(IORequest{type, this, slot}, physical_offset(), size,
write_frame, this);
}
@@ -1057,11 +1061,25 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
bool contiguous, bool evict,
ulint n_flushed, ulint n_to_flush)
{
- mysql_mutex_unlock(&buf_pool.mutex);
-
ut_ad(space->id == page_id.space());
ut_ad(bpage->id() == page_id);
+ {
+ const lsn_t lsn=
+ mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN +
+ (bpage->zip.data ? bpage->zip.data : bpage->frame)));
+ ut_ad(lsn >= bpage->oldest_modification());
+ if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
+ {
+ ut_a(!bpage->flush(evict, space));
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return 0;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
ulint count= 0;
page_id_t id= page_id;
page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
@@ -1741,6 +1759,28 @@ ulint buf_flush_LRU(ulint max_n, bool evict)
buf_pool.try_LRU_scan= true;
pthread_cond_broadcast(&buf_pool.done_free);
}
+ else if (!pages && !buf_pool.try_LRU_scan &&
+ !buf_pool.LRU_warned.test_and_set(std::memory_order_acquire))
+ {
+ /* For example, with the minimum innodb_buffer_pool_size=5M and
+ the default innodb_page_size=16k there are only a little over 316
+ pages in the buffer pool. The buffer pool can easily be exhausted
+ by a workload of some dozen concurrent connections. The system could
+ reach a deadlock like the following:
+
+ (1) Many threads are waiting in buf_LRU_get_free_block()
+ for buf_pool.done_free.
+ (2) Some threads are waiting for a page latch which is held by
+ another thread that is waiting in buf_LRU_get_free_block().
+ (3) This thread is the only one that could make progress, but
+ we fail to do so because all the pages that we scanned are
+ buffer-fixed or latched by some thread. */
+ sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
+ " %zu blocks are in use and %zu free."
+ " Consider increasing innodb_buffer_pool_size.",
+ UT_LIST_GET_LEN(buf_pool.LRU),
+ UT_LIST_GET_LEN(buf_pool.free));
+ }
return pages;
}
@@ -2124,6 +2164,8 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
limit= lsn;
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
+ if (furious)
+ log_sys.set_check_for_checkpoint();
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
@@ -2371,11 +2413,19 @@ func_exit:
goto func_exit;
}
+TPOOL_SUPPRESS_TSAN
+bool buf_pool_t::need_LRU_eviction() const
+{
+ /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting
+ for buf_flush_page_cleaner() to evict some blocks.
+ Independently of that, eviction is also reported as needed when the
+ LRU list is longer than BUF_LRU_MIN_LEN while the free list holds
+ fewer than srv_LRU_scan_depth / 2 blocks.
+ NOTE(review): TPOOL_SUPPRESS_TSAN suggests that these fields may be
+ read here without holding the mutex that protects them - confirm. */
+ return UNIV_UNLIKELY(!try_LRU_scan ||
+ (UT_LIST_GET_LEN(LRU) > BUF_LRU_MIN_LEN &&
+ UT_LIST_GET_LEN(free) < srv_LRU_scan_depth / 2));
+}
+
#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
-/* Avoid GCC 4.8.5 internal compiler error "could not split insn".
-We would only need this for buf_flush_page_cleaner(),
-but GCC 4.8.5 does not support pop_options. */
-# pragma GCC optimize ("O0")
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */
+__attribute__((optimize(0)))
#endif
/** page_cleaner thread tasked with flushing dirty pages from the buffer
pools. As of now we'll have only one coordinator. */
@@ -2409,21 +2459,24 @@ static void buf_flush_page_cleaner()
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- if (buf_pool.ran_out())
- goto no_wait;
- else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
- break;
+ if (!buf_pool.need_LRU_eviction())
+ {
+ if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+ break;
- if (buf_pool.page_cleaner_idle() &&
- (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
- srv_max_dirty_pages_pct_lwm == 0.0))
- /* We are idle; wait for buf_pool.page_cleaner_wakeup() */
- my_cond_wait(&buf_pool.do_flush_list,
- &buf_pool.flush_list_mutex.m_mutex);
- else
- my_cond_timedwait(&buf_pool.do_flush_list,
- &buf_pool.flush_list_mutex.m_mutex, &abstime);
- no_wait:
+ if (buf_pool.page_cleaner_idle() &&
+ (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+ srv_max_dirty_pages_pct_lwm == 0.0))
+ {
+ buf_pool.LRU_warned.clear(std::memory_order_release);
+ /* We are idle; wait for buf_pool.page_cleaner_wakeup() */
+ my_cond_wait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ }
+ else
+ my_cond_timedwait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex, &abstime);
+ }
set_timespec(abstime, 1);
lsn_limit= buf_flush_sync_lsn;
@@ -2445,9 +2498,9 @@ static void buf_flush_page_cleaner()
do
{
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
-
+ IF_DBUG(if (_db_keyword_(nullptr, "ib_log_checkpoint_avoid", 1) ||
+ _db_keyword_(nullptr, "ib_log_checkpoint_avoid_hard", 1))
+ continue,);
if (!recv_recovery_is_on() &&
!srv_startup_is_before_trx_rollback_phase &&
srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
@@ -2455,7 +2508,7 @@ static void buf_flush_page_cleaner()
}
while (false);
- if (!buf_pool.ran_out())
+ if (!buf_pool.need_LRU_eviction())
continue;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
oldest_lsn= buf_pool.get_oldest_modification(0);
@@ -2484,7 +2537,7 @@ static void buf_flush_page_cleaner()
if (oldest_lsn >= soft_lsn_limit)
buf_flush_async_lsn= soft_lsn_limit= 0;
}
- else if (buf_pool.ran_out())
+ else if (buf_pool.need_LRU_eviction())
{
buf_pool.page_cleaner_set_idle(false);
buf_pool.n_flush_inc();
@@ -2549,10 +2602,11 @@ static void buf_flush_page_cleaner()
else
{
maybe_unemployed:
- const bool below{dirty_pct < pct_lwm};
- pct_lwm= 0.0;
- if (below)
+ if (dirty_pct < pct_lwm)
+ {
+ pct_lwm= 0.0;
goto possibly_unemployed;
+ }
}
}
else if (dirty_pct < srv_max_buf_pool_modified_pct)
@@ -2598,9 +2652,13 @@ static void buf_flush_page_cleaner()
MONITOR_FLUSH_ADAPTIVE_PAGES,
n_flushed);
}
- else if (buf_flush_async_lsn <= oldest_lsn)
+ else if (buf_flush_async_lsn <= oldest_lsn &&
+ !buf_pool.need_LRU_eviction())
goto check_oldest_and_set_idle;
+ else
+ mysql_mutex_lock(&buf_pool.mutex);
+ n= srv_max_io_capacity;
n= n >= n_flushed ? n - n_flushed : 0;
goto LRU_flush;
}
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 65ee8fa3..2a8d6ff2 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -60,10 +60,6 @@ static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
frames in the buffer pool, we set this to TRUE */
static bool buf_lru_switched_on_innodb_mon = false;
-/** True if diagnostic message about difficult to find free blocks
-in the buffer bool has already printed. */
-static bool buf_lru_free_blocks_error_printed;
-
/******************************************************************//**
These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
and page_zip_decompress() operations. Based on the statistics,
@@ -408,6 +404,7 @@ got_mutex:
buf_LRU_check_size_of_non_data_objects();
buf_block_t* block;
+ IF_DBUG(static bool buf_lru_free_blocks_error_printed,);
DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
if (!buf_lru_free_blocks_error_printed) {
n_iterations = 21;
@@ -417,9 +414,25 @@ retry:
/* If there is a block in the free list, take it */
if ((block = buf_LRU_get_free_only()) != nullptr) {
got_block:
+ const ulint LRU_size = UT_LIST_GET_LEN(buf_pool.LRU);
+ const ulint available = UT_LIST_GET_LEN(buf_pool.free);
+ const ulint scan_depth = srv_LRU_scan_depth / 2;
+ ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth
+ || buf_pool.need_LRU_eviction());
+
if (!have_mutex) {
mysql_mutex_unlock(&buf_pool.mutex);
}
+
+ if (UNIV_UNLIKELY(available < scan_depth)
+ && LRU_size > BUF_LRU_MIN_LEN) {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (!buf_pool.page_cleaner_active()) {
+ buf_pool.page_cleaner_wakeup(true);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ }
+
block->page.zip.clear();
return block;
}
@@ -445,10 +458,11 @@ got_block:
if ((block = buf_LRU_get_free_only()) != nullptr) {
goto got_block;
}
+ const bool wake = buf_pool.need_LRU_eviction();
mysql_mutex_unlock(&buf_pool.mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
const auto n_flush = buf_pool.n_flush();
- if (!buf_pool.try_LRU_scan) {
+ if (wake && !buf_pool.page_cleaner_active()) {
buf_pool.page_cleaner_wakeup(true);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
@@ -467,9 +481,10 @@ not_found:
MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
}
- if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
- && srv_buf_pool_old_size == srv_buf_pool_size) {
- buf_lru_free_blocks_error_printed = true;
+ if (n_iterations == 21
+ && srv_buf_pool_old_size == srv_buf_pool_size
+ && buf_pool.LRU_warned.test_and_set(std::memory_order_acquire)) {
+ IF_DBUG(buf_lru_free_blocks_error_printed = true,);
mysql_mutex_unlock(&buf_pool.mutex);
ib::warn() << "Difficult to find free blocks in the buffer pool"
" (" << n_iterations << " search iterations)! "
@@ -787,6 +802,14 @@ void buf_page_make_young(buf_page_t *bpage)
mysql_mutex_unlock(&buf_pool.mutex);
}
+bool buf_page_make_young_if_needed(buf_page_t *bpage)
+{ /* Calls bpage->set_accessed() and then, if
+ buf_page_peek_if_too_old(bpage) holds, buf_page_make_young(bpage).
+ @param bpage buffer pool page descriptor
+ @return the result of bpage->set_accessed(); judging by the name
+ not_first, presumably whether the page had been accessed before
+ (TODO confirm against buf_page_t::set_accessed()) */
+ const bool not_first{bpage->set_accessed()};
+ if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage)))
+ buf_page_make_young(bpage);
+ return not_first;
+}
+
/** Try to free a block. If bpage is a descriptor of a compressed-only
ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
The caller must hold buf_pool.mutex.
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index c4f07738..9041c6a2 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -597,6 +597,12 @@ failed:
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
hash_lock.unlock_shared();
+ /* The underlying file page of this buffer pool page could actually
+ be marked as freed, or a read of the page into the buffer pool might
+ be in progress. We may read uninitialized data here.
+ Suppress warnings of comparing uninitialized values. */
+ MEM_MAKE_DEFINED(&prev, sizeof prev);
+ MEM_MAKE_DEFINED(&next, sizeof next);
if (prev == FIL_NULL || next == FIL_NULL)
goto fail;
page_id_t id= page_id;
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
index 5516bce9..cb60d813 100644
--- a/storage/innobase/dict/dict0boot.cc
+++ b/storage/innobase/dict/dict0boot.cc
@@ -42,7 +42,10 @@ static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO};
static buf_block_t *dict_hdr_get(mtr_t *mtr)
{
/* We assume that the DICT_HDR page is always readable and available. */
- return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr);
+ buf_block_t *b=
+ buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr);
+ buf_page_make_young_if_needed(&b->page); /* b is used without a null
+ check, relying on the assumption stated above */
+ return b;
}
/**********************************************************************//**
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
index cce5f2f2..dd858287 100644
--- a/storage/innobase/dict/dict0crea.cc
+++ b/storage/innobase/dict/dict0crea.cc
@@ -353,9 +353,6 @@ dict_build_table_def_step(
/* Always set this bit for all new created tables */
DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- DICT_TF2_FLAG_UNSET(table,
- DICT_TF2_FTS_AUX_HEX_NAME););
if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) {
/* This table will need a new tablespace. */
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 5bc7ab6e..5d3cab17 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -2809,8 +2809,7 @@ dict_foreign_find_index(
for (dict_index_t* index = dict_table_get_first_index(table);
index;
index = dict_table_get_next_index(index)) {
- if (types_idx != index
- && !index->to_be_dropped
+ if (!index->to_be_dropped
&& !dict_index_is_online_ddl(index)
&& dict_foreign_qualify_index(
table, col_names, columns, n_cols,
@@ -3530,6 +3529,7 @@ dict_foreign_parse_drop_constraints(
const char* ptr1;
const char* id;
CHARSET_INFO* cs;
+ bool if_exists = false;
ut_a(trx->mysql_thd);
@@ -3583,6 +3583,7 @@ loop:
ptr1 = dict_accept(cs, ptr1, "EXISTS", &success);
if (success) {
ptr = ptr1;
+ if_exists = true;
}
}
@@ -3593,14 +3594,14 @@ loop:
goto syntax_error;
}
- ut_a(*n < 1000);
- (*constraints_to_drop)[*n] = id;
- (*n)++;
-
if (std::find_if(table->foreign_set.begin(),
- table->foreign_set.end(),
- dict_foreign_matches_id(id))
- == table->foreign_set.end()) {
+ table->foreign_set.end(),
+ dict_foreign_matches_id(id))
+ == table->foreign_set.end()) {
+
+ if (if_exists) {
+ goto loop;
+ }
if (!srv_read_only_mode) {
FILE* ef = dict_foreign_err_file;
@@ -3622,6 +3623,9 @@ loop:
return(DB_CANNOT_DROP_CONSTRAINT);
}
+ ut_a(*n < 1000);
+ (*constraints_to_drop)[*n] = id;
+ (*n)++;
goto loop;
syntax_error:
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
index f769839d..e7735586 100644
--- a/storage/innobase/dict/dict0load.cc
+++ b/storage/innobase/dict/dict0load.cc
@@ -33,8 +33,8 @@ Created 4/24/1996 Heikki Tuuri
#include "dict0boot.h"
#include "dict0crea.h"
#include "dict0dict.h"
-#include "dict0mem.h"
#include "dict0stats.h"
+#include "ibuf0ibuf.h"
#include "fsp0file.h"
#include "fts0priv.h"
#include "mach0data.h"
@@ -865,18 +865,30 @@ err_exit:
return READ_OK;
}
-/** Check each tablespace found in the data dictionary.
-Then look at each table defined in SYS_TABLES that has a space_id > 0
-to find all the file-per-table tablespaces.
+/** Find the largest tablespace identifier stored in SYS_TABLES.
+Only records for which dict_sys_tables_rec_check() evaluates to false
+are considered; all other records are skipped.
+@param pcur persistent cursor used for scanning dict_sys.sys_tables
+@param mtr mini-transaction passed to the scan functions
+@return SELECT MAX(space) FROM sys_tables, or 0 if no record qualifies */
+static uint32_t dict_find_max_space_id(btr_pcur_t *pcur, mtr_t *mtr)
+{
+ uint32_t max_space_id= 0;
-In a crash recovery we already have some tablespace objects created from
-processing the REDO log. We will compare the
-space_id information in the data dictionary to what we find in the
-tablespace file. In addition, more validation will be done if recovery
-was needed and force_recovery is not set.
+ for (const rec_t *rec= dict_startscan_system(pcur, mtr, dict_sys.sys_tables);
+ rec; rec= dict_getnext_system_low(pcur, mtr))
+ if (!dict_sys_tables_rec_check(rec))
+ {
+ ulint len;
+ const byte *field=
+ rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+ ut_ad(len == 4);
+ max_space_id= std::max(max_space_id, mach_read_from_4(field));
+ }
+
+ return max_space_id;
+}
-We also scan the biggest space id, and store it to fil_system. */
-void dict_check_tablespaces_and_store_max_id()
+/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system.
+Open each data file if an encryption plugin has been loaded.
+
+@param spaces set of tablespace files to open */
+void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces)
{
uint32_t max_space_id = 0;
btr_pcur_t pcur;
@@ -888,6 +900,12 @@ void dict_check_tablespaces_and_store_max_id()
dict_sys.lock(SRW_LOCK_CALL);
+ if (!spaces && ibuf.empty
+ && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+ max_space_id = dict_find_max_space_id(&pcur, &mtr);
+ goto done;
+ }
+
for (const rec_t *rec = dict_startscan_system(&pcur, &mtr,
dict_sys.sys_tables);
rec; rec = dict_getnext_system_low(&pcur, &mtr)) {
@@ -919,14 +937,6 @@ void dict_check_tablespaces_and_store_max_id()
continue;
}
- if (flags2 & DICT_TF2_DISCARDED) {
- sql_print_information("InnoDB: Ignoring tablespace"
- " for %.*s because "
- "the DISCARD flag is set",
- static_cast<int>(len), field);
- continue;
- }
-
/* For tables or partitions using .ibd files, the flag
DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN
before MySQL 5.6.5. The flag should not have been
@@ -939,6 +949,19 @@ void dict_check_tablespaces_and_store_max_id()
continue;
}
+ if (spaces && spaces->find(uint32_t(space_id))
+ == spaces->end()) {
+ continue;
+ }
+
+ if (flags2 & DICT_TF2_DISCARDED) {
+ sql_print_information("InnoDB: Ignoring tablespace"
+ " for %.*s because "
+ "the DISCARD flag is set",
+ static_cast<int>(len), field);
+ continue;
+ }
+
const span<const char> name{field, len};
char* filepath = fil_make_filepath(nullptr, name,
@@ -971,6 +994,7 @@ void dict_check_tablespaces_and_store_max_id()
ut_free(filepath);
}
+done:
mtr.commit();
fil_set_max_space_id_if_bigger(max_space_id);
@@ -2246,22 +2270,10 @@ dict_load_tablespace(
/* The tablespace may already be open. */
table->space = fil_space_for_table_exists_in_mem(table->space_id,
table->flags);
- if (table->space) {
+ if (table->space || table->file_unreadable) {
return;
}
- if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) {
- table->file_unreadable = true;
- return;
- }
-
- if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) {
- ib::error() << "Failed to find tablespace for table "
- << table->name << " in the cache. Attempting"
- " to load the tablespace with space id "
- << table->space_id;
- }
-
/* Use the remote filepath if needed. This parameter is optional
in the call to fil_ibd_open(). If not supplied, it will be built
from the table->name. */
@@ -2284,6 +2296,12 @@ dict_load_tablespace(
if (!table->space) {
/* We failed to find a sensible tablespace file */
table->file_unreadable = true;
+
+ if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) {
+ sql_print_error("InnoDB: Failed to load tablespace "
+ ULINTPF " for table %s",
+ table->space_id, table->name);
+ }
}
ut_free(filepath);
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index 40969335..f11187b9 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -752,16 +752,9 @@ dict_stats_empty_index(
}
}
-/*********************************************************************//**
-Write all zeros (or 1 where it makes sense) into a table and its indexes'
-statistics members. The resulting stats correspond to an empty table. */
-static
-void
-dict_stats_empty_table(
-/*===================*/
- dict_table_t* table, /*!< in/out: table */
+void dict_stats_empty_table(
+ dict_table_t* table,
bool empty_defrag_stats)
- /*!< in: whether to empty defrag stats */
{
/* Initialize table/index level stats is now protected by
table level lock_mutex.*/
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index a66aac22..b0c34dc6 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -69,6 +69,8 @@ static recalc_pool_t recalc_pool;
/** Whether the global data structures have been initialized */
static bool stats_initialised;
+static THD *dict_stats_thd;
+
/*****************************************************************//**
Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
@@ -90,6 +92,9 @@ static void dict_stats_recalc_pool_deinit()
defrag_pool_t defrag_empty_pool;
recalc_pool.swap(recalc_empty_pool);
defrag_pool.swap(defrag_empty_pool);
+
+ if (dict_stats_thd)
+ destroy_background_thd(dict_stats_thd);
}
/*****************************************************************//**
@@ -361,52 +366,50 @@ done:
{
ut_ad(i->state == recalc::IN_PROGRESS);
recalc_pool.erase(i);
- const bool reschedule= !update_now && recalc_pool.empty();
if (err == DB_SUCCESS_LOCKED_REC)
recalc_pool.emplace_back(recalc{table_id, recalc::IDLE});
mysql_mutex_unlock(&recalc_pool_mutex);
- if (reschedule)
- dict_stats_schedule(MIN_RECALC_INTERVAL * 1000);
}
return update_now;
}
-static tpool::timer* dict_stats_timer;
-static std::mutex dict_stats_mutex;
+/** Check if the recalc pool is empty.
+The pool is inspected while holding recalc_pool_mutex; the result may be
+stale as soon as the mutex has been released.
+@return whether recalc_pool was empty at the time of the check */
+static bool is_recalc_pool_empty()
+{
+ mysql_mutex_lock(&recalc_pool_mutex);
+ bool empty= recalc_pool.empty();
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ return empty;
+}
+static tpool::timer* dict_stats_timer;
static void dict_stats_func(void*)
{
- THD *thd= innobase_create_background_thd("InnoDB statistics");
- set_current_thd(thd);
- while (dict_stats_process_entry_from_recalc_pool(thd)) {}
- dict_defrag_process_entries_from_defrag_pool(thd);
+ if (!dict_stats_thd) /* created on the first invocation and reused by
+ later ones; destroyed in dict_stats_recalc_pool_deinit() */
+ dict_stats_thd= innobase_create_background_thd("InnoDB statistics");
+ set_current_thd(dict_stats_thd);
+
+ while (dict_stats_process_entry_from_recalc_pool(dict_stats_thd)) {}
+ dict_defrag_process_entries_from_defrag_pool(dict_stats_thd);
+
+ innobase_reset_background_thd(dict_stats_thd);
set_current_thd(nullptr);
- destroy_background_thd(thd);
+ if (!is_recalc_pool_empty()) /* entries remain in recalc_pool:
+ schedule another run of this function via dict_stats_timer */
+ dict_stats_schedule(MIN_RECALC_INTERVAL * 1000);
}
void dict_stats_start()
{
- std::lock_guard<std::mutex> lk(dict_stats_mutex);
- if (!dict_stats_timer)
- dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
+ DBUG_ASSERT(!dict_stats_timer); /* the timer must not exist yet; the
+ mutex-protected "create if missing" check is replaced by this
+ assertion */
+ dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
}
static void dict_stats_schedule(int ms)
{
- std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock);
- /*
- Use try_lock() to avoid deadlock in dict_stats_shutdown(), which
- uses dict_stats_mutex too. If there is simultaneous timer reschedule,
- the first one will win, which is fine.
- */
- if (!lk.try_lock())
- {
- return;
- }
- if (dict_stats_timer)
+ if(dict_stats_timer) /* no-op while the timer pointer is null, that is,
+ before dict_stats_start() or after dict_stats_shutdown() */
dict_stats_timer->set_time(ms,0);
}
@@ -418,7 +421,6 @@ void dict_stats_schedule_now()
/** Shut down the dict_stats_thread. */
void dict_stats_shutdown()
{
- std::lock_guard<std::mutex> lk(dict_stats_mutex);
delete dict_stats_timer;
dict_stats_timer= 0;
}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 8a88f4e2..bd0ace7c 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -341,8 +341,9 @@ static bool fil_node_open_file_low(fil_node_t *node)
ut_ad(!node->is_open());
ut_ad(node->space->is_closing());
mysql_mutex_assert_owner(&fil_system.mutex);
- ulint type;
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+ ulint type;
switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
case 1:
case 2:
@@ -351,6 +352,9 @@ static bool fil_node_open_file_low(fil_node_t *node)
default:
type= OS_DATA_FILE;
}
+#else
+ constexpr auto type= OS_DATA_FILE;
+#endif
for (;;)
{
@@ -560,7 +564,7 @@ fil_space_extend_must_retry(
ut_ad(UT_LIST_GET_LAST(space->chain) == node);
ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
ut_ad(node->space == space);
- ut_ad(space->referenced() || space->is_being_truncated);
+ ut_ad(space->referenced());
*success = space->size >= size;
@@ -649,8 +653,7 @@ fil_space_extend_must_retry(
default:
ut_ad(space->purpose == FIL_TYPE_TABLESPACE
|| space->purpose == FIL_TYPE_IMPORT);
- if (space->purpose == FIL_TYPE_TABLESPACE
- && !space->is_being_truncated) {
+ if (space->purpose == FIL_TYPE_TABLESPACE) {
goto do_flush;
}
break;
@@ -735,12 +738,10 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
bool success= false;
const bool acquired= space->acquire();
mysql_mutex_lock(&fil_system.mutex);
- if (acquired || space->is_being_truncated)
- {
+ if (acquired)
while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
size, &success))
mysql_mutex_lock(&fil_system.mutex);
- }
mysql_mutex_unlock(&fil_system.mutex);
if (acquired)
space->release();
@@ -1903,9 +1904,10 @@ fil_ibd_create(
mtr.flag_wr_unlock();
log_write_up_to(lsn, true);
- ulint type;
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
"compatibility");
+#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+ ulint type;
switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
case 1:
case 2:
@@ -1914,6 +1916,9 @@ fil_ibd_create(
default:
type = OS_DATA_FILE;
}
+#else
+ constexpr auto type = OS_DATA_FILE;
+#endif
file = os_file_create(
innodb_data_file_key, path,
@@ -2184,8 +2189,6 @@ func_exit:
goto corrupted;
}
- os_file_get_last_error(operation_not_for_export,
- !operation_not_for_export);
if (!operation_not_for_export) {
goto corrupted;
}
@@ -2448,21 +2451,15 @@ fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space)
mysql_mutex_unlock(&fil_system.mutex);
if (space) {
- /* Compare the filename we are trying to open with the
- filename from the first node of the tablespace we opened
- previously. Fail if it is different. */
- fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
- if (0 != strcmp(innobase_basename(filename),
- innobase_basename(node->name))) {
- ib::info()
- << "Ignoring data file '" << filename
- << "' with space ID " << space->id
- << ". Another data file called " << node->name
- << " exists with the same space ID.";
- space = NULL;
- return(FIL_LOAD_ID_CHANGED);
- }
- return(FIL_LOAD_OK);
+ sql_print_information("InnoDB: Ignoring data file '%s'"
+ " with space ID " ULINTPF
+ ". Another data file called %s"
+ " exists"
+ " with the same space ID.",
+ filename, space->id,
+ UT_LIST_GET_FIRST(space->chain)->name);
+ space = NULL;
+ return FIL_LOAD_ID_CHANGED;
}
if (srv_operation == SRV_OPERATION_RESTORE) {
@@ -3027,11 +3024,9 @@ fil_space_validate_for_mtr_commit(
ut_ad(!is_predefined_tablespace(space->id));
/* We are serving mtr_commit(). While there is an active
- mini-transaction, we should have !space->stop_new_ops. This is
+ mini-transaction, we should have !space->is_stopping(). This is
guaranteed by meta-data locks or transactional locks. */
- ut_ad(!space->is_stopping()
- || space->is_being_truncated /* fil_truncate_prepare() */
- || space->referenced());
+ ut_ad(!space->is_stopping() || space->referenced());
}
#endif /* UNIV_DEBUG */
@@ -3080,7 +3075,7 @@ ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write()
and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
@param lsn checkpoint LSN
@return current LSN */
-lsn_t fil_names_clear(lsn_t lsn)
+ATTRIBUTE_COLD lsn_t fil_names_clear(lsn_t lsn)
{
mtr_t mtr;
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
index 16aea2a7..eaf4e04a 100644
--- a/storage/innobase/fil/fil0pagecompress.cc
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -49,11 +49,6 @@ Updated 14/02/2015
#include "buf0lru.h"
#include "ibuf0ibuf.h"
#include "zlib.h"
-#ifdef __linux__
-#include <linux/fs.h>
-#include <sys/ioctl.h>
-#include <fcntl.h>
-#endif
#include "row0mysql.h"
#include "lz4.h"
#include "lzo/lzo1x.h"
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
index cafff419..1c20efcd 100644
--- a/storage/innobase/fsp/fsp0file.cc
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -435,12 +435,22 @@ Datafile::validate_for_recovery()
return(err);
}
+ if (!m_space_id) {
+ m_space_id = recv_sys.dblwr.find_first_page(
+ m_filepath, m_handle);
+ if (m_space_id) {
+ m_defer= false;
+ goto free_first_page;
+ } else return err;
+ }
+
if (!m_defer) {
err = find_space_id();
if (err != DB_SUCCESS || m_space_id == 0) {
- ib::error() << "Datafile '" << m_filepath
- << "' is corrupted. Cannot determine "
- "the space ID from the first 64 pages.";
+ sql_print_error(
+ "InnoDB: Datafile '%s' is corrupted."
+ " Cannot determine the space ID from"
+ " the first 64 pages.", m_filepath);
return(err);
}
}
@@ -453,7 +463,7 @@ Datafile::validate_for_recovery()
m_space_id, m_filepath, m_handle)) {
return m_defer ? err : DB_CORRUPTION;
}
-
+free_first_page:
/* Free the previously read first page and then re-validate. */
free_first_page();
m_defer = false;
@@ -492,11 +502,11 @@ err_exit:
return DB_SUCCESS;
}
- ib::info() << error_txt << " in datafile: " << m_filepath
- << ", Space ID:" << m_space_id << ", Flags: "
- << m_flags;
+ sql_print_error("InnoDB: %s in datafile: %s, Space ID: "
+ UINT32PF ", " "Flags: " UINT32PF,
+ error_txt, m_filepath, m_space_id, m_flags);
m_is_valid = false;
- return(DB_CORRUPTION);
+ return DB_CORRUPTION;
}
/* Check if the whole page is blank. */
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 6c5c354e..87672a82 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -42,8 +42,6 @@ Created 11/29/1995 Heikki Tuuri
#include "fsp0types.h"
#include "log.h"
-typedef uint32_t page_no_t;
-
/** Returns the first extent descriptor for a segment.
We think of the extent lists of the segment catenated in the order
FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@@ -332,7 +330,7 @@ xdes_t*
xdes_get_descriptor_with_space_hdr(
buf_block_t* header,
const fil_space_t* space,
- page_no_t offset,
+ uint32_t offset,
mtr_t* mtr,
dberr_t* err = nullptr,
buf_block_t** desc_block = nullptr,
@@ -396,7 +394,7 @@ try to add new extents to the space free list
@param[out] err error code
@param[out] xdes extent descriptor page
@return the extent descriptor */
-static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
+static xdes_t *xdes_get_descriptor(const fil_space_t *space, uint32_t offset,
mtr_t *mtr, dberr_t *err= nullptr,
buf_block_t **xdes= nullptr)
{
@@ -842,8 +840,7 @@ fsp_fill_free_list(
if (i)
{
buf_block_t *f= buf_LRU_get_free_block(false);
- buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i),
- zip_size, mtr, f);
+ buf_block_t *block= buf_page_create(space, i, zip_size, mtr, f);
if (UNIV_UNLIKELY(block != f))
buf_pool.free_block(f);
fsp_init_file_page(space, block, mtr);
@@ -855,9 +852,7 @@ fsp_fill_free_list(
{
buf_block_t *f= buf_LRU_get_free_block(false);
buf_block_t *block=
- buf_page_create(space,
- static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET),
- zip_size, mtr, f);
+ buf_page_create(space, i + FSP_IBUF_BITMAP_OFFSET, zip_size, mtr, f);
if (UNIV_UNLIKELY(block != f))
buf_pool.free_block(f);
fsp_init_file_page(space, block, mtr);
@@ -1028,40 +1023,13 @@ fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
@param[in] offset page number of the allocated page
@param[in,out] mtr mini-transaction
@return block, initialized */
-static
-buf_block_t*
-fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+static buf_block_t* fsp_page_create(fil_space_t *space, uint32_t offset,
+ mtr_t *mtr)
{
- buf_block_t *block, *free_block;
-
- if (UNIV_UNLIKELY(space->is_being_truncated))
- {
- const page_id_t page_id{space->id, offset};
- buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
- mysql_mutex_lock(&buf_pool.mutex);
- block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash.get(page_id, chain));
- if (block && block->page.oldest_modification() <= 1)
- block= nullptr;
- mysql_mutex_unlock(&buf_pool.mutex);
-
- if (block)
- {
- ut_ad(block->page.buf_fix_count() >= 1);
- ut_ad(block->page.lock.x_lock_count() == 1);
- ut_ad(mtr->have_x_latch(*block));
- free_block= block;
- goto got_free_block;
- }
- }
-
- free_block= buf_LRU_get_free_block(false);
-got_free_block:
- block= buf_page_create(space, static_cast<uint32_t>(offset),
- space->zip_size(), mtr, free_block);
+ buf_block_t *free_block= buf_LRU_get_free_block(false),
+ *block= buf_page_create(space, offset, space->zip_size(), mtr, free_block); /* buf_page_create()
+ may hand back a block other than free_block; the unused free_block
+ is then returned via buf_pool.free_block() below */
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
-
fsp_init_file_page(space, block, mtr);
return block;
}
@@ -1179,7 +1147,7 @@ MY_ATTRIBUTE((nonnull, warn_unused_result))
@param[in] offset page number in the extent
@param[in,out] mtr mini-transaction
@return error code */
-static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset,
+static dberr_t fsp_free_extent(fil_space_t* space, uint32_t offset,
mtr_t* mtr)
{
ut_ad(space->is_owner());
@@ -1216,7 +1184,7 @@ The page is marked as free and clean.
@param[in] offset page number
@param[in,out] mtr mini-transaction
@return error code */
-static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+static dberr_t fsp_free_page(fil_space_t *space, uint32_t offset, mtr_t *mtr)
{
xdes_t* descr;
ulint frag_n_used;
@@ -1756,7 +1724,6 @@ page_alloc:
ut_d(const auto x = block->page.lock.x_lock_count());
ut_ad(x || block->page.lock.not_recursive());
- ut_ad(x == 1 || space->is_being_truncated);
ut_ad(x <= 2);
ut_ad(!fil_page_get_type(block->page.frame));
mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
@@ -2493,7 +2460,7 @@ fseg_free_page_low(
fseg_inode_t* seg_inode,
buf_block_t* iblock,
fil_space_t* space,
- page_no_t offset,
+ uint32_t offset,
mtr_t* mtr
#ifdef BTR_CUR_HASH_ADAPT
,bool ahi=false
@@ -2859,7 +2826,7 @@ fseg_free_step(
return true;
}
- page_no_t page_no = fseg_get_nth_frag_page_no(inode, n);
+ uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
if (fseg_free_page_low(inode, iblock, space, page_no, mtr
#ifdef BTR_CUR_HASH_ADAPT
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index e4a43e48..4ac9da50 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -33,6 +33,7 @@ Refactored 2013-7-26 by Kevin Lewis
#include "os0file.h"
#include "row0mysql.h"
#include "buf0dblwr.h"
+#include "log.h"
/** The server header file is included to access opt_initialize global variable.
If server passes the option for create/open DB to SE, we should remove such
@@ -568,7 +569,7 @@ inline dberr_t SysTablespace::read_lsn_and_check_flags()
}
err = it->read_first_page(
- m_ignore_read_only ? false : srv_read_only_mode);
+ m_ignore_read_only && srv_read_only_mode);
if (err != DB_SUCCESS) {
return(err);
@@ -582,47 +583,62 @@ inline dberr_t SysTablespace::read_lsn_and_check_flags()
/* Check the contents of the first page of the
first datafile. */
- for (int retry = 0; retry < 2; ++retry) {
+ err = it->validate_first_page();
- err = it->validate_first_page();
-
- if (err != DB_SUCCESS
- && (retry == 1
- || recv_sys.dblwr.restore_first_page(
+ if (err != DB_SUCCESS) {
+ if (recv_sys.dblwr.restore_first_page(
it->m_space_id, it->m_filepath,
- it->handle()))) {
-
+ it->handle())) {
it->close();
-
return(err);
}
+ err = it->read_first_page(
+ m_ignore_read_only && srv_read_only_mode);
}
/* Make sure the tablespace space ID matches the
space ID on the first page of the first datafile. */
- if (space_id() != it->m_space_id) {
-
- ib::error()
- << "The data file '" << it->filepath()
- << "' has the wrong space ID. It should be "
- << space_id() << ", but " << it->m_space_id
- << " was found";
-
+ if (err != DB_SUCCESS || space_id() != it->m_space_id) {
+ sql_print_error("InnoDB: The data file '%s'"
+ " has the wrong space ID."
+ " It should be " UINT32PF ", but " UINT32PF
+ " was found", it->filepath(),
+ space_id(), it->m_space_id);
it->close();
-
- return(err);
+ return err;
}
- if (srv_operation == SRV_OPERATION_NORMAL) {
+ if (srv_force_recovery != 6
+ && srv_operation == SRV_OPERATION_NORMAL
+ && !log_sys.next_checkpoint_lsn
+ && log_sys.format == log_t::FORMAT_3_23) {
+
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
/* Prepare for possible upgrade from 0-sized ib_logfile0. */
- ut_ad(!log_sys.next_checkpoint_lsn);
log_sys.next_checkpoint_lsn = mach_read_from_8(
it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/);
+ if (log_sys.next_checkpoint_lsn < 8204) {
+ /* Before MDEV-14425, InnoDB had a minimum LSN
+ of 8192+12=8204. Likewise, mariadb-backup
+ --prepare would create an empty ib_logfile0
+ after applying the log. We will allow an
+ upgrade from such an empty log. */
+ sql_print_error("InnoDB: ib_logfile0 is "
+ "empty, and LSN is unknown.");
+ err = DB_CORRUPTION;
+ } else {
+ log_sys.last_checkpoint_lsn =
+ recv_sys.lsn = recv_sys.file_checkpoint =
+ log_sys.next_checkpoint_lsn;
+ log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn);
+ log_sys.next_checkpoint_no = 0;
+ }
+
+ log_sys.latch.wr_unlock();
}
it->close();
-
- return(DB_SUCCESS);
+ return err;
}
/** Check if a file can be opened in the correct mode.
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index 8ca8681b..0df9a7de 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -304,6 +304,8 @@ rtr_pcur_getnext_from_path(
break;
}
+ buf_page_make_young_if_needed(&block->page);
+
page = buf_block_get_frame(block);
page_ssn = page_get_ssn_id(page);
@@ -683,6 +685,8 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
return err;
}
+ buf_page_make_young_if_needed(&block->page);
+
const page_t *page= buf_block_get_frame(block);
#ifdef UNIV_ZIP_DEBUG
if (rw_latch != RW_NO_LATCH) {
@@ -1703,6 +1707,8 @@ corrupted:
goto func_exit;
}
+ buf_page_make_young_if_needed(&page_cursor->block->page);
+
/* Get the page SSN */
page = buf_block_get_frame(page_cursor->block);
page_ssn = page_get_ssn_id(page);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 21bf10a1..407834f2 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1575,7 +1575,8 @@ static void innodb_drop_database(handlerton*, char *path)
ibuf_delete_for_discarded_space(id);
/* Any changes must be persisted before we return. */
- log_write_up_to(mtr.commit_lsn(), true);
+ if (mtr.commit_lsn())
+ log_write_up_to(mtr.commit_lsn(), true);
}
my_free(namebuf);
@@ -2080,7 +2081,7 @@ all_fail:
ut_d(purge_sys.resume_FTS());
}
-static void innodb_ddl_recovery_done(handlerton*)
+static int innodb_ddl_recovery_done(handlerton*)
{
ut_ad(!ddl_recovery_done);
ut_d(ddl_recovery_done= true);
@@ -2091,6 +2092,7 @@ static void innodb_ddl_recovery_done(handlerton*)
drop_garbage_tables_after_restore();
srv_init_purge_tasks();
}
+ return 0;
}
/********************************************************************//**
@@ -4001,7 +4003,7 @@ static int innodb_init_params()
data_mysql_default_charset_coll = (ulint) default_charset_info->number;
-#ifndef _WIN32
+#ifdef HAVE_FCNTL_DIRECT
if (srv_use_atomic_writes && my_may_have_atomic_write) {
/*
Force O_DIRECT on Unixes (on Windows writes are always
@@ -4026,11 +4028,6 @@ static int innodb_init_params()
}
#endif
- if (srv_read_only_mode) {
- ib::info() << "Started in read only mode";
- srv_use_doublewrite_buf = FALSE;
- }
-
#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
@@ -4046,9 +4043,7 @@ static int innodb_init_params()
}
#endif
-#ifndef _WIN32
- ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
-#else
+#ifdef _WIN32
switch (srv_file_flush_method) {
case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
@@ -4059,6 +4054,8 @@ static int innodb_init_params()
default:
ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
}
+#else
+ ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
#endif
innodb_buffer_pool_size_init();
@@ -7817,20 +7814,6 @@ ha_innobase::write_row(
#endif
if ((error_result = update_auto_increment())) {
- /* We don't want to mask autoinc overflow errors. */
-
- /* Handle the case where the AUTOINC sub-system
- failed during initialization. */
- if (m_prebuilt->autoinc_error == DB_UNSUPPORTED) {
- error_result = ER_AUTOINC_READ_FAILED;
- /* Set the error message to report too. */
- my_error(ER_AUTOINC_READ_FAILED, MYF(0));
- goto func_exit;
- } else if (m_prebuilt->autoinc_error != DB_SUCCESS) {
- error = m_prebuilt->autoinc_error;
- goto report_error;
- }
-
/* MySQL errors are passed straight back. */
goto func_exit;
}
@@ -7968,7 +7951,6 @@ set_max_autoinc:
}
}
-report_error:
/* Cleanup and exit. */
if (error == DB_TABLESPACE_DELETED) {
ib_senderrf(
@@ -11809,8 +11791,6 @@ index_bad:
/* Set the flags2 when create table or alter tables */
m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- m_flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
DBUG_RETURN(true);
}
@@ -14710,12 +14690,7 @@ ha_innobase::info_low(
DBUG_ASSERT(ib_table->get_ref_count() > 0);
if (!ib_table->is_readable()) {
- ib_table->stats_mutex_lock();
- ib_table->stat_initialized = true;
- ib_table->stat_n_rows = 0;
- ib_table->stat_clustered_index_size = 0;
- ib_table->stat_sum_of_other_index_sizes = 0;
- ib_table->stats_mutex_unlock();
+ dict_stats_empty_table(ib_table, true);
}
if (flag & HA_STATUS_TIME) {
@@ -15674,15 +15649,17 @@ ha_innobase::extra(
{
/* Warning: since it is not sure that MariaDB calls external_lock()
before calling this function, m_prebuilt->trx can be obsolete! */
- trx_t* trx = check_trx_exists(ha_thd());
+ trx_t* trx;
switch (operation) {
case HA_EXTRA_FLUSH:
+ (void)check_trx_exists(ha_thd());
if (m_prebuilt->blob_heap) {
row_mysql_prebuilt_free_blob_heap(m_prebuilt);
}
break;
case HA_EXTRA_RESET_STATE:
+ trx = check_trx_exists(ha_thd());
reset_template();
trx->duplicates = 0;
stmt_boundary:
@@ -15691,18 +15668,23 @@ ha_innobase::extra(
trx->bulk_insert = false;
break;
case HA_EXTRA_NO_KEYREAD:
+ (void)check_trx_exists(ha_thd());
m_prebuilt->read_just_key = 0;
break;
case HA_EXTRA_KEYREAD:
+ (void)check_trx_exists(ha_thd());
m_prebuilt->read_just_key = 1;
break;
case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ (void)check_trx_exists(ha_thd());
m_prebuilt->keep_other_fields_on_keyread = 1;
break;
case HA_EXTRA_INSERT_WITH_UPDATE:
+ trx = check_trx_exists(ha_thd());
trx->duplicates |= TRX_DUP_IGNORE;
goto stmt_boundary;
case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ trx = check_trx_exists(ha_thd());
trx->duplicates &= ~TRX_DUP_IGNORE;
if (trx->is_bulk_insert()) {
/* Allow a subsequent INSERT into an empty table
@@ -15714,9 +15696,11 @@ ha_innobase::extra(
}
goto stmt_boundary;
case HA_EXTRA_WRITE_CAN_REPLACE:
+ trx = check_trx_exists(ha_thd());
trx->duplicates |= TRX_DUP_REPLACE;
goto stmt_boundary;
case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ trx = check_trx_exists(ha_thd());
trx->duplicates &= ~TRX_DUP_REPLACE;
if (trx->is_bulk_insert()) {
/* Allow a subsequent INSERT into an empty table
@@ -15725,6 +15709,7 @@ ha_innobase::extra(
}
goto stmt_boundary;
case HA_EXTRA_BEGIN_ALTER_COPY:
+ trx = check_trx_exists(ha_thd());
m_prebuilt->table->skip_alter_undo = 1;
if (m_prebuilt->table->is_temporary()
|| !m_prebuilt->table->versioned_by_id()) {
@@ -15737,8 +15722,10 @@ ha_innobase::extra(
.first->second.set_versioned(0);
break;
case HA_EXTRA_END_ALTER_COPY:
+ trx = check_trx_exists(ha_thd());
m_prebuilt->table->skip_alter_undo = 0;
- if (!m_prebuilt->table->is_temporary()) {
+ if (!m_prebuilt->table->is_temporary()
+ && !high_level_read_only) {
log_buffer_flush_to_disk();
}
break;
@@ -18270,11 +18257,18 @@ static
void
buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
{
- if (*(my_bool*) save) {
- mysql_mutex_unlock(&LOCK_global_system_variables);
- buf_flush_sync();
- mysql_mutex_lock(&LOCK_global_system_variables);
- }
+ if (!*(my_bool*) save)
+ return;
+ const uint s= srv_fil_make_page_dirty_debug;
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ if (s)
+ buf_flush_sync();
+ else
+ {
+ while (buf_flush_list_space(fil_system.sys_space, nullptr));
+ os_aio_wait_until_no_pending_writes(true);
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
}
/** Override current MERGE_THRESHOLD setting for all indexes at dictionary
@@ -19368,8 +19362,10 @@ static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size,
10 << 20, 10 << 20,
1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0);
+static ulong innodb_purge_rseg_truncate_frequency;
+
static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency,
- srv_purge_rseg_truncate_frequency,
+ innodb_purge_rseg_truncate_frequency,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED,
"Deprecated parameter with no effect",
NULL, NULL, 128, 1, 128, 0);
@@ -20606,6 +20602,10 @@ Compare_keys ha_innobase::compare_key_parts(
if (old_part.length >= new_part.length)
return Compare_keys::NotEqual;
+ if (old_part.length == old_field.key_length() &&
+ new_part.length != new_field.length)
+ return Compare_keys::NotEqual;
+
return Compare_keys::EqualButKeyPartLength;
}
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 40370ac5..1401136f 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -2317,12 +2317,16 @@ innodb_instant_alter_column_allowed_reason:
}
}
+ bool need_rebuild = false;
+
switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
case ALTER_OPTIONS:
- if (alter_options_need_rebuild(ha_alter_info, table)) {
+ if ((srv_file_per_table && !m_prebuilt->table->space_id)
+ || alter_options_need_rebuild(ha_alter_info, table)) {
reason_rebuild = my_get_err_msg(
ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD);
ha_alter_info->unsupported_reason = reason_rebuild;
+ need_rebuild= true;
break;
}
/* fall through */
@@ -2434,7 +2438,7 @@ innodb_instant_alter_column_allowed_reason:
/* We should be able to do the operation in-place.
See if we can do it online (LOCK=NONE) or without rebuild. */
- bool online = true, need_rebuild = false;
+ bool online = true;
const uint fulltext_indexes = innobase_fulltext_exist(altered_table);
/* Fix the key parts. */
@@ -4338,7 +4342,8 @@ static void unlock_and_close_files(const std::vector<pfs_os_file_t> &deleted,
row_mysql_unlock_data_dictionary(trx);
for (pfs_os_file_t d : deleted)
os_file_close(d);
- log_write_up_to(trx->commit_lsn, true);
+ if (trx->commit_lsn)
+ log_write_up_to(trx->commit_lsn, true);
}
/** Commit a DDL transaction and unlink any deleted files. */
@@ -4681,11 +4686,13 @@ innobase_build_col_map(
col_map[old_i - num_old_v] = i;
if (!old_table->versioned()
|| !altered_table->versioned()) {
- } else if (old_i == old_table->vers_start) {
- new_table->vers_start = (i + num_v)
+ } else if (old_i - num_old_v == old_table->vers_start) {
+ ut_ad(field->vers_sys_start());
+ new_table->vers_start = i
& dict_index_t::MAX_N_FIELDS;
- } else if (old_i == old_table->vers_end) {
- new_table->vers_end = (i + num_v)
+ } else if (old_i - num_old_v == old_table->vers_end) {
+ ut_ad(field->vers_sys_end());
+ new_table->vers_end = i
& dict_index_t::MAX_N_FIELDS;
}
goto found_col;
@@ -6217,24 +6224,20 @@ empty_table:
/* Convert the table to the instant ALTER TABLE format. */
mtr.commit();
mtr.start();
- index->set_modified(mtr);
- if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr,
+ if (buf_block_t* root = btr_root_block_get(index, RW_S_LATCH, &mtr,
&err)) {
if (fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX) {
DBUG_ASSERT("wrong page type" == 0);
err = DB_CORRUPTION;
goto func_exit;
}
-
- btr_set_instant(root, *index, &mtr);
- mtr.commit();
- mtr.start();
- index->set_modified(mtr);
- err = row_ins_clust_index_entry_low(
- BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index,
- index->n_uniq, entry, 0, thr);
}
+ mtr.commit();
+ mtr.start();
+ err = row_ins_clust_index_entry_low(
+ BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index,
+ index->n_uniq, entry, 0, thr);
goto func_exit;
}
@@ -7775,6 +7778,7 @@ bool check_col_is_in_fk_indexes(
for (const auto &a : add_fk)
{
+ if (!a->foreign_index) continue;
for (ulint i= 0; i < a->n_fields; i++)
{
if (a->foreign_index->fields[i].col == col)
@@ -11666,7 +11670,6 @@ foreign_fail:
}
unlock_and_close_files(deleted, trx);
- log_write_up_to(trx->commit_lsn, true);
DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
DBUG_SUICIDE(););
trx->free();
@@ -11723,7 +11726,6 @@ foreign_fail:
}
unlock_and_close_files(deleted, trx);
- log_write_up_to(trx->commit_lsn, true);
DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
DBUG_SUICIDE(););
trx->free();
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index b00308d7..711144e3 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -4539,6 +4539,15 @@ i_s_dict_fill_sys_tables(
DBUG_RETURN(0);
}
+/** Handle the error for information schema query
+@param err error value
+@param thd thread
+@return 0 if the query was interrupted (killed), otherwise err */
+static int i_s_sys_error_handling(int err, THD *thd)
+{
+ return thd_kill_level(thd) ? 0 : err;
+}
+
/** Convert one SYS_TABLES record to dict_table_t.
@param pcur persistent cursor position on SYS_TABLES record
@param mtr mini-transaction (nullptr=use the dict_sys cache)
@@ -4587,6 +4596,7 @@ i_s_sys_tables_fill_table(
{
btr_pcur_t pcur;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_tables_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -4616,8 +4626,15 @@ i_s_sys_tables_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_tables(thd, table_rec,
- tables->table);
+ err = i_s_dict_fill_sys_tables(
+ thd, table_rec, tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ if (table_rec) {
+ dict_mem_table_free(table_rec);
+ }
+ goto func_exit;
+ }
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
@@ -4635,8 +4652,10 @@ i_s_sys_tables_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
@@ -4807,6 +4826,7 @@ i_s_sys_tables_fill_table_stats(
btr_pcur_t pcur;
const rec_t* rec;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_tables_fill_table_stats");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -4832,8 +4852,12 @@ i_s_sys_tables_fill_table_stats(
&table_rec);
if (UNIV_LIKELY(!err_msg)) {
- i_s_dict_fill_sys_tablestats(thd, table_rec,
+ err = i_s_dict_fill_sys_tablestats(thd, table_rec,
tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
} else {
ut_ad(!table_rec);
dict_sys.unlock();
@@ -4851,8 +4875,9 @@ i_s_sys_tables_fill_table_stats(
mtr.commit();
dict_sys.unlock();
-
- DBUG_RETURN(0);
+func_exit:
+ ut_free(pcur.old_rec_buf);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
@@ -5024,6 +5049,7 @@ i_s_sys_indexes_fill_table(
const rec_t* rec;
mem_heap_t* heap;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_indexes_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5059,11 +5085,13 @@ i_s_sys_indexes_fill_table(
dict_sys.unlock();
if (!err_msg) {
- if (int err = i_s_dict_fill_sys_indexes(
- thd, table_id, space_id, &index_rec,
- tables->table)) {
- mem_heap_free(heap);
- DBUG_RETURN(err);
+ err = i_s_dict_fill_sys_indexes(
+ thd, table_id, space_id,
+ &index_rec,
+ tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
}
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
@@ -5081,9 +5109,11 @@ i_s_sys_indexes_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
mem_heap_free(heap);
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes
@@ -5232,6 +5262,7 @@ i_s_sys_columns_fill_table(
const char* col_name;
mem_heap_t* heap;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_columns_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5263,9 +5294,14 @@ i_s_sys_columns_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_columns(thd, table_id, col_name,
- &column_rec, nth_v_col,
- tables->table);
+ err = i_s_dict_fill_sys_columns(
+ thd, table_id, col_name,
+ &column_rec, nth_v_col,
+ tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
@@ -5282,9 +5318,11 @@ i_s_sys_columns_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
mem_heap_free(heap);
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
@@ -5416,6 +5454,7 @@ i_s_sys_virtual_fill_table(
ulint pos;
ulint base_pos;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_virtual_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5444,8 +5483,13 @@ i_s_sys_virtual_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos,
- tables->table);
+ err = i_s_dict_fill_sys_virtual(
+ thd, table_id, pos, base_pos,
+ tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
@@ -5462,6 +5506,9 @@ i_s_sys_virtual_fill_table(
dict_sys.unlock();
DBUG_RETURN(0);
+func_exit:
+ ut_free(pcur.old_rec_buf);
+ DBUG_RETURN(err);
}
/** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_virtual
@@ -5589,6 +5636,7 @@ i_s_sys_fields_fill_table(
mem_heap_t* heap;
index_id_t last_id;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_fields_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5624,8 +5672,13 @@ i_s_sys_fields_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
- pos, tables->table);
+ err = i_s_dict_fill_sys_fields(
+ thd, index_id, &field_rec,
+ pos, tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
last_id = index_id;
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
@@ -5643,9 +5696,11 @@ i_s_sys_fields_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
mem_heap_free(heap);
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields
@@ -5782,6 +5837,7 @@ i_s_sys_foreign_fill_table(
const rec_t* rec;
mem_heap_t* heap;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_foreign_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5809,8 +5865,12 @@ i_s_sys_foreign_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_foreign(thd, &foreign_rec,
- tables->table);
+ err = i_s_dict_fill_sys_foreign(
+ thd, &foreign_rec, tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
@@ -5827,9 +5887,11 @@ i_s_sys_foreign_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
mem_heap_free(heap);
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
@@ -5963,6 +6025,7 @@ i_s_sys_foreign_cols_fill_table(
const rec_t* rec;
mem_heap_t* heap;
mtr_t mtr;
+ int err = 0;
DBUG_ENTER("i_s_sys_foreign_cols_fill_table");
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
@@ -5994,9 +6057,13 @@ i_s_sys_foreign_cols_fill_table(
dict_sys.unlock();
if (!err_msg) {
- i_s_dict_fill_sys_foreign_cols(
- thd, name, for_col_name, ref_col_name, pos,
- tables->table);
+ err = i_s_dict_fill_sys_foreign_cols(
+ thd, name, for_col_name,
+ ref_col_name, pos, tables->table);
+ if (err) {
+ err = i_s_sys_error_handling(err, thd);
+ goto func_exit;
+ }
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
@@ -6013,9 +6080,11 @@ i_s_sys_foreign_cols_fill_table(
mtr.commit();
dict_sys.unlock();
+func_exit:
mem_heap_free(heap);
+ ut_free(pcur.old_rec_buf);
- DBUG_RETURN(0);
+ DBUG_RETURN(err);
}
/*******************************************************************//**
Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols
@@ -6218,6 +6287,8 @@ static int i_s_sys_tablespaces_fill_table(THD *thd, TABLE_LIST *tables, Item*)
mysql_mutex_unlock(&fil_system.mutex);
if (err == DB_SUCCESS)
err= i_s_sys_tablespaces_fill(thd, *fil_system.temp_space, tables->table);
+ else
+ err = i_s_sys_error_handling(err, thd);
DBUG_RETURN(err);
}
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index b9e94a67..4ec07b81 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -309,8 +309,13 @@ ibuf_header_page_get(
buf_block_t* block = buf_page_get(
page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
0, RW_X_LATCH, mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ return nullptr;
+ }
+
+ buf_page_make_young_if_needed(&block->page);
- return block ? block->page.frame : nullptr;
+ return block->page.frame;
}
/** Acquire the change buffer root page.
@@ -326,7 +331,12 @@ static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr)
buf_block_t *block=
buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO},
0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err);
- ut_ad(!block || ibuf.empty == page_is_empty(block->page.frame));
+ if (block)
+ {
+ ut_ad(ibuf.empty == page_is_empty(block->page.frame));
+ buf_page_make_young_if_needed(&block->page);
+ }
+
return block;
}
@@ -408,7 +418,8 @@ err_exit:
+ header_page->page.frame, &ibuf.seg_size, &mtr);
do {
- DBUG_EXECUTE_IF("intermittent_read_failure", continue;);
+ IF_DBUG(if (_db_keyword_(nullptr, "intermittent_read_failure",
+ 1)) continue,);
ut_ad(ibuf.seg_size >= 2);
} while (0);
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 5a0401fa..b42c543c 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -89,10 +89,12 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
@param[in] merge whether change buffer merge should be attempted
@param[in,out] mtr mini-transaction
@param[out] err error code
+@param[out] first set if this is a first-time access to the page
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, rw_lock_type_t mode, bool merge,
- mtr_t *mtr, dberr_t *err= nullptr);
+ mtr_t *mtr, dberr_t *err= nullptr,
+ bool *first= nullptr);
/**************************************************************//**
Gets the index id field of a page.
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 332b2039..cd7cc294 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -262,8 +262,6 @@ buf_block_t*
buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
buf_block_t *free_block);
-/** Move a block to the start of the LRU list. */
-void buf_page_make_young(buf_page_t *bpage);
/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
@@ -285,15 +283,6 @@ there is danger of dropping from the buffer pool.
@return true if bpage should be made younger */
inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
-/** Move a page to the start of the buffer pool LRU list if it is too old.
-@param[in,out] bpage buffer pool page */
-inline void buf_page_make_young_if_needed(buf_page_t *bpage)
-{
- if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
- buf_page_make_young(bpage);
- }
-}
-
/********************************************************************//**
Increments the modify clock of a frame by 1. The caller must (1) own the
buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
@@ -656,12 +645,9 @@ public:
access_time= 0;
}
- void set_os_unused()
+ void set_os_unused() const
{
MEM_NOACCESS(frame, srv_page_size);
-#ifdef MADV_FREE
- madvise(frame, srv_page_size, MADV_FREE);
-#endif
}
void set_os_used() const
@@ -1301,6 +1287,11 @@ public:
/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
inline void resize();
+#ifdef __linux__
+ /** Collect garbage (release pages from the LRU list) */
+ inline void garbage_collect();
+#endif
+
/** @return whether resize() is in progress */
bool resize_in_progress() const
{
@@ -1507,10 +1498,8 @@ public:
n_chunks_new / 4 * chunks->size;
}
- /** @return whether the buffer pool has run out */
- TPOOL_SUPPRESS_TSAN
- bool ran_out() const
- { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+ /** @return whether the buffer pool is running low */
+ bool need_LRU_eviction() const;
/** @return whether the buffer pool is shrinking */
inline bool is_shrinking() const
@@ -1836,6 +1825,9 @@ public:
Set whenever the free list grows, along with a broadcast of done_free.
Protected by buf_pool.mutex. */
Atomic_relaxed<bool> try_LRU_scan;
+ /** Whether we have warned about running out of buffer pool */
+ std::atomic_flag LRU_warned;
+
/* @} */
/** @name LRU replacement algorithm fields */
@@ -1898,7 +1890,8 @@ public:
a delete-buffering operation is pending. Protected by mutex. */
buf_page_t watch[innodb_purge_threads_MAX + 1];
/** Reserve a buffer. */
- buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+ buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads)
+ { return io_buf.reserve(wait_for_reads); }
/** Remove a block from flush_list.
@param bpage buffer pool page */
@@ -1933,7 +1926,7 @@ private:
void close();
/** Reserve a buffer */
- buf_tmp_buffer_t *reserve();
+ buf_tmp_buffer_t *reserve(bool wait_for_reads);
} io_buf;
/** whether resize() is in the critical path */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index 9932b0e5..6e7662d9 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -105,7 +105,8 @@ public:
If we are upgrading from a version before MySQL 4.1, then this
function performs the necessary update operations to support
innodb_file_per_table. If we are in a crash recovery, this function
- loads the pages from double write buffer into memory.
+ loads the pages from double write buffer which are not older than
+ the checkpoint into memory.
@param file File handle
@param path Path name of file
@return DB_SUCCESS or error code */
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index aec08e77..28410276 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -108,6 +108,16 @@ buf_LRU_add_block(
blocks in the LRU list, else put to the
start; if the LRU list is very short, added to
the start regardless of this parameter */
+
+/** Move a block to the start of the buf_pool.LRU list.
+@param bpage buffer pool page */
+void buf_page_make_young(buf_page_t *bpage);
+/** Flag a page accessed in buf_pool and move it to the start of buf_pool.LRU
+if it is too old.
+@param bpage buffer pool page
+@return whether this is not the first access */
+bool buf_page_make_young_if_needed(buf_page_t *bpage);
+
/******************************************************************//**
Adds a block to the LRU list of decompressed zip pages. */
void
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index f7d33d5b..3143aafd 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -35,22 +35,16 @@ Created 4/24/1996 Heikki Tuuri
#include "btr0types.h"
#include <deque>
+#include <set>
/** A stack of table names related through foreign key constraints */
typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
-/** Check each tablespace found in the data dictionary.
-Then look at each table defined in SYS_TABLES that has a space_id > 0
-to find all the file-per-table tablespaces.
+/** Check MAX(SPACE) FROM SYS_TABLES and store it in fil_system.
+Open each data file if an encryption plugin has been loaded.
-In a crash recovery we already have some tablespace objects created from
-processing the REDO log. We will compare the
-space_id information in the data dictionary to what we find in the
-tablespace file. In addition, more validation will be done if recovery
-was needed and force_recovery is not set.
-
-We also scan the biggest space id, and store it to fil_system. */
-void dict_check_tablespaces_and_store_max_id();
+@param spaces set of tablespace files to open */
+void dict_check_tablespaces_and_store_max_id(const std::set<uint32_t> *spaces);
/** Make sure the data_file_name is saved in dict_table_t if needed.
@param[in,out] table Table object */
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 0dc1b984..3b006daf 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -235,4 +235,13 @@ dict_stats_report_error(dict_table_t* table, bool defragment = false)
void test_dict_stats_all();
#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */
+/** Write all zeros (or 1 where it makes sense) into a table
+and its indexes' statistics members. The resulting stats
+correspond to an empty table.
+@param table table stats to be emptied
+@param empty_defrag_stats empty the defrag stats */
+void
+dict_stats_empty_table(
+ dict_table_t* table,
+ bool empty_defrag_stats);
#endif /* dict0stats_h */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 6f58e3c1..cdc32515 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -359,8 +359,6 @@ struct fil_space_t final
lsn_t max_lsn;
/** tablespace identifier */
uint32_t id;
- /** whether undo tablespace truncation is in progress */
- bool is_being_truncated;
fil_type_t purpose;/*!< purpose */
UT_LIST_BASE_NODE_T(fil_node_t) chain;
/*!< base node for the file chain */
@@ -440,6 +438,8 @@ private:
/** LSN of freeing last page; protected by freed_range_mutex */
lsn_t last_freed_lsn;
+ /** LSN of undo tablespace creation or 0; protected by latch */
+ lsn_t create_lsn;
public:
/** @return whether doublewrite buffering is needed */
inline bool use_doublewrite() const;
@@ -447,6 +447,12 @@ public:
/** @return whether a page has been freed */
inline bool is_freed(uint32_t page);
+ /** Set create_lsn. */
+ inline void set_create_lsn(lsn_t lsn);
+
+ /** @return the latest tablespace rebuild LSN, or 0 */
+ lsn_t get_create_lsn() const { return create_lsn; }
+
/** Apply freed_ranges to the file.
@param writable whether the file is writable
@return number of pages written or hole-punched */
@@ -524,9 +530,6 @@ public:
/** Note that operations on the tablespace must stop. */
inline void set_stopping();
- /** Note that operations on the tablespace can resume after truncation */
- inline void clear_stopping();
-
/** Drop the tablespace and wait for any pending operations to cease
@param id tablespace identifier
@param detached_handle pointer to file to be closed later, or nullptr
@@ -1555,14 +1558,6 @@ inline void fil_space_t::set_stopping()
#endif
}
-inline void fil_space_t::clear_stopping()
-{
- mysql_mutex_assert_owner(&fil_system.mutex);
- static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
- ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
- ut_ad((n & STOPPING) == STOPPING_WRITES);
-}
-
/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
@@ -1802,7 +1797,7 @@ bool fil_comp_algo_loaded(ulint comp_algo);
and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
@param lsn checkpoint LSN
@return current LSN */
-lsn_t fil_names_clear(lsn_t lsn);
+ATTRIBUTE_COLD lsn_t fil_names_clear(lsn_t lsn);
#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
void test_make_filepath();
diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl
index da14cfcb..3cb09c92 100644
--- a/storage/innobase/include/fts0priv.inl
+++ b/storage/innobase/include/fts0priv.inl
@@ -34,29 +34,6 @@ fts_write_object_id(
ib_id_t id, /* in: a table/index id */
char* str) /* in: buffer to write the id to */
{
-
-#ifdef _WIN32
-
- DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
- return(sprintf(str, UINT64PFx, id)););
-
- /* Use this to construct old(5.6.14 and 5.7.3) windows
- ambiguous aux table names */
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- return(sprintf(str, "%016llu", (ulonglong) id)););
-
-#else /* _WIN32 */
-
- /* Use this to construct old(5.6.14 and 5.7.3) windows
- ambiguous aux table names */
- DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
- return(sprintf(str, "%016llu", (ulonglong) id)););
-
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- return(sprintf(str, "%016llx", (ulonglong) id)););
-
-#endif /* _WIN32 */
-
return(sprintf(str, "%016llx", (ulonglong) id));
}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index f873eabf..54851ca0 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -79,13 +79,6 @@ ATTRIBUTE_COLD void log_make_checkpoint();
/** Make a checkpoint at the latest lsn on shutdown. */
ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
-/**
-Checks that there is enough free space in the log to start a new query step.
-Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
-function may only be called if the calling thread owns no synchronization
-objects! */
-ATTRIBUTE_COLD void log_check_margins();
-
/******************************************************//**
Prints info of the log. */
void
@@ -179,24 +172,33 @@ private:
std::atomic<lsn_t> flushed_to_disk_lsn;
/** log sequence number when log resizing was initiated, or 0 */
std::atomic<lsn_t> resize_lsn;
- /** set when there may be need to flush the log buffer, or
- preflush buffer pool pages, or initiate a log checkpoint.
+ /** set when there may be need to initiate a log checkpoint.
This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
- std::atomic<bool> check_flush_or_checkpoint_;
-
+ std::atomic<bool> need_checkpoint;
#if defined(__aarch64__)
-/* On ARM, we do more spinning */
-typedef srw_spin_lock log_rwlock_t;
-#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST
+ /* On ARM, we do more spinning */
+ typedef srw_spin_lock log_rwlock;
+ typedef pthread_mutex_wrapper<true> log_lsn_lock;
#else
-typedef srw_lock log_rwlock_t;
-#define LSN_LOCK_ATTR nullptr
+ typedef srw_lock log_rwlock;
+ typedef srw_mutex log_lsn_lock;
#endif
public:
- /** rw-lock protecting buf */
- alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch;
+ /** rw-lock protecting writes to buf; normal mtr_t::commit()
+ outside any log checkpoint is covered by a shared latch */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch;
+private:
+ /** mutex protecting buf_free et al, together with latch */
+ log_lsn_lock lsn_lock;
+public:
+ /** first free offset within buf use; protected by lsn_lock */
+ Atomic_relaxed<size_t> buf_free;
+ /** number of write requests (to buf); protected by lsn_lock */
+ size_t write_to_buf;
+ /** number of append_prepare_wait(); protected by lsn_lock */
+ size_t waits;
private:
/** Last written LSN */
lsn_t write_lsn;
@@ -227,20 +229,12 @@ private:
/** Buffer for writing to resize_log; @see flush_buf */
byte *resize_flush_buf;
- /** spin lock protecting lsn, buf_free in append_prepare() */
- alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock;
- void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); }
- void lock_lsn() { pthread_mutex_lock(&lsn_lock); }
- void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); }
- void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); }
+ void init_lsn_lock() {lsn_lock.init(); }
+ void lock_lsn() { lsn_lock.wr_lock(); }
+ void unlock_lsn() {lsn_lock.wr_unlock(); }
+ void destroy_lsn_lock() { lsn_lock.destroy(); }
public:
- /** first free offset within buf use; protected by lsn_lock */
- Atomic_relaxed<size_t> buf_free;
- /** number of write requests (to buf); protected by exclusive lsn_lock */
- ulint write_to_buf;
- /** number of waits in append_prepare(); protected by lsn_lock */
- ulint waits;
/** recommended maximum size of buf, after which the buffer is flushed */
size_t max_buf_free;
@@ -308,6 +302,9 @@ public:
bool is_opened() const noexcept { return log.is_opened(); }
+ /** @return target write LSN to react on buf_free >= max_buf_free */
+ inline lsn_t get_write_target() const;
+
/** @return LSN at which log resizing was started and is still in progress
@retval 0 if no log resizing is in progress */
lsn_t resize_in_progress() const noexcept
@@ -419,13 +416,14 @@ public:
inline void persist(lsn_t lsn) noexcept;
#endif
- bool check_flush_or_checkpoint() const
+ bool check_for_checkpoint() const
+ {
+ return UNIV_UNLIKELY(need_checkpoint.load(std::memory_order_relaxed));
+ }
+ void set_check_for_checkpoint(bool need= true)
{
- return UNIV_UNLIKELY
- (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+ need_checkpoint.store(need, std::memory_order_relaxed);
}
- void set_check_flush_or_checkpoint(bool flag= true)
- { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
/** Make previous write_buf() durable and update flushed_to_disk_lsn. */
bool flush(lsn_t lsn) noexcept;
@@ -446,8 +444,9 @@ public:
private:
/** Wait in append_prepare() for buffer to become available
+ @param lsn log sequence number to write up to
@param ex whether log_sys.latch is exclusively locked */
- ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
+ ATTRIBUTE_COLD void append_prepare_wait(lsn_t lsn, bool ex) noexcept;
public:
/** Reserve space in the log buffer for appending data.
@tparam pmem log_sys.is_pmem()
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index 6d75e15a..a73b7279 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -44,6 +44,11 @@ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
@return whether the page was recovered correctly */
bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
+/** Read the latest checkpoint information from log file
+and store it in log_sys.next_checkpoint and recv_sys.file_checkpoint
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_read_checkpoint();
+
/** Start recovering from a redo log checkpoint.
of first system tablespace page
@return error code or DB_SUCCESS */
@@ -114,7 +119,19 @@ struct recv_dblwr_t
@param name tablespace filepath
@param file tablespace file handle
@return whether the operation failed */
- bool restore_first_page(uint32_t space_id, const char *name, os_file_t file);
+ bool restore_first_page(uint32_t space_id, const char *name,
+ pfs_os_file_t file);
+
+ /** Restore the first page of the given tablespace from
+ doublewrite buffer.
+ 1) Find the page which has page_no as 0
+ 2) Read first 3 pages from tablespace file
+ 3) Compare the space_ids from the pages with page0 which
+ was retrieved from doublewrite buffer
+ @param name tablespace filepath
+ @param file tablespace file handle
+ @return space_id or 0 in case of error */
+ uint32_t find_first_page(const char *name, pfs_os_file_t file);
typedef std::deque<byte*, ut_allocator<byte*> > list;
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index 841cfab1..c916edc9 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -89,8 +89,9 @@ struct mtr_t {
{ auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
/** Commit a mini-transaction that is shrinking a tablespace.
- @param space tablespace that is being shrunk */
- ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+ @param space tablespace that is being shrunk
+ @param size new size in pages */
+ ATTRIBUTE_COLD void commit_shrink(fil_space_t &space, uint32_t size);
/** Commit a mini-transaction that is deleting or renaming a file.
@param space tablespace that is being renamed or deleted
@@ -105,7 +106,7 @@ struct mtr_t {
This is to be used at log_checkpoint().
@param checkpoint_lsn the log sequence number of a checkpoint, or 0
@return current LSN */
- lsn_t commit_files(lsn_t checkpoint_lsn= 0);
+ ATTRIBUTE_COLD lsn_t commit_files(lsn_t checkpoint_lsn= 0);
/** @return mini-transaction savepoint (current size of m_memo) */
ulint get_savepoint() const
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index c9db6a1f..c8374515 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -142,9 +142,11 @@ static const ulint OS_FILE_NORMAL = 62;
/* @} */
/** Types for file create @{ */
-static const ulint OS_DATA_FILE = 100;
-static const ulint OS_LOG_FILE = 101;
-static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+static constexpr ulint OS_DATA_FILE = 100;
+static constexpr ulint OS_LOG_FILE = 101;
+#if defined _WIN32 || defined HAVE_FCNTL_DIRECT
+static constexpr ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+#endif
/* @} */
/** Error codes from os_file_get_last_error @{ */
@@ -373,7 +375,7 @@ os_file_create_simple_no_error_handling_func(
bool* success)
MY_ATTRIBUTE((warn_unused_result));
-#ifdef _WIN32
+#ifndef HAVE_FCNTL_DIRECT
#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
#else
/** Tries to disable OS caching on an opened file descriptor.
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index db846795..457d9ab5 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -210,14 +210,11 @@ extern unsigned long long srv_max_undo_log_size;
extern uint srv_n_fil_crypt_threads;
extern uint srv_n_fil_crypt_threads_started;
-/** Rate at which UNDO records should be purged. */
-extern ulong srv_purge_rseg_truncate_frequency;
-
/** Enable or Disable Truncate of UNDO tablespace. */
extern my_bool srv_undo_log_truncate;
/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
-constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+constexpr uint32_t SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
UNIV_PAGE_SIZE_DEF;
extern char* srv_log_group_home_dir;
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
index 1dca0cc1..01067322 100644
--- a/storage/innobase/include/srw_lock.h
+++ b/storage/innobase/include/srw_lock.h
@@ -34,7 +34,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
#endif
-#ifdef SUX_LOCK_GENERIC
/** An exclusive-only variant of srw_lock */
template<bool spinloop>
class pthread_mutex_wrapper final
@@ -70,7 +69,6 @@ template<>
inline void pthread_mutex_wrapper<true>::wr_lock()
{ if (!wr_lock_try()) wr_wait(); }
# endif
-#endif
/** Futex-based mutex */
template<bool spinloop>
@@ -541,7 +539,7 @@ public:
/** @return whether any lock may be held by any thread */
bool is_locked_or_waiting() const noexcept
{ return lock.is_locked_or_waiting(); }
- /** @return whether an exclusive lock may be held by any thread */
+ /** @return whether a shared or exclusive lock may be held by any thread */
bool is_locked() const noexcept { return lock.is_locked(); }
/** @return whether an exclusive lock may be held by any thread */
bool is_write_locked() const noexcept { return lock.is_write_locked(); }
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 3ddd2e98..0f4f8afa 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -140,6 +140,15 @@ private:
bool m_initialized{false};
/** whether purge is enabled; protected by latch and std::atomic */
std::atomic<bool> m_enabled{false};
+ /** The primary candidate for iterator::free_history() is
+ rseg=trx_sys.rseg_array[skipped_rseg]. This field may be changed
+ after invoking rseg.set_skip_allocation() and rseg.clear_skip_allocation()
+ and while holding the exclusive rseg.latch.
+
+ This may only be 0 if innodb_undo_tablespaces=0, because rollback segment
+ 0 always resides in the system tablespace and would never be used when
+ dedicated undo tablespaces are in use. */
+ Atomic_relaxed<uint8_t> skipped_rseg;
public:
/** whether purge is active (may hold table handles) */
std::atomic<bool> m_active{false};
@@ -197,6 +206,11 @@ public:
return undo_no <= other.undo_no;
}
+ /** Remove unnecessary history data from a rollback segment.
+ @param rseg rollback segment
+ @return error code */
+ inline dberr_t free_history_rseg(trx_rseg_t &rseg) const;
+
/** Free the undo pages up to this. */
dberr_t free_history() const;
@@ -240,14 +254,15 @@ public:
by the pq_mutex */
mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */
- /** Undo tablespace file truncation (only accessed by the
- srv_purge_coordinator_thread) */
- struct {
- /** The undo tablespace that is currently being truncated */
- fil_space_t* current;
- /** The undo tablespace that was last truncated */
- fil_space_t* last;
- } truncate;
+ /** innodb_undo_log_truncate=ON state;
+ only modified by purge_coordinator_callback() */
+ struct {
+ /** The undo tablespace that is currently being truncated */
+ Atomic_relaxed<fil_space_t*> current;
+ /** The number of the undo tablespace that was last truncated,
+ relative to srv_undo_space_id_start */
+ uint32_t last;
+ } truncate_undo_space;
/** Create the instance */
void create();
@@ -357,6 +372,26 @@ public:
typically via purge_sys_t::view_guard. */
return view.sees(id);
}
+
+private:
+ /** Enable the use of a rollback segment and advance skipped_rseg,
+ after iterator::free_history_rseg() had invoked
+ rseg.set_skip_allocation(). */
+ inline void rseg_enable(trx_rseg_t &rseg);
+
+ /** Try to start truncating a tablespace.
+ @param id undo tablespace identifier
+ @param size the maximum desired undo tablespace size, in pages
+ @return undo tablespace whose truncation was started
+ @retval nullptr if truncation is not currently possible */
+ inline fil_space_t *undo_truncate_try(uint32_t id, uint32_t size);
+public:
+ /** Check if innodb_undo_log_truncate=ON needs to be handled.
+ This is only to be called by purge_coordinator_callback().
+ @return undo tablespace chosen by innodb_undo_log_truncate=ON
+ @retval nullptr if truncation is not currently possible */
+ fil_space_t *truncating_tablespace();
+
/** A wrapper around trx_sys_t::clone_oldest_view(). */
template<bool also_end_view= false>
void clone_oldest_view()
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 43e0c290..7fa43047 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -73,14 +73,15 @@ private:
/** Reference counter to track is_persistent() transactions,
with SKIP flag. */
std::atomic<uint32_t> ref;
-
+public:
/** Whether undo tablespace truncation is pending */
static constexpr uint32_t SKIP= 1;
/** Transaction reference count multiplier */
static constexpr uint32_t REF= 2;
+ /** @return the reference count and flags */
uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
-
+private:
/** Set the SKIP bit */
void ref_set_skip()
{
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 5dd0169f..3fa41fdf 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -902,8 +902,8 @@ public:
uint64_t recovered_binlog_offset;
/** Latest recovered binlog file name */
char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
- /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
- lsn_t recovered_binlog_lsn;
+ /** Set when latest position is from pre-version 10.3.5 TRX_SYS. */
+ bool recovered_binlog_is_legacy_pos;
/**
@@ -1191,6 +1191,11 @@ public:
return count;
}
+ /** Disable further allocation of transactions in rollback segments
+ that are subject to innodb_undo_log_truncate=ON
+ @param space undo tablespace that will be truncated */
+ inline void undo_truncate_start(fil_space_t &space);
+
/** Set the undo log empty value */
void set_undo_non_empty(bool val)
{
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 3cfbe331..0a3e0d62 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1108,6 +1108,7 @@ public:
{
ut_ad(state == TRX_STATE_NOT_STARTED);
ut_ad(!id);
+ ut_ad(!*detailed_error);
ut_ad(!mutex_is_owner());
ut_ad(!has_logged());
ut_ad(!is_referenced());
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 91999c81..9f39b303 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -100,6 +100,7 @@ bool log_t::create()
/* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */
lsn.store(FIRST_LSN, std::memory_order_relaxed);
flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed);
+ need_checkpoint.store(true, std::memory_order_relaxed);
write_lsn= FIRST_LSN;
#ifndef HAVE_PMEM
@@ -124,18 +125,17 @@ bool log_t::create()
TRASH_ALLOC(flush_buf, buf_size);
checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096));
memset_aligned<4096>(checkpoint_buf, 0, 4096);
+ max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
#else
ut_ad(!checkpoint_buf);
ut_ad(!buf);
ut_ad(!flush_buf);
+ max_buf_free= 1;
#endif
latch.SRW_LOCK_INIT(log_latch_key);
init_lsn_lock();
- max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
- set_check_flush_or_checkpoint();
-
last_checkpoint_lsn= FIRST_LSN;
log_capacity= 0;
max_modified_age_async= 0;
@@ -236,6 +236,7 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
log.close();
mprotect(ptr, size_t(size), PROT_READ);
buf= static_cast<byte*>(ptr);
+ max_buf_free= size;
# if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
# endif
@@ -264,6 +265,7 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
TRASH_ALLOC(buf, buf_size);
TRASH_ALLOC(flush_buf, buf_size);
+ max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
#endif
#if defined __linux__ || defined _WIN32
@@ -813,8 +815,8 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
- ut_ad(!srv_read_only_mode);
ut_ad(!is_pmem());
+ ut_ad(!srv_read_only_mode);
const lsn_t lsn{get_lsn(std::memory_order_relaxed)};
@@ -849,7 +851,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
... /* TODO: Update the LSN and adjust other code. */
#else
/* The rest of the block will be written as garbage.
- (We want to avoid memset() while holding mutex.)
+ (We want to avoid memset() while holding exclusive log_sys.latch)
This block will be overwritten later, once records beyond
the current LSN are generated. */
# ifdef HAVE_valgrind
@@ -886,6 +888,7 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
write_lsn= lsn;
}
+ set_check_for_checkpoint(false);
return lsn;
}
@@ -927,8 +930,9 @@ wait and check if an already running write is covering the request.
void log_write_up_to(lsn_t lsn, bool durable,
const completion_callback *callback)
{
- ut_ad(!srv_read_only_mode);
+ ut_ad(!srv_read_only_mode || (log_sys.buf_free < log_sys.max_buf_free));
ut_ad(lsn != LSN_MAX);
+ ut_ad(lsn != 0);
if (UNIV_UNLIKELY(recv_no_ibuf_operations))
{
@@ -985,7 +989,6 @@ repeat:
@param durable whether to wait for a durable write to complete */
void log_buffer_flush_to_disk(bool durable)
{
- ut_ad(!srv_read_only_mode);
log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable);
}
@@ -1017,16 +1020,6 @@ ATTRIBUTE_COLD void log_write_and_flush()
#endif
}
-/********************************************************************
-
-Tries to establish a big enough margin of free space in the log buffer, such
-that a new log entry can be catenated without an immediate need for a flush. */
-ATTRIBUTE_COLD static void log_flush_margin()
-{
- if (log_sys.buf_free > log_sys.max_buf_free)
- log_buffer_flush_to_disk(false);
-}
-
/****************************************************************//**
Tries to establish a big enough margin of free space in the log, such
that a new log entry can be catenated without an immediate need for a
@@ -1034,12 +1027,12 @@ checkpoint. NOTE: this function may only be called if the calling thread
owns no synchronization objects! */
ATTRIBUTE_COLD static void log_checkpoint_margin()
{
- while (log_sys.check_flush_or_checkpoint())
+ while (log_sys.check_for_checkpoint())
{
log_sys.latch.rd_lock(SRW_LOCK_CALL);
ut_ad(!recv_no_log_write);
- if (!log_sys.check_flush_or_checkpoint())
+ if (!log_sys.check_for_checkpoint())
{
func_exit:
log_sys.latch.rd_unlock();
@@ -1055,7 +1048,7 @@ func_exit:
#ifndef DBUG_OFF
skip_checkpoint:
#endif
- log_sys.set_check_flush_or_checkpoint(false);
+ log_sys.set_check_for_checkpoint(false);
goto func_exit;
}
@@ -1069,30 +1062,17 @@ func_exit:
}
}
-/**
-Checks that there is enough free space in the log to start a new query step.
-Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
-function may only be called if the calling thread owns no synchronization
-objects! */
-ATTRIBUTE_COLD void log_check_margins()
-{
- do
- {
- log_flush_margin();
- log_checkpoint_margin();
- ut_ad(!recv_no_log_write);
- }
- while (log_sys.check_flush_or_checkpoint());
-}
-
/** Wait for a log checkpoint if needed.
NOTE that this function may only be called while not holding
any synchronization objects except dict_sys.latch. */
void log_free_check()
{
ut_ad(!lock_sys.is_writer());
- if (log_sys.check_flush_or_checkpoint())
- log_check_margins();
+ if (log_sys.check_for_checkpoint())
+ {
+ ut_ad(!recv_no_log_write);
+ log_checkpoint_margin();
+ }
}
extern void buf_resize_shutdown();
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 3c3fe41e..e72f842f 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -833,7 +833,22 @@ processed:
filename= tbl_name + 1;
}
}
- space->add(filename, OS_FILE_CLOSED, size, false, false);
+ pfs_os_file_t handle= OS_FILE_CLOSED;
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ {
+ /* During mariadb-backup --backup, a table could be renamed,
+ created and dropped, and we may be missing the file at this
+ point of --prepare. Try to create the file if it does not exist
+ already. If the file exists, we'll pass handle=OS_FILE_CLOSED
+ and the file will be opened normally in fil_space_t::acquire()
+ inside recv_sys_t::recover_deferred(). */
+ bool success;
+ handle= os_file_create(innodb_data_file_key, filename,
+ OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT |
+ OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_AIO, OS_DATA_FILE, false, &success);
+ }
+ space->add(filename, handle, size, false, false);
space->recv_size= it->second.size;
space->size_in_header= size;
return space;
@@ -1238,7 +1253,8 @@ static void fil_name_process(const char *name, ulint len, uint32_t space_id,
file_name_t& f = p.first->second;
- if (auto d = deferred_spaces.find(space_id)) {
+ auto d = deferred_spaces.find(space_id);
+ if (d) {
if (deleted) {
d->deleted = true;
goto got_deleted;
@@ -1311,7 +1327,16 @@ same_space:
FILE_* record. */
ut_ad(space == NULL);
- if (srv_force_recovery) {
+ if (srv_operation == SRV_OPERATION_RESTORE && d
+ && ftype == FILE_RENAME) {
+rename:
+ d->file_name = fname.name;
+ f.name = fname.name;
+ break;
+ }
+
+ if (srv_force_recovery
+ || srv_operation == SRV_OPERATION_RESTORE) {
/* Without innodb_force_recovery,
missing tablespaces will only be
reported in
@@ -1330,7 +1355,11 @@ same_space:
break;
case FIL_LOAD_DEFER:
- /** Skip the deferred spaces
+ if (d && ftype == FILE_RENAME
+ && srv_operation == SRV_OPERATION_RESTORE) {
+ goto rename;
+ }
+ /* Skip the deferred spaces
when lsn is already processed */
if (!if_exists) {
deferred_spaces.add(
@@ -1735,20 +1764,6 @@ dberr_t recv_sys_t::find_checkpoint()
{
if (wrong_size)
return DB_CORRUPTION;
- if (log_sys.next_checkpoint_lsn < 8204)
- {
- /* Before MDEV-14425, InnoDB had a minimum LSN of 8192+12=8204.
- Likewise, mariadb-backup --prepare would create an empty
- ib_logfile0 after applying the log. We will allow an upgrade
- from such an empty log.
-
- If a user replaces the redo log with an empty file and the
- FIL_PAGE_FILE_FLUSH_LSN field was zero in the system
- tablespace (see SysTablespace::read_lsn_and_check_flags()) we
- must refuse to start up. */
- sql_print_error("InnoDB: ib_logfile0 is empty, and LSN is unknown.");
- return DB_CORRUPTION;
- }
lsn= log_sys.next_checkpoint_lsn;
log_sys.format= log_t::FORMAT_3_23;
goto upgrade;
@@ -2409,7 +2424,7 @@ struct recv_ring : public recv_buf
{
const size_t s(*this - start);
ut_ad(s + len <= srv_page_size);
- if (!log_sys.is_encrypted())
+ if (!len || !log_sys.is_encrypted())
{
if (start.ptr + s == ptr && ptr + len <= end())
return ptr;
@@ -3205,7 +3220,7 @@ static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
skipped_after_init = false;
ut_ad(end_lsn == page_lsn);
if (end_lsn != page_lsn) {
- sql_print_warning(
+ sql_print_information(
"InnoDB: The last skipped log record"
" LSN " LSN_PF
" is not equal to page LSN " LSN_PF,
@@ -4012,7 +4027,6 @@ static bool recv_scan_log(bool last_phase)
const size_t block_size_1{log_sys.get_block_size() - 1};
mysql_mutex_lock(&recv_sys.mutex);
- ut_d(recv_sys.after_apply= last_phase);
if (!last_phase)
recv_sys.clear();
else
@@ -4221,6 +4235,7 @@ static bool recv_scan_log(bool last_phase)
recv_sys.lsn= rewound_lsn;
}
func_exit:
+ ut_d(recv_sys.after_apply= last_phase);
mysql_mutex_unlock(&recv_sys.mutex);
DBUG_RETURN(!store);
}
@@ -4507,12 +4522,36 @@ done:
return err;
}
+/** Locate the latest redo log checkpoint by invoking
+recv_sys.find_checkpoint() while holding exclusive log_sys.latch.
+Nothing is read when innodb_force_recovery requests that redo log
+apply be skipped.
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_read_checkpoint()
+{
+  ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED ||
+        srv_operation == SRV_OPERATION_RESTORE ||
+        srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+  /* Sanity check: no page may have been loaded into the buffer pool yet. */
+  ut_d(mysql_mutex_lock(&buf_pool.mutex));
+  ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+  ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+  ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+
+  dberr_t result= DB_SUCCESS;
+
+  if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO)
+  {
+    /* find_checkpoint() is invoked under the exclusive log latch */
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+    result= recv_sys.find_checkpoint();
+    log_sys.latch.wr_unlock();
+  }
+  else
+    sql_print_information("InnoDB: innodb_force_recovery=6"
+                          " skips redo log apply");
+
+  return result;
+}
+
/** Start recovering from a redo log checkpoint.
of first system tablespace page
@return error code or DB_SUCCESS */
dberr_t recv_recovery_from_checkpoint_start()
{
- bool rescan = false;
+ bool rescan = false;
+ dberr_t err = DB_SUCCESS;
ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED
|| srv_operation == SRV_OPERATION_RESTORE
@@ -4525,20 +4564,12 @@ dberr_t recv_recovery_from_checkpoint_start()
if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
sql_print_information("InnoDB: innodb_force_recovery=6"
" skips redo log apply");
- return(DB_SUCCESS);
+ return err;
}
recv_sys.recovery_on = true;
log_sys.latch.wr_lock(SRW_LOCK_CALL);
-
- dberr_t err = recv_sys.find_checkpoint();
- if (err != DB_SUCCESS) {
-early_exit:
- log_sys.latch.wr_unlock();
- return err;
- }
-
log_sys.set_capacity();
/* Start reading the log from the checkpoint lsn. The variable
@@ -4548,7 +4579,9 @@ early_exit:
ut_ad(recv_sys.pages.empty());
if (log_sys.format == log_t::FORMAT_3_23) {
- goto early_exit;
+early_exit:
+ log_sys.latch.wr_unlock();
+ return err;
}
if (log_sys.is_latest()) {
@@ -4843,7 +4876,7 @@ byte *recv_dblwr_t::find_page(const page_id_t page_id,
}
bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name,
- os_file_t file)
+ pfs_os_file_t file)
{
const page_id_t page_id(space_id, 0);
const byte* page= find_page(page_id);
@@ -4851,10 +4884,10 @@ bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name,
{
/* If the first page of the given user tablespace is not there
in the doublewrite buffer, then the recovery is going to fail
- now. Hence this is treated as error. */
- ib::error()
- << "Corrupted page " << page_id << " of datafile '"
- << name <<"' could not be found in the doublewrite buffer.";
+ now. Report error only when doublewrite buffer is not empty */
+ if (pages.size())
+ ib::error() << "Corrupted page " << page_id << " of datafile '"
+ << name << "' could not be found in the doublewrite buffer.";
return true;
}
@@ -4868,3 +4901,58 @@ bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name,
IORequestWrite, name, file, page, 0, physical_size) !=
DB_SUCCESS;
}
+
+/** Look for a doublewrite buffer copy of page 0 that belongs to the given
+data file and, if one is found, restore it with restore_first_page().
+
+Each page-0 copy of a tablespace other than 0 is a candidate. For a
+candidate, 3 pages are read from the file and each must carry the expected
+page number (1, 2, 3), the same tablespace id as the candidate, and pass
+buf_page_is_corrupted(). The first candidate that passes all checks
+decides the outcome; an all-zero page among the 3 ends the search.
+@param name  data file name, passed on to restore_first_page()
+@param file  handle of the data file
+@return tablespace id of the restored first page
+@retval 0 if no first page was restored */
+uint32_t recv_dblwr_t::find_first_page(const char *name, pfs_os_file_t file)
+{
+  os_offset_t file_size= os_file_get_size(file);
+  if (file_size != (os_offset_t) -1)
+  {
+    for (const page_t *page : pages)
+    {
+      uint32_t space_id= page_get_space_id(page);
+      byte *read_page= nullptr;
+      if (page_get_page_no(page) > 0 || space_id == 0)
+      {
+next_page:
+        /* read_page is still nullptr if nothing was allocated yet */
+        aligned_free(read_page);
+        continue;
+      }
+      uint32_t flags= mach_read_from_4(
+        FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+      size_t page_size= fil_space_t::physical_size(flags);
+      /* The file must be at least 4 pages long for this candidate */
+      if (file_size < 4 * page_size)
+        goto next_page;
+      read_page=
+        static_cast<byte*>(aligned_malloc(3 * page_size, page_size));
+      /* Read 3 pages from the file and match the space id
+      with the space id which is stored in
+      doublewrite buffer page. */
+      if (os_file_read(IORequestRead, file, read_page, page_size,
+                       3 * page_size, nullptr) != DB_SUCCESS)
+        goto next_page;
+      for (ulint j= 0; j <= 2; j++)
+      {
+        byte *cur_page= read_page + j * page_size;
+        if (buf_is_zeroes(span<const byte>(cur_page, page_size)))
+        {
+          space_id= 0;
+          goto early_exit;
+        }
+        if (mach_read_from_4(cur_page + FIL_PAGE_OFFSET) != j + 1 ||
+            memcmp(cur_page + FIL_PAGE_SPACE_ID,
+                   page + FIL_PAGE_SPACE_ID, 4) ||
+            buf_page_is_corrupted(false, cur_page, flags))
+          goto next_page;
+      }
+      /* restore_first_page() returns true on failure. Report 0 in that
+      case, but still fall through to the common exit so that the read
+      buffer is released instead of being leaked. */
+      if (restore_first_page(space_id, name, file))
+        space_id= 0;
+early_exit:
+      aligned_free(read_page);
+      return space_id;
+    }
+  }
+  return 0;
+}
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 1834a164..01641f74 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -308,6 +308,22 @@ void mtr_t::release()
m_memo.clear();
}
+/** Determine whether the log buffer has filled up to max_buf_free,
+and if so, up to which LSN the log should be written.
+@return LSN to pass to log_write_up_to()
+@retval 0 if buf_free is still below max_buf_free */
+inline lsn_t log_t::get_write_target() const
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_locked());
+#endif
+  if (UNIV_UNLIKELY(buf_free >= max_buf_free))
+  {
+    ut_ad(!is_pmem());
+    /* The exact LSN at the end of buf would be
+    write_lsn - (first_lsn & 4095) + buf_free.
+    A simpler expression is used on purpose: it yields a smaller
+    write target, which minimizes waiting in log_write_up_to(). */
+    ut_ad(max_buf_free >= 4096 * 4);
+    return write_lsn + max_buf_free / 2;
+  }
+  /* Enough room is left in the buffer; no write is requested. */
+  return 0;
+}
+
/** Commit a mini-transaction. */
void mtr_t::commit()
{
@@ -331,6 +347,7 @@ void mtr_t::commit()
std::pair<lsn_t,page_flush_ahead> lsns{do_write()};
process_freed_pages();
size_t modified= 0;
+ const lsn_t write_lsn= log_sys.get_write_target();
if (m_made_dirty)
{
@@ -408,7 +425,8 @@ void mtr_t::commit()
break;
default:
buf_page_t *bpage= static_cast<buf_page_t*>(slot.object);
- const auto s= bpage->unfix();
+ ut_d(const auto s=)
+ bpage->unfix();
if (slot.type & MTR_MEMO_MODIFY)
{
ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
@@ -420,13 +438,10 @@ void mtr_t::commit()
ut_ad(s < buf_page_t::READ_FIX);
ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <=
m_commit_lsn);
- if (s >= buf_page_t::UNFIXED)
- {
- mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn);
- if (UNIV_LIKELY_NULL(bpage->zip.data))
- memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data,
- FIL_PAGE_LSN + bpage->frame, 8);
- }
+ mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn);
+ if (UNIV_LIKELY_NULL(bpage->zip.data))
+ memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data,
+ FIL_PAGE_LSN + bpage->frame, 8);
modified++;
}
switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) {
@@ -451,6 +466,9 @@ void mtr_t::commit()
if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
+
+ if (UNIV_UNLIKELY(write_lsn != 0))
+ log_write_up_to(write_lsn, false);
}
else
{
@@ -492,9 +510,20 @@ void mtr_t::rollback_to_savepoint(ulint begin, ulint end)
m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end);
}
+/** Set create_lsn.
+@param lsn value to assign to create_lsn */
+inline void fil_space_t::set_create_lsn(lsn_t lsn)
+{
+#ifndef SUX_LOCK_GENERIC
+ /* Concurrent log_checkpoint_low() must be impossible. */
+ /* Debug-only check that latch is write-locked by the time we get here;
+ it is compiled out when SUX_LOCK_GENERIC is defined. */
+ ut_ad(latch.is_write_locked());
+#endif
+ create_lsn= lsn;
+}
+
/** Commit a mini-transaction that is shrinking a tablespace.
-@param space tablespace that is being shrunk */
-void mtr_t::commit_shrink(fil_space_t &space)
+@param space tablespace that is being shrunk
+@param size new size in pages */
+void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
@@ -514,6 +543,15 @@ void mtr_t::commit_shrink(fil_space_t &space)
const lsn_t start_lsn= do_write().first;
ut_d(m_log.erase());
+ fil_node_t *file= UT_LIST_GET_LAST(space.chain);
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_ad(file->is_open());
+ space.size= file->size= size;
+ space.set_create_lsn(m_commit_lsn);
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ space.clear_freed_ranges();
+
/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();
#ifndef SUX_LOCK_GENERIC
@@ -521,11 +559,11 @@ void mtr_t::commit_shrink(fil_space_t &space)
#endif
os_file_truncate(space.chain.start->name, space.chain.start->handle,
- os_offset_t{space.size} << srv_page_size_shift, true);
+ os_offset_t{size} << srv_page_size_shift, true);
space.clear_freed_ranges();
- const page_id_t high{space.id, space.size};
+ const page_id_t high{space.id, size};
size_t modified= 0;
auto it= m_memo.rbegin();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
@@ -586,13 +624,6 @@ void mtr_t::commit_shrink(fil_space_t &space)
log_sys.latch.wr_unlock();
m_latch_ex= false;
- mysql_mutex_lock(&fil_system.mutex);
- ut_ad(space.is_being_truncated);
- ut_ad(space.is_stopping_writes());
- space.clear_stopping();
- space.is_being_truncated= false;
- mysql_mutex_unlock(&fil_system.mutex);
-
release();
release_resources();
}
@@ -680,7 +711,7 @@ The caller must hold exclusive log_sys.latch.
This is to be used at log_checkpoint().
@param checkpoint_lsn the log sequence number of a checkpoint, or 0
@return current LSN */
-lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
+ATTRIBUTE_COLD lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
{
#ifndef SUX_LOCK_GENERIC
ut_ad(log_sys.latch.is_write_locked());
@@ -840,26 +871,26 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn)
}
/** Wait in append_prepare() for buffer to become available
+@param lsn log sequence number to write up to
@param ex whether log_sys.latch is exclusively locked */
-ATTRIBUTE_COLD void log_t::append_prepare_wait(bool ex) noexcept
+ATTRIBUTE_COLD void log_t::append_prepare_wait(lsn_t lsn, bool ex) noexcept
{
- log_sys.waits++;
- log_sys.unlock_lsn();
+ waits++;
+ unlock_lsn();
if (ex)
- log_sys.latch.wr_unlock();
+ latch.wr_unlock();
else
- log_sys.latch.rd_unlock();
+ latch.rd_unlock();
- DEBUG_SYNC_C("log_buf_size_exceeded");
- log_buffer_flush_to_disk(log_sys.is_pmem());
+ log_write_up_to(lsn, is_pmem());
if (ex)
- log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ latch.wr_lock(SRW_LOCK_CALL);
else
- log_sys.latch.rd_lock(SRW_LOCK_CALL);
+ latch.rd_lock(SRW_LOCK_CALL);
- log_sys.lock_lsn();
+ lock_lsn();
}
/** Reserve space in the log buffer for appending data.
@@ -878,34 +909,30 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
# endif
#endif
ut_ad(pmem == is_pmem());
- const lsn_t checkpoint_margin{last_checkpoint_lsn + log_capacity - size};
- const size_t avail{(pmem ? size_t(capacity()) : buf_size) - size};
lock_lsn();
write_to_buf++;
- for (ut_d(int count= 50);
- UNIV_UNLIKELY((pmem
- ? size_t(get_lsn() -
- get_flushed_lsn(std::memory_order_relaxed))
- : size_t{buf_free}) > avail); )
+ const lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size};
+ size_t b{buf_free};
+
+ if (UNIV_UNLIKELY(pmem
+ ? (end_lsn -
+ get_flushed_lsn(std::memory_order_relaxed)) > capacity()
+ : b + size >= buf_size))
{
- append_prepare_wait(ex);
- ut_ad(count--);
+ append_prepare_wait(l, ex);
+ b= buf_free;
}
- const lsn_t l{lsn.load(std::memory_order_relaxed)};
- lsn.store(l + size, std::memory_order_relaxed);
- const size_t b{buf_free};
- size_t new_buf_free{b};
- new_buf_free+= size;
+ lsn.store(end_lsn, std::memory_order_relaxed);
+ size_t new_buf_free= b + size;
if (pmem && new_buf_free >= file_size)
new_buf_free-= size_t(capacity());
buf_free= new_buf_free;
unlock_lsn();
- if (UNIV_UNLIKELY(l > checkpoint_margin) ||
- (!pmem && b >= max_buf_free))
- set_check_flush_or_checkpoint();
+ if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity))
+ set_check_for_checkpoint();
return {l, &buf[b]};
}
@@ -930,7 +957,7 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
return mtr_t::PAGE_FLUSH_ASYNC;
- log_sys.set_check_flush_or_checkpoint();
+ log_sys.set_check_for_checkpoint();
return mtr_t::PAGE_FLUSH_SYNC;
}
@@ -989,10 +1016,9 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
#ifndef DBUG_OFF
do
{
- if (m_log_mode != MTR_LOG_ALL)
+ if (m_log_mode != MTR_LOG_ALL ||
+ _db_keyword_(nullptr, "skip_page_checksum", 1))
continue;
- DBUG_EXECUTE_IF("skip_page_checksum", continue;);
-
for (const mtr_memo_slot_t& slot : m_memo)
if (slot.type & MTR_MEMO_MODIFY)
{
@@ -1150,9 +1176,6 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len,
}
}
-/** Write the mini-transaction log to the redo log buffer.
-@param len number of bytes to write
-@return {start_lsn,flush_ahead} */
std::pair<lsn_t,mtr_t::page_flush_ahead>
mtr_t::finish_write(size_t len)
{
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 5e674806..31bec346 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -975,7 +975,7 @@ os_file_create_simple_func(
*success = false;
int create_flag;
- const char* mode_str = NULL;
+ const char* mode_str __attribute__((unused));
ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
@@ -1051,6 +1051,7 @@ os_file_create_simple_func(
} while (retry);
+#ifdef HAVE_FCNTL_DIRECT
/* This function is always called for data files, we should disable
OS caching (O_DIRECT) here as we do in os_file_create_func(), so
we open the same file in the same mode, see man page of open(2). */
@@ -1065,6 +1066,7 @@ os_file_create_simple_func(
break;
}
}
+#endif
#ifndef _WIN32
if (!read_only
@@ -1150,7 +1152,7 @@ os_file_create_func(
);
int create_flag;
- const char* mode_str = NULL;
+ const char* mode_str __attribute__((unused));
on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
? true : false;
@@ -1192,10 +1194,13 @@ os_file_create_func(
return(OS_FILE_CLOSED);
}
+#ifdef HAVE_FCNTL_DIRECT
ut_a(type == OS_LOG_FILE
|| type == OS_DATA_FILE
|| type == OS_DATA_FILE_NO_O_DIRECT);
-
+#else
+ ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
+#endif
ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
/* We let O_DSYNC only affect log files */
@@ -1241,7 +1246,7 @@ os_file_create_func(
return file;
}
-#if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT
+#ifdef HAVE_FCNTL_DIRECT
if (type == OS_DATA_FILE) {
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
@@ -2175,10 +2180,8 @@ os_file_create_func(
if (srv_file_flush_method == SRV_O_DSYNC)
attributes|= FILE_FLAG_WRITE_THROUGH;
}
- else if (type == OS_DATA_FILE)
- {
- switch (srv_file_flush_method)
- {
+ else if (type == OS_DATA_FILE) {
+ switch (srv_file_flush_method) {
case SRV_FSYNC:
case SRV_LITTLESYNC:
case SRV_NOSYNC:
@@ -3042,30 +3045,15 @@ os_file_handle_error_cond_exit(
return(false);
}
-#ifndef _WIN32
+#ifdef HAVE_FCNTL_DIRECT
/** Tries to disable OS caching on an opened file descriptor.
@param[in] fd file descriptor to alter
@param[in] file_name file name, used in the diagnostic message
@param[in] name "open" or "create"; used in the diagnostic
message */
void
-os_file_set_nocache(
- int fd MY_ATTRIBUTE((unused)),
- const char* file_name MY_ATTRIBUTE((unused)),
- const char* operation_name MY_ATTRIBUTE((unused)))
+os_file_set_nocache(int fd, const char *file_name, const char *operation_name)
{
- /* some versions of Solaris may not have DIRECTIO_ON */
-#if defined(__sun__) && defined(DIRECTIO_ON)
- if (directio(fd, DIRECTIO_ON) == -1) {
- int errno_save = errno;
-
- ib::error()
- << "Failed to set DIRECTIO_ON on file "
- << file_name << "; " << operation_name << ": "
- << strerror(errno_save) << ","
- " continuing anyway.";
- }
-#elif defined(O_DIRECT)
if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
int errno_save = errno;
static bool warning_message_printed = false;
@@ -3084,10 +3072,8 @@ os_file_set_nocache(
<< ", continuing anyway.";
}
}
-#endif /* defined(__sun__) && defined(DIRECTIO_ON) */
}
-
-#endif /* _WIN32 */
+#endif /* HAVE_FCNTL_DIRECT */
/** Check if the file system supports sparse files.
@param fh file handle
@@ -3177,8 +3163,18 @@ fallback:
return true;
}
current_size &= ~4095ULL;
+# ifdef __linux__
+ if (!fallocate(file, 0, current_size,
+ size - current_size)) {
+ err = 0;
+ break;
+ }
+
+ err = errno;
+# else
err = posix_fallocate(file, current_size,
size - current_size);
+# endif
}
} while (err == EINTR
&& srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
@@ -3457,7 +3453,7 @@ static void write_io_callback(void *c)
if (UNIV_UNLIKELY(cb->m_err != 0))
ib::info () << "IO Error: " << cb->m_err
- << "during write of "
+ << " during write of "
<< cb->m_len << " bytes, for file "
<< request.node->name << "(" << cb->m_fh << "), returned "
<< cb->m_ret_len;
@@ -4194,7 +4190,6 @@ bool fil_node_t::read_page0()
!= DB_SUCCESS)
{
sql_print_error("InnoDB: Unable to read first page of file %s", name);
-corrupted:
aligned_free(page);
return false;
}
@@ -4211,25 +4206,35 @@ corrupted:
if (!fil_space_t::is_valid_flags(flags, space->id))
{
uint32_t cflags= fsp_flags_convert_from_101(flags);
- if (cflags == UINT32_MAX)
+ if (cflags != UINT32_MAX)
{
-invalid:
- ib::error() << "Expected tablespace flags "
- << ib::hex(space->flags)
- << " but found " << ib::hex(flags)
- << " in the file " << name;
- goto corrupted;
+ uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK;
+ uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK;
+
+ if (fil_space_t::is_flags_equal(cf, sf) ||
+ fil_space_t::is_flags_equal(sf, cf))
+ {
+ flags= cflags;
+ goto flags_ok;
+ }
}
- uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK;
- uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK;
+ aligned_free(page);
+ goto invalid;
+ }
- if (!fil_space_t::is_flags_equal(cf, sf) &&
- !fil_space_t::is_flags_equal(sf, cf))
- goto invalid;
- flags= cflags;
+ if (!fil_space_t::is_flags_equal((flags & ~FSP_FLAGS_MEM_MASK),
+ (space->flags & ~FSP_FLAGS_MEM_MASK)) &&
+ !fil_space_t::is_flags_equal((space->flags & ~FSP_FLAGS_MEM_MASK),
+ (flags & ~FSP_FLAGS_MEM_MASK)))
+ {
+invalid:
+ sql_print_error("InnoDB: Expected tablespace flags 0x%zx but found 0x%zx"
+ " in the file %s", space->flags, flags, name);
+ return false;
}
+ flags_ok:
ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
/* Try to read crypt_data from page 0 if it is not yet read. */
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
index 61614007..51bcc954 100644
--- a/storage/innobase/pars/pars0pars.cc
+++ b/storage/innobase/pars/pars0pars.cc
@@ -1778,9 +1778,6 @@ pars_create_table(
ulint flags = 0;
ulint flags2 = DICT_TF2_FTS_AUX_HEX_NAME;
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
-
n_cols = que_node_list_get_len(column_defs);
table = dict_table_t::create(
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 17a2f034..9d85e2b1 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -1630,9 +1630,6 @@ row_fts_merge_insert(
/* We should set the flags2 with aux_table_name here,
in order to get the correct aux table names. */
index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
- DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
- index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME
- & ((1U << DICT_TF2_BITS) - 1););
fts_table.type = FTS_INDEX_TABLE;
fts_table.index_id = index->id;
fts_table.table_id = table->id;
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index d2609fdb..2516e24e 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -117,7 +117,6 @@ struct row_import {
row_import() UNIV_NOTHROW
:
m_table(NULL),
- m_version(0),
m_hostname(NULL),
m_table_name(NULL),
m_autoinc(0),
@@ -196,8 +195,6 @@ struct row_import {
dict_table_t* m_table; /*!< Table instance */
- ulint m_version; /*!< Version of config file */
-
byte* m_hostname; /*!< Hostname where the
tablespace was exported */
byte* m_table_name; /*!< Exporting instance table
@@ -2992,17 +2989,13 @@ row_import_read_meta_data(
return(DB_IO_ERROR);
}
- cfg.m_version = mach_read_from_4(row);
-
/* Check the version number. */
- switch (cfg.m_version) {
+ switch (mach_read_from_4(row)) {
case IB_EXPORT_CFG_VERSION_V1:
-
return(row_import_read_v1(file, thd, &cfg));
default:
- ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
- "Unsupported meta-data version number (" ULINTPF "), "
- "file ignored", cfg.m_version);
+ ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_NOT_SUPPORTED_YET,
+ "meta-data version");
}
return(DB_ERROR);
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index bdee0ed1..9c3c5d22 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -2638,14 +2638,17 @@ row_ins_clust_index_entry_low(
ut_ad(!dict_index_is_online_ddl(index));
ut_ad(!index->table->persistent_autoinc);
ut_ad(!index->is_instant());
+ ut_ad(!entry->info_bits);
mtr.set_log_mode(MTR_LOG_NO_REDO);
} else {
index->set_modified(mtr);
- if (UNIV_UNLIKELY(entry->is_metadata())) {
+ if (UNIV_UNLIKELY(entry->info_bits != 0)) {
+ ut_ad(entry->is_metadata());
ut_ad(index->is_instant());
ut_ad(!dict_index_is_online_ddl(index));
ut_ad(mode == BTR_MODIFY_TREE);
+ ut_ad(flags == BTR_NO_LOCKING_FLAG);
} else {
if (mode == BTR_MODIFY_LEAF
&& dict_index_is_online_ddl(index)) {
@@ -2787,11 +2790,6 @@ avoid_bulk:
skip_bulk_insert:
if (UNIV_UNLIKELY(entry->info_bits != 0)) {
- ut_ad(entry->is_metadata());
- ut_ad(flags == BTR_NO_LOCKING_FLAG);
- ut_ad(index->is_instant());
- ut_ad(!dict_index_is_online_ddl(index));
-
const rec_t* rec = btr_pcur_get_rec(&pcur);
if (rec_get_info_bits(rec, page_rec_is_comp(rec))
@@ -2895,9 +2893,20 @@ do_insert:
}
}
+ if (err == DB_SUCCESS && entry->info_bits) {
+ if (buf_block_t* root
+ = btr_root_block_get(index, RW_X_LATCH, &mtr,
+ &err)) {
+ btr_set_instant(root, *index, &mtr);
+ } else {
+ ut_ad("cannot find root page" == 0);
+ }
+ }
+
mtr.commit();
if (big_rec) {
+ ut_ad(err == DB_SUCCESS);
/* Online table rebuild could read (and
ignore) the incomplete record at this point.
If online rebuild is in progress, the
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 5df93fe6..188d8ba5 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -120,7 +120,7 @@ public:
ut_ad(mtr_started == scan_mtr->is_active());
DBUG_EXECUTE_IF("row_merge_instrument_log_check_flush",
- log_sys.set_check_flush_or_checkpoint(););
+ log_sys.set_check_for_checkpoint(););
for (idx_tuple_vec::iterator it = m_dtuple_vec.begin();
it != m_dtuple_vec.end();
@@ -128,7 +128,7 @@ public:
dtuple = *it;
ut_ad(dtuple);
- if (log_sys.check_flush_or_checkpoint()) {
+ if (log_sys.check_for_checkpoint()) {
if (mtr_started) {
if (!btr_pcur_move_to_prev_on_page(pcur)) {
error = DB_CORRUPTION;
@@ -2235,6 +2235,8 @@ end_of_index:
goto err_exit;
}
+ buf_page_make_young_if_needed(&block->page);
+
page_cur_set_before_first(block, cur);
if (!page_cur_move_to_next(cur)
|| page_cur_is_after_last(cur)) {
@@ -3545,17 +3547,6 @@ row_merge_sort(
of file marker). Thus, it must be at least one block. */
ut_ad(file->offset > 0);
- /* These thd_progress* calls will crash on sol10-64 when innodb_plugin
- is used. MDEV-9356: innodb.innodb_bug53290 fails (crashes) on
- sol10-64 in buildbot.
- */
-#ifndef __sun__
- /* Progress report only for "normal" indexes. */
- if (dup && !(dup->index->type & DICT_FTS)) {
- thd_progress_init(trx->mysql_thd, 1);
- }
-#endif /* __sun__ */
-
if (global_system_variables.log_warnings > 2) {
sql_print_information("InnoDB: Online DDL : merge-sorting"
" has estimated " ULINTPF " runs",
@@ -3564,15 +3555,6 @@ row_merge_sort(
/* Merge the runs until we have one big run */
do {
- /* Report progress of merge sort to MySQL for
- show processlist progress field */
- /* Progress report only for "normal" indexes. */
-#ifndef __sun__
- if (dup && !(dup->index->type & DICT_FTS)) {
- thd_progress_report(trx->mysql_thd, file->offset - num_runs, file->offset);
- }
-#endif /* __sun__ */
-
error = row_merge(trx, dup, file, block, tmpfd,
&num_runs, run_offset, stage,
crypt_block, space);
@@ -3596,13 +3578,6 @@ row_merge_sort(
ut_free(run_offset);
- /* Progress report only for "normal" indexes. */
-#ifndef __sun__
- if (dup && !(dup->index->type & DICT_FTS)) {
- thd_progress_end(trx->mysql_thd);
- }
-#endif /* __sun__ */
-
DBUG_RETURN(error);
}
@@ -4436,13 +4411,14 @@ row_merge_file_create(
merge_file->fd = row_merge_file_create_low(path);
merge_file->offset = 0;
merge_file->n_rec = 0;
-
+#ifdef HAVE_FCNTL_DIRECT
if (merge_file->fd != OS_FILE_CLOSED) {
if (srv_disable_sort_file_cache) {
os_file_set_nocache(merge_file->fd,
"row0merge.cc", "sort");
}
}
+#endif
return(merge_file->fd);
}
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
index 4756cc37..d83ab861 100644
--- a/storage/innobase/row/row0purge.cc
+++ b/storage/innobase/row/row0purge.cc
@@ -822,7 +822,6 @@ skip_secondaries:
buf_page_get(page_id_t(rseg.space->id,
page_no),
0, RW_X_LATCH, &mtr)) {
- block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);
byte* data_field = block->page.frame
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 6c76dd91..33f4d81f 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -1222,6 +1222,7 @@ re_scan:
if (!cur_block) {
goto func_end;
}
+ buf_page_make_young_if_needed(&cur_block->page);
} else {
mtr->start();
goto func_end;
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
index 8a1041c8..f14673c1 100644
--- a/storage/innobase/row/row0undo.cc
+++ b/storage/innobase/row/row0undo.cc
@@ -318,6 +318,8 @@ static buf_block_t* row_undo_rec_get(undo_node_t* node)
return nullptr;
}
+ buf_page_make_young_if_needed(&undo_page->page);
+
uint16_t offset = undo->top_offset;
buf_block_t* prev_page = undo_page;
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
index bec53841..a39574d2 100644
--- a/storage/innobase/row/row0upd.cc
+++ b/storage/innobase/row/row0upd.cc
@@ -2158,6 +2158,25 @@ row_upd_clust_rec_by_insert_inherit_func(
return(inherit);
}
+/** After resuming from a lock wait, turn every 'disowned' BLOB reference
+of the entry back into an 'owned' one and flag it as 'inherited'.
+@param entry clustered index entry */
+static ATTRIBUTE_COLD void row_upd_reown_inherited_fields(dtuple_t *entry)
+{
+  for (ulint n= 0; n < entry->n_fields; n++)
+  {
+    const dfield_t *field= dtuple_get_nth_field(entry, n);
+    /* Only externally stored fields carry the flags */
+    if (!dfield_is_ext(field))
+      continue;
+
+    /* The flag byte sits at offset BTR_EXTERN_LEN within the trailing
+    BTR_EXTERN_FIELD_REF_SIZE bytes of the field data. */
+    byte *flag_byte= static_cast<byte*>(field->data) +
+      field->len - (BTR_EXTERN_FIELD_REF_SIZE - BTR_EXTERN_LEN);
+    /* The owner flag is expected to be set on entry (debug check) */
+    ut_ad(*flag_byte & BTR_EXTERN_OWNER_FLAG);
+    /* Clear the owner flag and set the inherited flag */
+    *flag_byte= byte((*flag_byte & ~BTR_EXTERN_OWNER_FLAG) |
+                     BTR_EXTERN_INHERITED_FLAG);
+  }
+}
+
/***********************************************************//**
Marks the clustered index record deleted and inserts the updated version
of the record to the index. This function should be used when the ordering
@@ -2236,12 +2255,16 @@ row_upd_clust_rec_by_insert(
/* If the clustered index record is already delete
marked, then we are here after a DB_LOCK_WAIT.
Skip delete marking clustered index and disowning
- its blobs. */
+ its blobs. Mark the BLOBs in the index entry
+ (which we copied from the already "disowned" rec)
+ as "owned", like it was on the previous call of
+ row_upd_clust_rec_by_insert(). */
ut_ad(row_get_rec_trx_id(rec, index, offsets)
== trx->id);
ut_ad(!trx_undo_roll_ptr_is_insert(
row_get_rec_roll_ptr(rec, index,
offsets)));
+ row_upd_reown_inherited_fields(entry);
goto check_fk;
}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index bf9755fb..7c0c4b92 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -106,9 +106,6 @@ segment). It is quite possible that some of the tablespaces doesn't host
any of the rollback-segment based on configuration used. */
uint32_t srv_undo_tablespaces_active;
-/** Rate at which UNDO records should be purged. */
-ulong srv_purge_rseg_truncate_frequency;
-
/** Enable or Disable Truncate of UNDO tablespace.
Note: If enabled then UNDO tablespace will be selected for truncate.
While Server waits for undo-tablespace to truncate if user disables
@@ -901,6 +898,9 @@ srv_export_innodb_status(void)
export_vars.innodb_data_written = srv_stats.data_written
+ (dblwr << srv_page_size_shift);
+ export_vars.innodb_buffer_pool_read_requests
+ = buf_pool.stat.n_page_gets;
+
export_vars.innodb_buffer_pool_bytes_data =
buf_pool.stat.LRU_bytes
+ (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
@@ -1503,7 +1503,8 @@ inline void purge_coordinator_state::do_purge()
ulint n_pages_handled= trx_purge(n_threads, history_size);
if (!trx_sys.history_exists())
goto no_history;
- if (purge_sys.truncate.current || srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ if (purge_sys.truncating_tablespace() ||
+ srv_shutdown_state != SRV_SHUTDOWN_NONE)
{
purge_truncation_task.wait();
trx_purge_truncate_history();
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index ef5bcb67..738e0a7e 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -468,7 +468,7 @@ ATTRIBUTE_COLD static dberr_t srv_undo_tablespaces_reinit()
rseg->init(nullptr, FIL_NULL);
}
- if (trx_sys.recovered_binlog_lsn
+ if (*trx_sys.recovered_binlog_filename
#ifdef WITH_WSREP
|| !trx_sys.recovered_wsrep_xid.is_null()
#endif /* WITH_WSREP */
@@ -476,7 +476,7 @@ ATTRIBUTE_COLD static dberr_t srv_undo_tablespaces_reinit()
{
/* Update binlog offset, binlog file name & wsrep xid in
system tablespace rollback segment */
- if (trx_sys.recovered_binlog_lsn)
+ if (*trx_sys.recovered_binlog_filename)
{
ut_d(const size_t len = strlen(trx_sys.recovered_binlog_filename) + 1);
ut_ad(len > 1);
@@ -1122,10 +1122,14 @@ dberr_t srv_start(bool create_new_db)
if (srv_force_recovery) {
ib::info() << "!!! innodb_force_recovery is set to "
<< srv_force_recovery << " !!!";
+ if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
+ srv_read_only_mode = true;
+ }
}
- if (srv_force_recovery == SRV_FORCE_NO_LOG_REDO) {
- srv_read_only_mode = true;
+ if (srv_read_only_mode) {
+ sql_print_information("InnoDB: Started in read only mode");
+ srv_use_doublewrite_buf = false;
}
high_level_read_only = srv_read_only_mode
@@ -1302,6 +1306,10 @@ dberr_t srv_start(bool create_new_db)
ut_ad(buf_page_cleaner_is_active);
}
+ if (innodb_encrypt_temporary_tables && !log_crypt_init()) {
+ return srv_init_abort(DB_ERROR);
+ }
+
/* Check if undo tablespaces and redo log files exist before creating
a new system tablespace */
if (create_new_db) {
@@ -1310,6 +1318,11 @@ dberr_t srv_start(bool create_new_db)
return(srv_init_abort(DB_ERROR));
}
recv_sys.debug_free();
+ } else {
+ err = recv_recovery_read_checkpoint();
+ if (err != DB_SUCCESS) {
+ return srv_init_abort(err);
+ }
}
/* Open or create the data files. */
@@ -1334,12 +1347,9 @@ dberr_t srv_start(bool create_new_db)
" old data files which contain your precious data!";
/* fall through */
default:
- /* Other errors might come from Datafile::validate_first_page() */
- return(srv_init_abort(err));
- }
-
- if (innodb_encrypt_temporary_tables && !log_crypt_init()) {
- return srv_init_abort(DB_ERROR);
+ /* Other errors might be flagged by
+ Datafile::validate_first_page() */
+ return srv_init_abort(err);
}
if (create_new_db) {
@@ -1355,10 +1365,10 @@ dberr_t srv_start(bool create_new_db)
return srv_init_abort(err);
}
- srv_undo_space_id_start= 1;
+ srv_undo_space_id_start = 1;
}
- /* Open log file and data files in the systemtablespace: we keep
+ /* Open data files in the system tablespace: we keep
them open until database shutdown */
ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
@@ -1771,21 +1781,13 @@ dberr_t srv_start(bool create_new_db)
}
if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
- /* The following call is necessary for the insert
+ /* The following call is necessary for the change
buffer to work with multiple tablespaces. We must
know the mapping between space id's and .ibd file
names.
- In a crash recovery, we check that the info in data
- dictionary is consistent with what we already know
- about space id's from the calls to fil_ibd_load().
-
- In a normal startup, we create the space objects for
- every table in the InnoDB data dictionary that has
- an .ibd file.
-
We also determine the maximum tablespace id used. */
- dict_check_tablespaces_and_store_max_id();
+ dict_check_tablespaces_and_store_max_id(nullptr);
}
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
@@ -1933,7 +1935,7 @@ void innodb_preshutdown()
better prevent any further changes from being buffered. */
innodb_change_buffering= 0;
- if (trx_sys.is_initialised())
+ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO && srv_was_started)
while (trx_sys.any_active_transactions())
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
diff --git a/storage/innobase/sync/srw_lock.cc b/storage/innobase/sync/srw_lock.cc
index e41451d8..5afb79f2 100644
--- a/storage/innobase/sync/srw_lock.cc
+++ b/storage/innobase/sync/srw_lock.cc
@@ -143,8 +143,7 @@ static inline void srw_pause(unsigned delay)
HMT_medium();
}
-#ifdef SUX_LOCK_GENERIC
-# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+#ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
template<> void pthread_mutex_wrapper<true>::wr_wait()
{
const unsigned delay= srw_pause_delay();
@@ -158,8 +157,9 @@ template<> void pthread_mutex_wrapper<true>::wr_wait()
pthread_mutex_lock(&lock);
}
-# endif
+#endif
+#ifdef SUX_LOCK_GENERIC
template void ssux_lock_impl<false>::init();
template void ssux_lock_impl<true>::init();
template void ssux_lock_impl<false>::destroy();
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index 1f31ceda..cff16d9c 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -41,6 +41,7 @@ Created 3/26/1996 Heikki Tuuri
#include "dict0load.h"
#include <mysql/service_thd_mdl.h>
#include <mysql/service_wsrep.h>
+#include "log.h"
/** Maximum allowable purge history length. <=0 means 'infinite'. */
ulong srv_max_purge_lag = 0;
@@ -168,10 +169,15 @@ void purge_sys_t::create()
ut_ad(this == &purge_sys);
ut_ad(!m_initialized);
ut_ad(!enabled());
+ ut_ad(!m_active);
+ /* If innodb_undo_tablespaces>0, the rollback segment 0
+ (which always resides in the system tablespace) will
+ never be used; @see trx_assign_rseg_low() */
+ skipped_rseg= srv_undo_tablespaces > 0;
m_paused= 0;
query= purge_graph_build();
next_stored= false;
- rseg= NULL;
+ rseg= nullptr;
page_no= 0;
offset= 0;
hdr_page_no= 0;
@@ -179,8 +185,8 @@ void purge_sys_t::create()
latch.SRW_LOCK_INIT(trx_purge_latch_key);
end_latch.init();
mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr);
- truncate.current= NULL;
- truncate.last= NULL;
+ truncate_undo_space.current= nullptr;
+ truncate_undo_space.last= 0;
m_initialized= true;
}
@@ -350,14 +356,21 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
}
/** Free an undo log segment.
-@param block rollback segment header page
+@param rseg_hdr rollback segment header page
+@param block undo segment header page
@param mtr mini-transaction */
-static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr)
+static void trx_purge_free_segment(buf_block_t *rseg_hdr, buf_block_t *block,
+ mtr_t &mtr)
{
+ ut_ad(mtr.memo_contains_flagged(rseg_hdr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr.memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
block->page.frame, &mtr))
{
+ rseg_hdr->fix();
block->fix();
+ ut_d(const page_id_t rseg_hdr_id{rseg_hdr->page.id()});
ut_d(const page_id_t id{block->page.id()});
mtr.commit();
/* NOTE: If the server is killed after the log that was produced
@@ -368,26 +381,62 @@ static void trx_purge_free_segment(buf_block_t *block, mtr_t &mtr)
This does not matter when using multiple innodb_undo_tablespaces;
innodb_undo_log_truncate=ON will be able to reclaim the space. */
mtr.start();
+ rseg_hdr->page.lock.x_lock();
+ ut_ad(rseg_hdr->page.id() == rseg_hdr_id);
block->page.lock.x_lock();
ut_ad(block->page.id() == id);
- mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+ mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
block->page.frame, &mtr));
}
+void purge_sys_t::rseg_enable(trx_rseg_t &rseg)
+{
+ ut_ad(this == &purge_sys);
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(rseg.latch.is_write_locked());
+#endif
+ uint8_t skipped= skipped_rseg;
+ ut_ad(skipped < TRX_SYS_N_RSEGS);
+ if (&rseg == &trx_sys.rseg_array[skipped])
+ {
+ /* If this rollback segment is subject to innodb_undo_log_truncate=ON,
+ we must not clear the flag. But we will advance purge_sys.skipped_rseg
+ to be able to choose another candidate for this soft truncation, and
+ to prevent the following scenario:
+
+ (1) purge_sys_t::iterator::free_history_rseg() had invoked
+ rseg.set_skip_allocation()
+ (2) undo log truncation had completed on this rollback segment
+ (3) SET GLOBAL innodb_undo_log_truncate=OFF
+ (4) purge_sys_t::iterator::free_history_rseg() would not be able to
+ invoke rseg.set_skip_allocation() on any other rollback segment
+ before this rseg has grown enough */
+ if (truncate_undo_space.current != rseg.space)
+ rseg.clear_skip_allocation();
+ skipped++;
+ /* If innodb_undo_tablespaces>0, the rollback segment 0
+ (which always resides in the system tablespace) will
+ never be used; @see trx_assign_rseg_low() */
+ if (!(skipped&= (TRX_SYS_N_RSEGS - 1)) && srv_undo_tablespaces)
+ skipped++;
+ skipped_rseg= skipped;
+ }
+}
+
/** Remove unnecessary history data from a rollback segment.
@param rseg rollback segment
@param limit truncate anything before this
-@param all whether everything can be truncated
@return error code */
-static dberr_t
-trx_purge_truncate_rseg_history(trx_rseg_t &rseg,
- const purge_sys_t::iterator &limit, bool all)
+inline dberr_t purge_sys_t::iterator::free_history_rseg(trx_rseg_t &rseg) const
{
fil_addr_t hdr_addr;
mtr_t mtr;
+ bool freed= false;
+ uint32_t rseg_ref= 0;
mtr.start();
@@ -397,6 +446,8 @@ trx_purge_truncate_rseg_history(trx_rseg_t &rseg,
{
func_exit:
mtr.commit();
+ if (freed && (rseg.SKIP & rseg_ref))
+ purge_sys.rseg_enable(rseg);
return err;
}
@@ -418,16 +469,40 @@ loop:
const trx_id_t undo_trx_no=
mach_read_from_8(b->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO);
- if (undo_trx_no >= limit.trx_no)
+ if (undo_trx_no >= trx_no)
{
- if (undo_trx_no == limit.trx_no)
- err = trx_undo_truncate_start(&rseg, hdr_addr.page,
- hdr_addr.boffset, limit.undo_no);
+ if (undo_trx_no == trx_no)
+ err= trx_undo_truncate_start(&rseg, hdr_addr.page,
+ hdr_addr.boffset, undo_no);
goto func_exit;
}
-
- if (!all)
- goto func_exit;
+ else
+ {
+ rseg_ref= rseg.ref_load();
+ if (rseg_ref >= rseg.REF || !purge_sys.sees(rseg.needs_purge))
+ {
+ /* We cannot clear this entire rseg because trx_assign_rseg_low()
+ has already chosen it for a future trx_undo_assign(), or
+ because some recently started transaction needs purging.
+
+ If this invocation could not reduce rseg.history_size at all
+ (!freed), we will try to ensure progress and prevent our
+ starvation by disabling one rollback segment for future
+ trx_assign_rseg_low() invocations until a future invocation has
+ made progress and invoked purge_sys_t::rseg_enable(rseg) on that
+ rollback segment. */
+
+ if (!(rseg.SKIP & rseg_ref) && !freed &&
+ ut_d(!trx_rseg_n_slots_debug &&)
+ &rseg == &trx_sys.rseg_array[purge_sys.skipped_rseg])
+ /* If rseg.space == purge_sys.truncate_undo_space.current
+ the following will be a no-op. A possible conflict
+ with innodb_undo_log_truncate=ON will be handled in
+ purge_sys_t::rseg_enable(). */
+ rseg.set_skip_allocation();
+ goto func_exit;
+ }
+ }
fil_addr_t prev_hdr_addr=
flst_get_prev_addr(b->page.frame + hdr_addr.boffset +
@@ -459,7 +534,7 @@ loop:
free_segment:
ut_ad(rseg.curr_size >= seg_size);
rseg.curr_size-= seg_size;
- trx_purge_free_segment(b, mtr);
+ trx_purge_free_segment(rseg_hdr, b, mtr);
break;
case TRX_UNDO_CACHED:
/* rseg.undo_cached must point to this page */
@@ -490,10 +565,11 @@ loop:
mtr.commit();
ut_ad(rseg.history_size > 0);
rseg.history_size--;
+ freed= true;
mtr.start();
rseg_hdr->page.lock.x_lock();
ut_ad(rseg_hdr->page.id() == rseg.page_id());
- mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_MODIFY);
+ mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX);
goto loop;
}
@@ -544,9 +620,7 @@ dberr_t purge_sys_t::iterator::free_history() const
ut_ad(rseg.is_persistent());
log_free_check();
rseg.latch.wr_lock(SRW_LOCK_CALL);
- dberr_t err=
- trx_purge_truncate_rseg_history(rseg, *this, !rseg.is_referenced() &&
- purge_sys.sees(rseg.needs_purge));
+ dberr_t err= free_history_rseg(rseg);
rseg.latch.wr_unlock();
if (err)
return err;
@@ -554,6 +628,62 @@ dberr_t purge_sys_t::iterator::free_history() const
return DB_SUCCESS;
}
+inline void trx_sys_t::undo_truncate_start(fil_space_t &space)
+{
+ ut_ad(this == &trx_sys);
+ /* An undo tablespace always consists of a single file. */
+ ut_a(UT_LIST_GET_LEN(space.chain) == 1);
+ fil_node_t *file= UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+ sql_print_information("InnoDB: Starting to truncate %s", file->name);
+
+ for (auto &rseg : rseg_array)
+ if (rseg.space == &space)
+ {
+ /* Prevent a race with purge_sys_t::iterator::free_history_rseg() */
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+ /* Once set, this rseg will not be allocated to subsequent
+ transactions, but we will wait for existing active
+ transactions to finish. */
+ rseg.set_skip_allocation();
+ rseg.latch.rd_unlock();
+ }
+}
+
+inline fil_space_t *purge_sys_t::undo_truncate_try(uint32_t id, uint32_t size)
+{
+ ut_ad(srv_is_undo_tablespace(id));
+ fil_space_t *space= fil_space_get(id);
+ if (space && space->get_size() > size)
+ {
+ truncate_undo_space.current= space;
+ trx_sys.undo_truncate_start(*space);
+ return space;
+ }
+ return nullptr;
+}
+
+fil_space_t *purge_sys_t::truncating_tablespace()
+{
+ ut_ad(this == &purge_sys);
+
+ fil_space_t *space= truncate_undo_space.current;
+ if (space || srv_undo_tablespaces_active < 2 || !srv_undo_log_truncate)
+ return space;
+
+ const uint32_t size= uint32_t(srv_max_undo_log_size >> srv_page_size_shift);
+ for (uint32_t i= truncate_undo_space.last, j= i;; )
+ {
+ if (fil_space_t *s= undo_truncate_try(srv_undo_space_id_start + i, size))
+ return s;
+ ++i;
+ i%= srv_undo_tablespaces_active;
+ if (i == j)
+ return nullptr;
+ }
+}
+
#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__
# if defined __arm__ || defined __aarch64__
/* Work around an internal compiler error in GCC 4.8.5 */
@@ -579,55 +709,14 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
head.undo_no= 0;
}
- if (head.free_history() != DB_SUCCESS || srv_undo_tablespaces_active < 2)
+ if (head.free_history() != DB_SUCCESS)
return;
- while (srv_undo_log_truncate)
+ while (fil_space_t *space= purge_sys.truncating_tablespace())
{
- if (!purge_sys.truncate.current)
- {
- const ulint threshold=
- ulint(srv_max_undo_log_size >> srv_page_size_shift);
- for (uint32_t i= purge_sys.truncate.last
- ? purge_sys.truncate.last->id - srv_undo_space_id_start : 0,
- j= i;; )
- {
- const uint32_t space_id= srv_undo_space_id_start + i;
- ut_ad(srv_is_undo_tablespace(space_id));
- fil_space_t *space= fil_space_get(space_id);
- ut_a(UT_LIST_GET_LEN(space->chain) == 1);
-
- if (space && space->get_size() > threshold)
- {
- purge_sys.truncate.current= space;
- break;
- }
-
- ++i;
- i %= srv_undo_tablespaces_active;
- if (i == j)
- return;
- }
- }
-
- fil_space_t &space= *purge_sys.truncate.current;
- /* Undo tablespace always are a single file. */
- fil_node_t *file= UT_LIST_GET_FIRST(space.chain);
- /* The undo tablespace files are never closed. */
- ut_ad(file->is_open());
-
- DBUG_LOG("undo", "marking for truncate: " << file->name);
-
- for (auto &rseg : trx_sys.rseg_array)
- if (rseg.space == &space)
- /* Once set, this rseg will not be allocated to subsequent
- transactions, but we will wait for existing active
- transactions to finish. */
- rseg.set_skip_allocation();
-
for (auto &rseg : trx_sys.rseg_array)
{
- if (rseg.space != &space)
+ if (rseg.space != space)
continue;
rseg.latch.rd_lock(SRW_LOCK_CALL);
@@ -660,15 +749,9 @@ not_free:
rseg.latch.rd_unlock();
}
- ib::info() << "Truncating " << file->name;
- trx_purge_cleanse_purge_queue(space);
-
- log_free_check();
-
- mtr_t mtr;
- mtr.start();
- mtr.x_lock_space(&space);
- const auto space_id= space.id;
+ const char *file_name= UT_LIST_GET_FIRST(space->chain)->name;
+ sql_print_information("InnoDB: Truncating %s", file_name);
+ trx_purge_cleanse_purge_queue(*space);
/* Lock all modified pages of the tablespace.
@@ -678,104 +761,41 @@ not_free:
mini-transaction commit and the server was killed, then
discarding the to-be-trimmed pages without flushing would
break crash recovery. */
- rescan:
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
- {
- ut_ad(bpage->oldest_modification());
- ut_ad(bpage->in_file());
-
- buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
-
- if (bpage->oldest_modification() > 2 && bpage->id().space() == space_id)
- {
- ut_ad(bpage->frame);
- bpage->fix();
- {
- /* Try to acquire an exclusive latch while the cache line is
- fresh after fix(). */
- const bool got_lock{bpage->lock.x_lock_try()};
- buf_pool.flush_hp.set(prev);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (!got_lock)
- bpage->lock.x_lock();
- }
-
-#ifdef BTR_CUR_HASH_ADAPT
- /* There is no AHI on undo tablespaces. */
- ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
-#endif
- ut_ad(!bpage->is_io_fixed());
- ut_ad(bpage->id().space() == space_id);
-
- if (bpage->oldest_modification() > 2)
- {
- mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage),
- MTR_MEMO_PAGE_X_FIX);
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- ut_ad(bpage->oldest_modification() > 2);
- bpage->reset_oldest_modification();
- }
- else
- {
- bpage->unfix();
- bpage->lock.x_unlock();
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- }
-
- if (prev != buf_pool.flush_hp.get())
- {
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- goto rescan;
- }
- }
- bpage= prev;
- }
-
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
-
- /* Re-initialize tablespace, in a single mini-transaction. */
- const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+ if (UNIV_UNLIKELY(srv_shutdown_state != SRV_SHUTDOWN_NONE) &&
+ srv_fast_shutdown)
+ return;
/* Adjust the tablespace metadata. */
mysql_mutex_lock(&fil_system.mutex);
- space.set_stopping();
- space.is_being_truncated= true;
- if (space.crypt_data)
+ if (space->crypt_data)
{
- space.reacquire();
+ space->reacquire();
mysql_mutex_unlock(&fil_system.mutex);
- fil_space_crypt_close_tablespace(&space);
- space.release();
+ fil_space_crypt_close_tablespace(space);
+ space->release();
}
else
mysql_mutex_unlock(&fil_system.mutex);
- for (auto i= 6000; space.referenced();
- std::this_thread::sleep_for(std::chrono::milliseconds(10)))
- {
- if (!--i)
- {
- mtr.commit();
- ib::error() << "Failed to freeze UNDO tablespace " << file->name;
- return;
- }
- }
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ const uint32_t size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+
+ log_free_check();
+ mtr_t mtr;
+ mtr.start();
+ mtr.x_lock_space(space);
/* Associate the undo tablespace with mtr.
During mtr::commit_shrink(), InnoDB can use the undo
tablespace object to clear all freed ranges */
- mtr.set_named_space(&space);
- mtr.trim_pages(page_id_t(space.id, size));
- ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS);
- mysql_mutex_lock(&fil_system.mutex);
- space.size= file->size= size;
- mysql_mutex_unlock(&fil_system.mutex);
+ mtr.set_named_space(space);
+ mtr.trim_pages(page_id_t(space->id, size));
+ ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS);
for (auto &rseg : trx_sys.rseg_array)
{
- if (rseg.space != &space)
+ if (rseg.space != space)
continue;
ut_ad(!rseg.is_referenced());
@@ -784,7 +804,7 @@ not_free:
possibly before this server had been started up. */
dberr_t err;
- buf_block_t *rblock= trx_rseg_header_create(&space,
+ buf_block_t *rblock= trx_rseg_header_create(space,
&rseg - trx_sys.rseg_array,
trx_sys.get_max_trx_id(),
&mtr, &err);
@@ -797,7 +817,7 @@ not_free:
rseg.reinit(rblock->page.id().page_no());
}
- mtr.commit_shrink(space);
+ mtr.commit_shrink(*space, size);
/* No mutex; this is only updated by the purge coordinator. */
export_vars.innodb_undo_truncations++;
@@ -814,14 +834,15 @@ not_free:
purge_sys.next_stored= false;
}
- DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc";
+ DBUG_EXECUTE_IF("ib_undo_trunc",
+ sql_print_information("InnoDB: ib_undo_trunc");
log_buffer_flush_to_disk();
DBUG_SUICIDE(););
- ib::info() << "Truncated " << file->name;
- purge_sys.truncate.last= purge_sys.truncate.current;
- ut_ad(&space == purge_sys.truncate.current);
- purge_sys.truncate.current= nullptr;
+ sql_print_information("InnoDB: Truncated %s", file_name);
+ ut_ad(space == purge_sys.truncate_undo_space.current);
+ purge_sys.truncate_undo_space.current= nullptr;
+ purge_sys.truncate_undo_space.last= space->id - srv_undo_space_id_start;
}
}
@@ -853,7 +874,9 @@ void purge_sys_t::rseg_get_next_history_log()
{
fil_addr_t prev_log_addr;
+#ifndef SUX_LOCK_GENERIC
ut_ad(rseg->latch.is_write_locked());
+#endif
ut_a(rseg->last_page_no != FIL_NULL);
tail.trx_no= rseg->last_trx_no() + 1;
@@ -969,7 +992,9 @@ inline trx_purge_rec_t purge_sys_t::get_next_rec(roll_ptr_t roll_ptr)
{
ut_ad(next_stored);
ut_ad(tail.trx_no < low_limit_no());
+#ifndef SUX_LOCK_GENERIC
ut_ad(rseg->latch.is_write_locked());
+#endif
if (!offset)
{
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
index b381c9de..2923dc64 100644
--- a/storage/innobase/trx/trx0rec.cc
+++ b/storage/innobase/trx/trx0rec.cc
@@ -2069,9 +2069,10 @@ trx_undo_get_undo_rec_low(
mtr.start();
trx_undo_rec_t *undo_rec= nullptr;
- if (const buf_block_t* undo_page=
+ if (buf_block_t* undo_page=
buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr))
{
+ buf_page_make_young_if_needed(&undo_page->page);
undo_rec= undo_page->page.frame + offset;
const size_t end= mach_read_from_2(undo_rec);
if (UNIV_UNLIKELY(end <= offset ||
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 8d1a381c..87a2ac7b 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -296,8 +296,13 @@ buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const
if (err) *err= DB_TABLESPACE_NOT_FOUND;
return nullptr;
}
- return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr,
- BUF_GET, mtr, err);
+
+ buf_block_t *block= buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_LIKELY(block != nullptr))
+ buf_page_make_young_if_needed(&block->page);
+
+ return block;
}
/** Upgrade a rollback segment header page to MariaDB 10.3 format.
@@ -462,20 +467,32 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame;
if (*binlog_name)
{
- lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
- (FIL_PAGE_LSN + rseg_hdr->page.frame));
static_assert(TRX_RSEG_BINLOG_NAME_LEN ==
sizeof trx_sys.recovered_binlog_filename, "compatibility");
- if (lsn > trx_sys.recovered_binlog_lsn)
- {
- trx_sys.recovered_binlog_lsn= lsn;
- trx_sys.recovered_binlog_offset=
+
+ /* Always prefer a position from a rollback segment over
+ a legacy position from before version 10.3.5. */
+ int cmp= *trx_sys.recovered_binlog_filename &&
+ !trx_sys.recovered_binlog_is_legacy_pos
+ ? strncmp(reinterpret_cast<const char*>(binlog_name),
+ trx_sys.recovered_binlog_filename,
+ TRX_RSEG_BINLOG_NAME_LEN)
+ : 1;
+
+ if (cmp >= 0) {
+ uint64_t binlog_offset =
mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET +
rseg_hdr->page.frame);
- memcpy(trx_sys.recovered_binlog_filename, binlog_name,
- TRX_RSEG_BINLOG_NAME_LEN);
+ if (cmp)
+ {
+ memcpy(trx_sys.recovered_binlog_filename, binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ trx_sys.recovered_binlog_offset= binlog_offset;
+ }
+ else if (binlog_offset > trx_sys.recovered_binlog_offset)
+ trx_sys.recovered_binlog_offset= binlog_offset;
+ trx_sys.recovered_binlog_is_legacy_pos= false;
}
-
#ifdef WITH_WSREP
trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
#endif
@@ -548,6 +565,7 @@ static void trx_rseg_init_binlog_info(const page_t* page)
trx_sys.recovered_binlog_offset = mach_read_from_8(
TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET
+ TRX_SYS + page);
+ trx_sys.recovered_binlog_is_legacy_pos= true;
}
#ifdef WITH_WSREP
@@ -562,6 +580,7 @@ dberr_t trx_rseg_array_init()
*trx_sys.recovered_binlog_filename = '\0';
trx_sys.recovered_binlog_offset = 0;
+ trx_sys.recovered_binlog_is_legacy_pos= false;
#ifdef WITH_WSREP
trx_sys.recovered_wsrep_xid.null();
XID wsrep_sys_xid;
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index e5e2ef9e..942b8bd4 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -582,6 +582,7 @@ static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo)
undo.top_page_no), 0, RW_S_LATCH, nullptr,
BUF_GET, &mtr, &err))
{
+ buf_page_make_young_if_needed(&block->page);
buf_block_t *undo_block= block;
const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset;
@@ -980,7 +981,13 @@ void trx_t::commit_empty(mtr_t *mtr)
trx_undo_t *&undo= rsegs.m_redo.undo;
ut_ad(undo->state == TRX_UNDO_ACTIVE || undo->state == TRX_UNDO_PREPARED);
- ut_ad(undo->size == 1);
+
+ if (UNIV_UNLIKELY(undo->size != 1))
+ {
+ sql_print_error("InnoDB: Undo log for transaction " TRX_ID_FMT
+ " is corrupted (" UINT32PF "!=1)", id, undo->size);
+ ut_ad("corrupted undo log" == 0);
+ }
if (buf_block_t *u=
buf_page_get(page_id_t(rseg->space->id, undo->hdr_page_no), 0,
@@ -1504,6 +1511,7 @@ void trx_t::commit_cleanup()
mutex.wr_lock();
state= TRX_STATE_NOT_STARTED;
+ *detailed_error= '\0';
mod_tables.clear();
check_foreigns= true;
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index 203edd9f..ccc68dfe 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -25,8 +25,8 @@ Created 3/26/1996 Heikki Tuuri
*******************************************************/
#include "trx0undo.h"
+#include "buf0rea.h"
#include "fsp0fsp.h"
-#include "mach0data.h"
#include "mtr0log.h"
#include "srv0mon.h"
#include "srv0srv.h"
@@ -178,8 +178,12 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+ if (UNIV_UNLIKELY(!block))
+ return nullptr;
- return block ? trx_undo_page_get_last_rec(block, page_no, offset) : nullptr;
+ if (!buf_page_make_young_if_needed(&block->page))
+ buf_read_ahead_linear(block->page.id(), 0, false);
+ return trx_undo_page_get_last_rec(block, page_no, offset);
}
/** Get the previous undo log record.
@@ -268,12 +272,16 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
uint16_t offset, ulint mode, const buf_block_t*& block,
mtr_t *mtr, dberr_t *err)
{
- block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
- nullptr, BUF_GET, mtr, err);
+ buf_block_t *b= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
+ nullptr, BUF_GET, mtr, err);
+ block= b;
if (!block)
return nullptr;
- if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+ if (!buf_page_make_young_if_needed(&b->page))
+ buf_read_ahead_linear(b->page.id(), 0, false);
+
+ if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset))
return rec;
return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
@@ -663,6 +671,8 @@ buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
if (!header_block)
goto func_exit;
+ buf_page_make_young_if_needed(&header_block->page);
+
*err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr);
if (UNIV_UNLIKELY(*err != DB_SUCCESS))
@@ -732,6 +742,8 @@ trx_undo_free_page(
return FIL_NULL;
}
+ buf_page_make_young_if_needed(&header_block->page);
+
*err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
mtr);
@@ -740,6 +752,14 @@ trx_undo_free_page(
return FIL_NULL;
}
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + header_block->page.frame);
+ if (UNIV_UNLIKELY(last_addr.page == page_no)) {
+ *err = DB_CORRUPTION;
+ return FIL_NULL;
+ }
+
*err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ header_block->page.frame,
rseg->space, page_no, mtr);
@@ -748,9 +768,6 @@ trx_undo_free_page(
}
buf_page_free(rseg->space, page_no, mtr);
- const fil_addr_t last_addr = flst_get_last(
- TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
- + header_block->page.frame);
rseg->curr_size--;
if (!in_history) {
@@ -794,6 +811,9 @@ static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit,
{
ut_ad(is_temp == !undo.rseg->is_persistent());
+ if (UNIV_UNLIKELY(undo.last_page_no == FIL_NULL))
+ return DB_CORRUPTION;
+
for (mtr_t mtr;;)
{
mtr.start();
@@ -887,15 +907,13 @@ trx_undo_truncate_start(
trx_undo_rec_t* last_rec;
mtr_t mtr;
+ ut_ad(rseg->is_persistent());
+
if (!limit) {
return DB_SUCCESS;
}
loop:
- mtr_start(&mtr);
-
- if (!rseg->is_persistent()) {
- mtr.set_log_mode(MTR_LOG_NO_REDO);
- }
+ mtr.start();
dberr_t err;
const buf_block_t* undo_page;
@@ -1263,6 +1281,8 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
return NULL;
}
+ buf_page_make_young_if_needed(&block->page);
+
UT_LIST_REMOVE(rseg->undo_cached, undo);
*pundo = undo;
@@ -1297,19 +1317,24 @@ trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ buf_block_t* block;
if (undo) {
- return buf_page_get_gen(
+ block = buf_page_get_gen(
page_id_t(undo->rseg->space->id, undo->last_page_no),
0, RW_X_LATCH, undo->guess_block,
BUF_GET, mtr, err);
+ if (UNIV_LIKELY(block != nullptr)) {
+ buf_page_make_young_if_needed(&block->page);
+ }
+ return block;
}
*err = DB_SUCCESS;
trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
rseg->latch.wr_lock(SRW_LOCK_CALL);
- buf_block_t* block = trx_undo_reuse_cached(
+ block = trx_undo_reuse_cached(
trx, rseg, &trx->rsegs.m_redo.undo, mtr, err);
if (!block) {
@@ -1350,12 +1375,17 @@ trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
: &trx->rsegs.m_redo.undo));
ut_ad(mtr->get_log_mode()
== (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+ buf_block_t* block;
if (*undo) {
- return buf_page_get_gen(
+ block = buf_page_get_gen(
page_id_t(rseg->space->id, (*undo)->last_page_no),
0, RW_X_LATCH, (*undo)->guess_block,
BUF_GET, mtr, err);
+ if (UNIV_LIKELY(block != nullptr)) {
+ buf_page_make_young_if_needed(&block->page);
+ }
+ return block;
}
DBUG_EXECUTE_IF(
@@ -1365,7 +1395,6 @@ trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
*err = DB_SUCCESS;
rseg->latch.wr_lock(SRW_LOCK_CALL);
- buf_block_t* block;
if (is_temp) {
ut_ad(!UT_LIST_GET_LEN(rseg->undo_cached));
} else {