Diffstat (limited to 'storage/innobase/btr/btr0defragment.cc')
-rw-r--r--  storage/innobase/btr/btr0defragment.cc  820
1 files changed, 820 insertions, 0 deletions
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..642db0e9
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,820 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "mysqld.h"
+
+#include <list>
+
+/* When there is no work, either because defragmentation is disabled or because
+no query has been submitted, the thread checks its state every
+BTR_DEFRAGMENT_SLEEP_IN_USECS. */
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when a compression failure happens
+during defragmentation. 512 is chosen because it is a power of 2 and roughly
+3% of the page size. When there are compression failures in defragmentation,
+our goal is to get a decent defragmentation ratio with as few compression
+failures as possible. From experimentation it seems that reducing the target
+size by 512 each time makes the page compressible within a couple of
+iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/** Item in the work queue for the defragmentation task. */
+struct btr_defragment_item_t
+{
+ /** persistent cursor where btr_defragment_n_pages should start */
+ btr_pcur_t * const pcur;
+ /** completion signal */
+ pthread_cond_t *cond;
+ /** timestamp of the last time this index was processed by the defragmentation task */
+ ulonglong last_processed= 0;
+
+ btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
+ : pcur(pcur), cond(cond) {}
+};
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+static mysql_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+Atomic_counter<ulint> btr_defragment_compression_failures;
+/* Number of btr_defragment_n_pages calls that altered a page but did not
+manage to free any page. */
+Atomic_counter<ulint> btr_defragment_failures;
+/* Total number of btr_defragment_n_pages calls that altered a page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+Atomic_counter<ulint> btr_defragment_count;
+
+bool btr_defragment_active;
+static void btr_defragment_chunk(void*);
+
+static tpool::timer* btr_defragment_timer;
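+/* A task group that allows at most one concurrent task, so only a single
+defragmentation chunk runs at any time. */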
+static tpool::task_group task_group(1);
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
+static void submit_defragment_task(void*arg=0)
+{
+ srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
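+ /* srv_defragment_interval is the minimum interval, in nanoseconds, between
+ two defragmentation passes over the same index; it is derived from the
+ configured passes-per-second srv_defragment_frequency. */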
+ srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+ mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
+ nullptr);
+ btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+ btr_defragment_active = true;
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+ if (!btr_defragment_timer)
+ return;
+ delete btr_defragment_timer;
+ btr_defragment_timer = 0;
+ task_group.cancel_pending(&btr_defragment_task);
+ mysql_mutex_lock(&btr_defragment_mutex);
+ std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ if (item->cond) {
+ pthread_cond_signal(item->cond);
+ }
+ }
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ mysql_mutex_destroy(&btr_defragment_mutex);
+ btr_defragment_active = false;
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index.
+Query threads find, add, and remove indexes. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index) /*!< Index to find. */
+{
+ mysql_mutex_lock(&btr_defragment_mutex);
+ for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ return true;
+ }
+ }
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ return false;
+}
+
+/** Defragment an index.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
+{
+ dict_stats_empty_defrag_summary(pcur->index());
+ pthread_cond_t cond;
+ pthread_cond_init(&cond, nullptr);
+ btr_defragment_item_t item(pcur, &cond);
+ mysql_mutex_lock(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(&item);
+ if (btr_defragment_wq.size() == 1)
+ /* Kick off defragmentation work */
+ btr_defragment_start();
+ bool interrupted= false;
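+ /* Wait for the defragmentation task to signal completion, waking up once
+ per second to check whether the session has been killed. */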
+ for (;;)
+ {
+ timespec abstime;
+ set_timespec(abstime, 1);
+ if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
+ break;
+ if (thd_killed(thd))
+ {
+ item.cond= nullptr;
+ interrupted= true;
+ break;
+ }
+ }
+
+ pthread_cond_destroy(&cond);
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ return interrupted;
+}
+
+/******************************************************************//**
+When a table is dropped, this function is called to wake up any sessions
+waiting on defragmentation of the table's indexes and to mark the
+corresponding items in btr_defragment_wq as cancelled. */
+void
+btr_defragment_remove_table(
+ dict_table_t* table) /*!< Table being dropped. */
+{
+ mysql_mutex_lock(&btr_defragment_mutex);
+ for (auto item : btr_defragment_wq)
+ {
+ if (item->cond && table == item->pcur->index()->table)
+ {
+ pthread_cond_signal(item->cond);
+ item->cond= nullptr;
+ }
+ }
+ mysql_mutex_unlock(&btr_defragment_mutex);
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+The index is queued for saving once srv_defragment_stats_accuracy
+defragmentation-modified records have accumulated. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
+{
+ if (srv_defragment_stats_accuracy != 0 // 0 means stats tracking is disabled
+ && index->table->space_id != 0 // do not track system tables
+ && !index->table->is_temporary()
+ && index->stat_defrag_modified_counter
+ >= srv_defragment_stats_accuracy) {
+ dict_stats_defrag_pool_add(index);
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
+Main defragmentation functions used by the defragmentation task. */
+/*************************************************************//**
+Calculate the number of records from the beginning of the block that can
+fit into size_limit
+@return number of records */
+static
+ulint
+btr_defragment_calc_n_recs_for_size(
+ buf_block_t* block, /*!< in: B-tree page */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint size_limit, /*!< in: size limit to fit records in */
+ ulint* n_recs_size) /*!< out: actual size of the records that fit
+ in size_limit. */
+{
+ page_t* page = buf_block_get_frame(block);
+ ulint n_recs = 0;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ ulint size = 0;
+ page_cur_t cur;
+
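+ /* rec_get_offsets() takes the number of core fields for leaf-page
+ records; node pointer (non-leaf) records pass 0. */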
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+ page_cur_set_before_first(block, &cur);
+ while (rec_t* cur_rec = page_cur_move_to_next(&cur)) {
+ if (page_rec_is_supremum(cur_rec)) {
+ break;
+ }
+ offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ulint rec_size = rec_offs_size(offsets);
+ size += rec_size;
+ if (size > size_limit) {
+ size = size - rec_size;
+ break;
+ }
+ n_recs ++;
+ }
+ *n_recs_size = size;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return n_recs;
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+at least an SX-latch on the index tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_search_father_node_ptr(
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ dict_index_t* index = btr_cur_get_index(cursor);
+ ut_ad(!index->is_spatial());
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ const auto level = btr_page_get_level(btr_cur_get_page(cursor));
+
+ const rec_t* user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+
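+ /* Search the tree one level above this page for the node pointer
+ record that points to it. */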
+ if (btr_cur_search_to_nth_level(level + 1,
+ dict_index_build_node_ptr(index,
+ user_rec, 0,
+ heap, level),
+ RW_X_LATCH,
+ cursor, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ const rec_t* node_ptr = btr_cur_get_rec(cursor);
+ ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
+ || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ offsets = nullptr;
+ }
+
+ return(offsets);
+}
+
+static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
+{
+ rec_t *rec=
+ page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+ if (UNIV_UNLIKELY(!rec))
+ return false;
+ cursor->page_cur.rec= rec;
+ mem_heap_t *heap= mem_heap_create(100);
+ const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr);
+ mem_heap_free(heap);
+ return got;
+}
+
+/*************************************************************//**
+Merge as many records as possible from from_block into to_block. Free
+from_block if all of its records are successfully merged into to_block.
+@return the to_block to target for next merge operation.
+@retval nullptr if corruption was noticed */
+static
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: ROW_FORMAT=COMPRESSED size */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint level = btr_page_get_level(from_page);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0;
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
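+ /* For ROW_FORMAT=COMPRESSED pages, *max_data_size estimates how much
+ data a page can hold and still compress; shrink the insert budget by
+ the difference from the uncompressed page size. */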
+ ulint page_diff = srv_page_size - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ dberr_t err = btr_page_reorganize_block(page_zip_level,
+ to_block, index, mtr);
+ if (err != DB_SUCCESS) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+ // If reorganization fails, the page is not
+ // compressible. There is no point in trying to
+ // merge into this page; continue to the next
+ // page.
+ return err == DB_FAIL ? from_block : nullptr;
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ if (max_ins_size < move_size) {
+ return nullptr;
+ }
+ }
+
+ // Move records to pack to_page as full as possible.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ dberr_t err;
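+ // Copy records into to_page, retrying with a smaller batch whenever
+ // compression of to_page fails.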
+ while (n_recs_to_move > 0) {
+ if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) {
+ return nullptr;
+ }
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr, &err);
+ if (orig_pred)
+ break;
+ if (err != DB_FAIL) {
+ return nullptr;
+ }
+
+ // If we reach here, compression failed after packing
+ // n_recs_to_move records into to_page. Reduce the
+ // target data size on to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ btr_defragment_compression_failures++;
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+ // If fewer than target_n_recs records were moved, there were
+ // compression failures during page_copy_rec_list_start(). Adjust
+ // the max_data_size estimate to reduce compression failures
+ // in subsequent runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ srv_page_size,
+ ULINT_UNDEFINED);
+ }
+ }
+ btr_cur_t parent;
+ parent.page_cur.index = index;
+ parent.page_cur.block = from_block;
+
+ if (!btr_page_search_father(mtr, &parent)) {
+ to_block = nullptr;
+ } else if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(*to_block, orig_pred,
+ from_block->page.id());
+ btr_search_drop_page_hash_index(from_block, false);
+ if (btr_level_list_remove(*from_block, *index, mtr)
+ != DB_SUCCESS
+ || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS
+ || btr_page_free(index, from_block, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+ } else {
+ // There are still records left on from_block, so the caller
+ // will count it as defragmented. Its node pointer changes,
+ // so the old node pointer must be removed.
+ if (n_recs_to_move > 0) {
+ // Part of the page was merged to the left; delete
+ // the moved records and update record locks and
+ // the node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ // FIXME: reuse the node_ptr!
+ if (btr_cur_node_ptr_delete(&parent, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page));
+ if (!rec) {
+ return nullptr;
+ }
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level);
+ if (btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
+ }
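+ // from_block still holds records; it becomes the target of the
+ // next merge.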
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge n_pages consecutive pages, starting from the given block.
+Skips space 0. Only considers leaf pages.
+This function first loads all the pages into memory, then for each page other
+than the first it tries to move as many records as possible to the left
+sibling to keep the left sibling full. During the process, if any page
+becomes empty, that page is removed from the level list. Record locks, the
+adaptive hash index, and node pointers are updated after page
+reorganization.
+@return pointer to the last block processed, or NULL if reaching end of index */
+static
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* We may need to load the (n+1)-th block, because if the last page is
+ freed we need to modify the prev_page_no of the block that follows it. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ if (!page_is_leaf(block->page.frame)) {
+ return NULL;
+ }
+
+ if (!index->table->space || !index->table->space_id) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ first_page = buf_block_get_frame(block);
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ uint32_t page_no = btr_page_get_next(page);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i;
+ end_of_index = TRUE;
+ break;
+ }
+
+ blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+ mtr);
+ if (!blocks[i]) {
+ return nullptr;
+ }
+ }
+
+ if (n_pages == 1) {
+ if (!page_has_prev(first_page)) {
+ /* no previous and no next page: this is
+ the only page on this level */
+ if (dict_index_get_page(index)
+ == page_get_page_no(first_page))
+ return NULL;
+ /* The page is not the root; lift its
+ records to the father page. */
+ dberr_t err;
+ btr_lift_page_up(index, block, mtr, &err);
+ }
+ return NULL;
+ }
+
+ /* 2. Calculate how many pages the data can fit in. If not compressible,
+ return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+ // For uncompressed pages, the optimal data size is the free space of an
+ // empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ uint i = 0;
+ // We estimate the optimal data size of the index using samples of
+ // data size. These samples are taken when a page fails to
+ // compress due to an insertion on the page. We use the average
+ // of all samples we have as the estimate. Different pages of
+ // the same index vary in compressibility, but the average gives
+ // a good enough estimate.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size /= i;
+ optimal_page_size = ut_min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
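+ // Reserve room for future inserts: the smaller of (1 - fill factor) of
+ // the optimal page size and the space needed for
+ // srv_defragment_fill_factor_n_recs average-sized records.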
+ reserved_space = ut_min(static_cast<ulint>(
+ static_cast<double>(optimal_page_size)
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
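+ // Minimum number of pages needed to hold all the data at the target
+ // fill level; if it is not smaller than n_pages, no page can be freed.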
+ n_new_slots = uint((total_data_size + optimal_page_size - 1)
+ / optimal_page_size);
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) {
+ n_defragmented ++;
+ current_block = new_block;
+ if (!new_block) {
+ break;
+ }
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++;
+ btr_defragment_count++;
+ if (n_pages == n_defragmented) {
+ btr_defragment_failures++;
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
+
+
+
+void btr_defragment_start() {
+ if (!srv_defragment)
+ return;
+ ut_ad(!btr_defragment_wq.empty());
+ submit_defragment_task();
+}
+
+
+/**
+Callback used by the defragmentation timer.
+
+Throttling ("sleep") is implemented by rescheduling the thread pool timer,
+which, when it fires, resumes the work where it was left off.
+
+The current work item remains at the head of btr_defragment_wq until it has
+been fully processed.
+*/
+static void btr_defragment_chunk(void*)
+{
+ THD *thd = innobase_create_background_thd("InnoDB defragment");
+ set_current_thd(thd);
+
+ btr_defragment_item_t* item = nullptr;
+ mtr_t mtr;
+
+ mysql_mutex_lock(&btr_defragment_mutex);
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ if (!item) {
+ if (btr_defragment_wq.empty()) {
+release_and_exit:
+ mysql_mutex_unlock(&btr_defragment_mutex);
+func_exit:
+ set_current_thd(nullptr);
+ destroy_background_thd(thd);
+ return;
+ }
+ item = *btr_defragment_wq.begin();
+ ut_ad(item);
+ }
+
+ if (!item->cond) {
+processed:
+ btr_defragment_wq.remove(item);
+ item = nullptr;
+ continue;
+ }
+
+ mysql_mutex_unlock(&btr_defragment_mutex);
+
+ ulonglong now = my_interval_timer();
+ ulonglong elapsed = now - item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+ /* If we see an index again before the interval
+ determined by the configured frequency has been
+ reached, we just sleep until the interval passes. Since
+ defragmentation of all indexes queues up on a single
+ thread, it is likely that the indexes following this one
+ will not need to sleep again. */
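+ /* Convert the remaining wait from nanoseconds
+ (my_interval_timer()) to milliseconds for the timer. */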
+ int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+ if (sleep_ms) {
+ btr_defragment_timer->set_time(sleep_ms, 0);
+ goto func_exit;
+ }
+ }
+ log_free_check();
+ mtr_start(&mtr);
+ dict_index_t *index = item->pcur->index();
+ index->set_modified(mtr);
+ /* To follow the latching order defined in WL#6326,
+ acquire index->lock X-latch. This entitles us to
+ acquire page latches in any order for the index. */
+ mtr_x_lock_index(index, &mtr);
+ if (buf_block_t *last_block =
+ item->pcur->restore_position(
+ BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
+ == btr_pcur_t::CORRUPTED
+ ? nullptr
+ : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
+ index, srv_defragment_n_pages,
+ &mtr)) {
+ /* If we have not reached the end of the index,
+ place the cursor on the last record of the last page,
+ store the cursor position, and leave the item in the
+ queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ if (rec && page_rec_is_user_rec(rec)) {
+ page_cur_position(rec, last_block,
+ btr_pcur_get_page_cur(
+ item->pcur));
+ }
+ btr_pcur_store_position(item->pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ item->last_processed = now;
+ mysql_mutex_lock(&btr_defragment_mutex);
+ } else {
+ mtr_commit(&mtr);
+ /* We have reached the end of the index. */
+ dict_stats_empty_defrag_stats(index);
+ if (dberr_t err= dict_stats_save_defrag_stats(index)) {
+ ib::error() << "Saving defragmentation stats for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ } else {
+ err = dict_stats_save_defrag_summary(index,
+ thd);
+
+ if (err != DB_SUCCESS) {
+ ib::error() << "Saving defragmentation summary for table "
+ << index->table->name
+ << " index " << index->name()
+ << " failed with error " << err;
+ }
+ }
+
+ mysql_mutex_lock(&btr_defragment_mutex);
+ if (item->cond) {
+ pthread_cond_signal(item->cond);
+ }
+ goto processed;
+ }
+ }
+
+ goto release_and_exit;
+}