author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:07:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:07:14 +0000
commit     a175314c3e5827eb193872241446f2f8f5c9d33c (patch)
tree       cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/page
parent     Initial commit. (diff)
Adding upstream version 1:10.5.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/page')
-rw-r--r--  storage/innobase/page/page0cur.cc   2983
-rw-r--r--  storage/innobase/page/page0page.cc  2499
-rw-r--r--  storage/innobase/page/page0zip.cc   4713
3 files changed, 10195 insertions, 0 deletions
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
new file mode 100644
index 00000000..cc6b1797
--- /dev/null
+++ b/storage/innobase/page/page0cur.cc
@@ -0,0 +1,2983 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file page/page0cur.cc
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+#include "rem0cmp.h"
+#include "gis0rtree.h"
+
+#include <algorithm>
+
+#ifdef BTR_CUR_HASH_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+static ulint page_cur_short_succ;
+# endif /* UNIV_SEARCH_PERF_STAT */
+
+/** Try a search shortcut based on the last insert.
+@param[in] block index page
+@param[in] index index tree
+@param[in] tuple search key
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[out] cursor page cursor
+@return true on success */
+UNIV_INLINE
+bool
+page_cur_try_search_shortcut(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* iup_matched_fields,
+ ulint* ilow_matched_fields,
+ page_cur_t* cursor)
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint up_match;
+	bool		success		= false;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(page_is_leaf(page));
+
+	rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+	ut_ad(rec);
+	ut_ad(page_rec_is_user_rec(rec));
+	offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+				  dtuple_get_n_fields(tuple), &heap);
+
+
+ low_match = up_match = std::min(*ilow_matched_fields,
+ *iup_matched_fields);
+
+ if (cmp_dtuple_rec_with_match(tuple, rec, offsets, &low_match) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (cmp_dtuple_rec_with_match(tuple, next_rec, offsets,
+ &up_match) >= 0) {
+ goto exit_func;
+ }
+
+ *iup_matched_fields = up_match;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+ *ilow_matched_fields = low_match;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+	success = true;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
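+/* The shortcut pays off during sequential (rightward) insertion: the
+caller only attempts it when PAGE_LAST_INSERT is set, the page has been
+growing in the PAGE_RIGHT direction, and PAGE_N_DIRECTION > 3; see the
+conditions in page_cur_search_with_match() below. */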
+
+/** Try a search shortcut based on the last insert.
+@param[in] block index page
+@param[in] index index tree
+@param[in] tuple search key
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor
+@return true on success */
+UNIV_INLINE
+bool
+page_cur_try_search_shortcut_bytes(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor)
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint low_bytes;
+ ulint up_match;
+ ulint up_bytes;
+	bool		success		= false;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(page_is_leaf(page));
+
+	rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+	ut_ad(rec);
+	ut_ad(page_rec_is_user_rec(rec));
+	offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+				  dtuple_get_n_fields(tuple), &heap);
+
+ if (ut_pair_cmp(*ilow_matched_fields, *ilow_matched_bytes,
+ *iup_matched_fields, *iup_matched_bytes) < 0) {
+ up_match = low_match = *ilow_matched_fields;
+ up_bytes = low_bytes = *ilow_matched_bytes;
+ } else {
+ up_match = low_match = *iup_matched_fields;
+ up_bytes = low_bytes = *iup_matched_bytes;
+ }
+
+ if (cmp_dtuple_rec_with_match_bytes(
+ tuple, rec, index, offsets, &low_match, &low_bytes) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ index->n_core_fields,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (cmp_dtuple_rec_with_match_bytes(
+ tuple, next_rec, index, offsets,
+ &up_match, &up_bytes)
+ >= 0) {
+ goto exit_func;
+ }
+
+ *iup_matched_fields = up_match;
+ *iup_matched_bytes = up_bytes;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+ *ilow_matched_fields = low_match;
+ *ilow_matched_bytes = low_bytes;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+	success = true;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+/****************************************************************//**
+Checks if the nth field in a record is a character type field which extends
+the nth field in tuple, i.e., the field is longer or equal in length and has
+common first characters.
+@return TRUE if rec field extends tuple field */
+static
+ibool
+page_cur_rec_field_extends(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: compare nth field */
+{
+ const dtype_t* type;
+ const dfield_t* dfield;
+ const byte* rec_f;
+ ulint rec_f_len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ dfield = dtuple_get_nth_field(tuple, n);
+
+ type = dfield_get_type(dfield);
+
+ rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len);
+
+ if (type->mtype == DATA_VARCHAR
+ || type->mtype == DATA_CHAR
+ || type->mtype == DATA_FIXBINARY
+ || type->mtype == DATA_BINARY
+ || type->mtype == DATA_BLOB
+ || DATA_GEOMETRY_MTYPE(type->mtype)
+ || type->mtype == DATA_VARMYSQL
+ || type->mtype == DATA_MYSQL) {
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL
+ && rec_f_len != UNIV_SQL_NULL
+ && rec_f_len >= dfield_get_len(dfield)
+ && !cmp_data_data(type->mtype, type->prtype,
+ dfield_get_data(dfield),
+ dfield_get_len(dfield),
+ rec_f, dfield_get_len(dfield))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
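+/* For example (an illustrative case): if the tuple field holds 'abc'
+and the corresponding record field holds 'abcde', the record field is
+at least as long and agrees on the first dfield_get_len() bytes, so
+page_cur_rec_field_extends() returns TRUE. */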
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+/****************************************************************//**
+Searches the right position for a page cursor. */
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in/out: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ page_cur_t* cursor, /*!< out: page cursor */
+ rtr_info_t* rtr_info)/*!< in/out: rtree search stack */
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint low_matched_fields;
+ ulint cur_matched_fields;
+ int cmp;
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_validate(tuple));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE
+ || dict_index_is_spatial(index));
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_d(page_check_dir(page));
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (n_core
+ && page_get_direction(page) == PAGE_RIGHT
+ && page_header_get_offs(page, PAGE_LAST_INSERT)
+ && mode == PAGE_CUR_LE
+ && !index->is_spatial()
+ && page_header_get_field(page, PAGE_N_DIRECTION) > 3
+ && page_cur_try_search_shortcut(
+ block, index, tuple,
+ iup_matched_fields, ilow_matched_fields, cursor)) {
+ return;
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* If the mode is for R-tree indexes, use the special MBR
+ related compare functions */
+ if (index->is_spatial() && mode > PAGE_CUR_LE) {
+ /* For leaf level insert, we still use the traditional
+ compare function for now */
+ if (mode == PAGE_CUR_RTREE_INSERT && n_core) {
+ mode = PAGE_CUR_LE;
+ } else {
+ rtr_cur_search_with_match(
+ block, (dict_index_t*)index, tuple, mode,
+ cursor, rtr_info);
+ return;
+ }
+ }
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
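+	/* For example, if a leaf page holds user records with keys
+	10, 20, 20, 30 and the tuple equals 20, the search positions
+	the cursor on the first record greater than 20 (here 30) for
+	PAGE_CUR_G, on the first 20 for PAGE_CUR_GE, on 10 for
+	PAGE_CUR_L, and on the second 20 for PAGE_CUR_LE. */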
+
+ up_matched_fields = *iup_matched_fields;
+ low_matched_fields = *ilow_matched_fields;
+
+ /* Perform binary search. First the search is done through the page
+ directory, after that as a linear search in the list of records
+ owned by the upper limit directory slot. */
+
+ low = 0;
+ up = ulint(page_dir_get_n_slots(page)) - 1;
+
+	/* Perform binary search until the lower and upper limit directory
+	slots are within a distance of 1 of each other */
+
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
+ cur_matched_fields = std::min(low_matched_fields,
+ up_matched_fields);
+
+ offsets = offsets_;
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, mid_rec, offsets, &cur_matched_fields);
+
+ if (cmp > 0) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+	/* Perform linear search until the upper and lower records are
+	within a distance of 1 of each other. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ cur_matched_fields = std::min(low_matched_fields,
+ up_matched_fields);
+
+ offsets = offsets_;
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match(
+ tuple, mid_rec, offsets, &cur_matched_fields);
+
+ if (cmp > 0) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+			if (!cur_matched_fields) {
+#ifdef UNIV_DEBUG
+				/* We got a match (cmp == 0), but
+				cur_matched_fields is 0; the record must
+				carry REC_INFO_MIN_REC_FLAG. */
+				ulint	rec_info = rec_get_info_bits(
+					mid_rec, rec_offs_comp(offsets));
+				ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
+				ut_ad(!page_has_prev(page));
+#endif
+
+ cur_matched_fields = dtuple_get_n_fields_cmp(tuple);
+ }
+
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *ilow_matched_fields = low_matched_fields;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
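+/* A minimal sketch of the two-phase search pattern implemented above,
+assuming hypothetical helpers slot_rec(), rec_key() and next() and a
+strictly ascending key order:
+
+	ulint low = 0, up = n_slots - 1;
+	while (up - low > 1) {			// binary search over slots
+		ulint mid = (low + up) / 2;
+		if (key > rec_key(slot_rec(mid)))
+			low = mid;
+		else
+			up = mid;
+	}
+	const rec_t *low_rec = slot_rec(low), *up_rec = slot_rec(up);
+	while (next(low_rec) != up_rec) {	// linear search in the group
+		const rec_t *mid_rec = next(low_rec);
+		if (key > rec_key(mid_rec))
+			low_rec = mid_rec;
+		else
+			up_rec = mid_rec;
+	}
+
+The real code additionally carries the number of fields already known to
+match at each boundary, so that cmp_dtuple_rec_with_match() can resume
+the comparison past the common prefix instead of starting from field 0. */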
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Search the right position for a page cursor.
+@param[in] block buffer block
+@param[in] index index tree
+@param[in] tuple key to be searched for
+@param[in] mode search mode
+@param[in,out] iup_matched_fields already matched fields in the
+upper limit record
+@param[in,out] iup_matched_bytes already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out] ilow_matched_fields already matched fields in the
+lower limit record
+@param[in,out] ilow_matched_bytes already matched bytes in the
+first partially matched field in the lower limit record
+@param[out] cursor page cursor */
+void
+page_cur_search_with_match_bytes(
+ const buf_block_t* block,
+ const dict_index_t* index,
+ const dtuple_t* tuple,
+ page_cur_mode_t mode,
+ ulint* iup_matched_fields,
+ ulint* iup_matched_bytes,
+ ulint* ilow_matched_fields,
+ ulint* ilow_matched_bytes,
+ page_cur_t* cursor)
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint up_matched_bytes;
+ ulint low_matched_fields;
+ ulint low_matched_bytes;
+ ulint cur_matched_fields;
+ ulint cur_matched_bytes;
+ int cmp;
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_validate(tuple));
+ ut_ad(!(tuple->info_bits & REC_INFO_MIN_REC_FLAG));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE);
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_d(page_check_dir(page));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (page_is_leaf(page)
+ && page_get_direction(page) == PAGE_RIGHT
+ && page_header_get_offs(page, PAGE_LAST_INSERT)
+ && mode == PAGE_CUR_LE
+ && page_header_get_field(page, PAGE_N_DIRECTION) > 3
+ && page_cur_try_search_shortcut_bytes(
+ block, index, tuple,
+ iup_matched_fields, iup_matched_bytes,
+ ilow_matched_fields, ilow_matched_bytes,
+ cursor)) {
+ return;
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
+
+ up_matched_fields = *iup_matched_fields;
+ up_matched_bytes = *iup_matched_bytes;
+ low_matched_fields = *ilow_matched_fields;
+ low_matched_bytes = *ilow_matched_bytes;
+
+ /* Perform binary search. First the search is done through the page
+ directory, after that as a linear search in the list of records
+ owned by the upper limit directory slot. */
+
+ low = 0;
+ up = ulint(page_dir_get_n_slots(page)) - 1;
+
+	/* Perform binary search until the lower and upper limit directory
+	slots are within a distance of 1 of each other */
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets_, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match_bytes(
+ tuple, mid_rec, index, offsets,
+ &cur_matched_fields, &cur_matched_bytes);
+
+ if (cmp > 0) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+	/* Perform linear search until the upper and lower records are
+	within a distance of 1 of each other. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ if (UNIV_UNLIKELY(rec_get_info_bits(
+ mid_rec,
+ dict_table_is_comp(index->table))
+ & REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(!page_has_prev(page_align(mid_rec)));
+ ut_ad(!page_rec_is_leaf(mid_rec)
+ || rec_is_metadata(mid_rec, *index));
+ cmp = 1;
+ goto low_rec_match;
+ }
+
+ offsets = rec_get_offsets(
+ mid_rec, index, offsets_, n_core,
+ dtuple_get_n_fields_cmp(tuple), &heap);
+
+ cmp = cmp_dtuple_rec_with_match_bytes(
+ tuple, mid_rec, index, offsets,
+ &cur_matched_fields, &cur_matched_bytes);
+
+ if (cmp > 0) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (cmp) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *iup_matched_bytes = up_matched_bytes;
+ *ilow_matched_fields = low_matched_fields;
+ *ilow_matched_bytes = low_matched_bytes;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ const ulint n_recs = page_get_n_recs(block->frame);
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return;
+ }
+
+ cursor->rec = page_rec_get_nth(block->frame,
+ ut_rnd_interval(n_recs) + 1);
+}
+
+/**
+Set the number of owned records.
+@param[in,out] rec record in block.frame
+@param[in] n_owned number of records skipped in the sparse page directory
+@param[in] comp whether ROW_FORMAT is COMPACT or DYNAMIC */
+static void page_rec_set_n_owned(rec_t *rec, ulint n_owned, bool comp)
+{
+ rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+ *rec= static_cast<byte>((*rec & ~REC_N_OWNED_MASK) |
+ (n_owned << REC_N_OWNED_SHIFT));
+}
+
+/**
+Split a directory slot which owns too many records.
+@param[in,out] block index page
+@param[in,out] slot the slot that needs to be split */
+static void page_dir_split_slot(const buf_block_t &block,
+ page_dir_slot_t *slot)
+{
+ ut_ad(slot <= &block.frame[srv_page_size - PAGE_EMPTY_DIR_START]);
+ slot= my_assume_aligned<2>(slot);
+
+ const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
+
+ ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+ static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+ PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
+
+ /* Find a record approximately in the middle. */
+ const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+
+ for (ulint i= n_owned / 2; i--; )
+ rec= page_rec_get_next_const(rec);
+
+ /* Add a directory slot immediately below this one. */
+ constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.frame);
+ const uint16_t n_slots= mach_read_from_2(n_slots_p);
+
+ page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+ (block.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
+ ut_ad(slot >= last_slot);
+ memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+ slot - last_slot);
+
+ const ulint half_owned= n_owned / 2;
+
+ mach_write_to_2(n_slots_p, n_slots + 1);
+
+ mach_write_to_2(slot, rec - block.frame);
+ const bool comp= page_is_comp(block.frame) != 0;
+ page_rec_set_n_owned(page_dir_slot_get_rec(slot), half_owned, comp);
+ page_rec_set_n_owned(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+ n_owned - half_owned, comp);
+}
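+/* For example, with the usual limits PAGE_DIR_SLOT_MAX_N_OWNED = 8 and
+PAGE_DIR_SLOT_MIN_N_OWNED = 4, a slot that has come to own 9 records is
+split at the record near the middle into two slots owning 4 and 5
+records, so both halves stay at or above the minimum, as the
+static_assert above guarantees. */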
+
+/**
+Split a directory slot which owns too many records.
+@param[in,out] block index page (ROW_FORMAT=COMPRESSED)
+@param[in] s the slot that needs to be split
+@param[in,out] mtr mini-transaction */
+static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
+{
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(block->frame));
+ ut_ad(s);
+
+ page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, s);
+ const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
+
+ ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
+ static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >=
+ PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
+
+  /* Find a record approximately in the middle of the records
+  owned by the slot. */
+
+ const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+
+ for (ulint i= n_owned / 2; i--; )
+ rec= page_rec_get_next_const(rec);
+
+ /* Add a directory slot immediately below this one. */
+ constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame);
+ const uint16_t n_slots= mach_read_from_2(n_slots_p);
+
+ page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
+ (block->frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
+ memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
+ slot - last_slot);
+
+ const ulint half_owned= n_owned / 2;
+
+ mtr->write<2>(*block, n_slots_p, 1U + n_slots);
+
+ /* Log changes to the compressed page header and the dense page directory. */
+ memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2);
+ mach_write_to_2(slot, page_offset(rec));
+ page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned,
+ true, mtr);
+ page_rec_set_n_owned<true>(block,
+ page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
+ n_owned - half_owned, true, mtr);
+}
+
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that the slot owns at least the minimum number of records;
+this may result in merging the two slots.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] s the slot to be balanced
+@param[in,out] mtr mini-transaction */
+static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
+{
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(block->frame));
+ ut_ad(s > 0);
+
+ const ulint n_slots = page_dir_get_n_slots(block->frame);
+
+ if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+ /* The last directory slot cannot be balanced. */
+ return;
+ }
+
+ ut_ad(s < n_slots);
+
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block->frame, s);
+ rec_t* const up_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+ rec_t* const slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot));
+ const ulint up_n_owned = rec_get_n_owned_new(up_rec);
+
+ ut_ad(rec_get_n_owned_new(page_dir_slot_get_rec(slot))
+ == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ /* Merge the slots. */
+ page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+ page_rec_set_n_owned<true>(block, up_rec, up_n_owned
+ + (PAGE_DIR_SLOT_MIN_N_OWNED - 1),
+ true, mtr);
+ /* Shift the slots */
+ page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ block->frame, n_slots - 1);
+ memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+ slot - last_slot);
+ constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>
+ (n_slots_f + block->frame);
+ mtr->write<2>(*block, n_slots_p, n_slots - 1);
+ memcpy_aligned<2>(n_slots_f + block->page.zip.data,
+ n_slots_p, 2);
+ memset_aligned<2>(last_slot, 0, 2);
+ return;
+ }
+
+ /* Transfer one record to the underfilled slot */
+ page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
+ rec_t* new_rec = rec_get_next_ptr(slot_rec, TRUE);
+ page_rec_set_n_owned<true>(block, new_rec,
+ PAGE_DIR_SLOT_MIN_N_OWNED,
+ true, mtr);
+ mach_write_to_2(slot, page_offset(new_rec));
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
+}
+
+/**
+Try to balance an underfilled directory slot with an adjacent one,
+so that the slot owns at least the minimum number of records;
+this may result in merging the two slots.
+@param[in,out] block index page
+@param[in] s the slot to be balanced */
+static void page_dir_balance_slot(const buf_block_t &block, ulint s)
+{
+ const bool comp= page_is_comp(block.frame);
+ ut_ad(!block.page.zip.data);
+ ut_ad(s > 0);
+
+ const ulint n_slots = page_dir_get_n_slots(block.frame);
+
+ if (UNIV_UNLIKELY(s + 1 == n_slots)) {
+ /* The last directory slot cannot be balanced. */
+ return;
+ }
+
+ ut_ad(s < n_slots);
+
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block.frame, s);
+ rec_t* const up_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
+ rec_t* const slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(slot));
+ const ulint up_n_owned = comp
+ ? rec_get_n_owned_new(up_rec)
+ : rec_get_n_owned_old(up_rec);
+
+ ut_ad(page_dir_slot_get_n_owned(slot)
+ == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ /* Merge the slots. */
+ page_rec_set_n_owned(slot_rec, 0, comp);
+ page_rec_set_n_owned(up_rec, up_n_owned
+ + (PAGE_DIR_SLOT_MIN_N_OWNED - 1), comp);
+ /* Shift the slots */
+ page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ block.frame, n_slots - 1);
+ memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
+ slot - last_slot);
+ memset_aligned<2>(last_slot, 0, 2);
+ constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
+ byte *n_slots_p= my_assume_aligned<2>
+ (n_slots_f + block.frame);
+ mach_write_to_2(n_slots_p, n_slots - 1);
+ return;
+ }
+
+ /* Transfer one record to the underfilled slot */
+ rec_t* new_rec;
+
+ if (comp) {
+ page_rec_set_n_owned(slot_rec, 0, true);
+ new_rec = rec_get_next_ptr(slot_rec, TRUE);
+ page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED, true);
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
+ } else {
+ page_rec_set_n_owned(slot_rec, 0, false);
+ new_rec = rec_get_next_ptr(slot_rec, FALSE);
+ page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED,
+ false);
+ page_rec_set_n_owned(up_rec, up_n_owned - 1, false);
+ }
+
+ mach_write_to_2(slot, page_offset(new_rec));
+}
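+/* For example, with PAGE_DIR_SLOT_MIN_N_OWNED = 4: when a slot drops to
+owning 3 records and its neighbouring slot owns 4 or fewer, the two are
+merged into one slot owning at most 7 records; when the neighbour owns 5
+or more, a single record is transferred instead, bringing the
+underfilled slot back to the minimum of 4. */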
+
+/** Allocate space for inserting an index record.
+@tparam compressed whether to also update the ROW_FORMAT=COMPRESSED page
+@param[in,out] block index page
+@param[in] need number of bytes needed
+@param[out] heap_no record heap number
+@return pointer to the start of the allocated buffer
+@retval NULL if allocation fails */
+template<bool compressed=false>
+static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
+ ulint *heap_no)
+{
+ ut_ad(!compressed || block->page.zip.data);
+
+ byte *heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+ block->frame);
+
+ const uint16_t top= mach_read_from_2(heap_top);
+
+ if (need > page_get_max_insert_size(block->frame, 1))
+ return NULL;
+
+ byte *n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block->frame);
+
+ const uint16_t h= mach_read_from_2(n_heap);
+ if (UNIV_UNLIKELY((h + 1) & 0x6000))
+ {
+ /* At the minimum record size of 5+2 bytes, we can only reach this
+ condition when using innodb_page_size=64k. */
+ ut_ad((h & 0x7fff) == 8191);
+ ut_ad(srv_page_size == 65536);
+ return NULL;
+ }
+
+ *heap_no= h & 0x7fff;
+ ut_ad(*heap_no < srv_page_size / REC_N_NEW_EXTRA_BYTES);
+ compile_time_assert(UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES < 0x3fff);
+
+ mach_write_to_2(heap_top, top + need);
+ mach_write_to_2(n_heap, h + 1);
+
+ if (compressed)
+ {
+ ut_ad(h & 0x8000);
+ memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER],
+ heap_top, 4);
+ }
+
+ return &block->frame[top];
+}
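+/* Typical use (an illustrative sketch; see the call in
+page_cur_insert_rec_low() below):
+
+	ulint heap_no;
+	byte *insert_buf = page_mem_alloc_heap(block, rec_size, &heap_no);
+	if (UNIV_UNLIKELY(!insert_buf))
+		return nullptr;	// page heap exhausted; caller may reorganize
+*/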
+
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=REDUNDANT.
+@param block B-tree or R-tree page
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_OLD_INFIMUM
+@param info_bits info_bits of the record
+@param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
+@param hdr_c number of common record header bytes with prev_rec
+@param data_c number of common data bytes with prev_rec
+@param hdr record header bytes to copy to the log
+@param hdr_l number of copied record header bytes
+@param data record payload bytes to copy to the log
+@param data_l number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_bits,
+ ulint n_fields_s, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l)
+{
+ ut_ad(!block.page.zip.data);
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_ad(n_slots >= 2);
+ ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+ ut_ad(&block.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end);
+ ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+ page_end);
+ ut_ad(fil_page_index_page_check(block.frame));
+ ut_ad(!(~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_bits));
+ ut_ad(n_fields_s >= 2);
+ ut_ad((n_fields_s >> 1) <= REC_MAX_N_FIELDS);
+ ut_ad(data_l + data_c <= REDUNDANT_REC_MAX_DATA_SIZE);
+
+ set_modified(block);
+
+ static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+ static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+ n_fields_s= (n_fields_s - 2) << 2 | info_bits >> 4;
+
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ static_assert((REC_MAX_N_FIELDS << 1 | 1) <= MIN_3BYTE, "compatibility");
+ len+= n_fields_s < MIN_2BYTE ? 1 : 2;
+ len+= hdr_c < MIN_2BYTE ? 1 : 2;
+ static_assert(REDUNDANT_REC_MAX_DATA_SIZE <= MIN_3BYTE, "compatibility");
+ len+= data_c < MIN_2BYTE ? 1 : 2;
+ len+= hdr_l + data_l;
+
+ const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+ if (UNIV_LIKELY(small))
+ {
+ ut_d(const byte * const end = l + len);
+ *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, n_fields_s);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ::memcpy(l, hdr, hdr_l);
+ l+= hdr_l;
+ ::memcpy(l, data, data_l);
+ l+= data_l;
+ ut_ad(end == l);
+ m_log.close(l);
+ }
+ else
+ {
+ m_log.close(l);
+ l= m_log.open(len - hdr_l - data_l);
+ ut_d(const byte * const end = l + len - hdr_l - data_l);
+ *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT;
+ l= mlog_encode_varint(l, prev_rec);
+ l= mlog_encode_varint(l, n_fields_s);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+ m_log.push(data, static_cast<uint32_t>(data_l));
+ }
+
+ m_last_offset= FIL_PAGE_TYPE;
+}
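+/* Worked example (illustrative figures): suppose the record being
+inserted shares its first hdr_c = 3 header bytes and first data_c = 12
+data bytes with the preceding record, leaving hdr_l = 4 header bytes and
+data_l = 20 data bytes to copy. The log record then consists of the
+one-byte subtype, the varint-encoded prev_rec, n_fields_s, hdr_c and
+data_c (one byte each while the values stay below MIN_2BYTE), and the 24
+literal bytes; recovery rebuilds the record by copying the common
+prefixes from prev_rec. */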
+
+/** Write log for inserting a B-tree or R-tree record in
+ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
+@param block B-tree or R-tree page
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev_rec byte offset of the predecessor of the record to insert,
+ starting from PAGE_NEW_INFIMUM
+@param info_status rec_get_info_and_status_bits()
+@param shift unless !reuse: number of bytes the PAGE_FREE is moving
+@param hdr_c number of common record header bytes with prev_rec
+@param data_c number of common data bytes with prev_rec
+@param hdr record header bytes to copy to the log
+@param hdr_l number of copied record header bytes
+@param data record payload bytes to copy to the log
+@param data_l number of copied record data bytes */
+inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
+ ulint prev_rec, byte info_status,
+ ssize_t shift, size_t hdr_c, size_t data_c,
+ const byte *hdr, size_t hdr_l,
+ const byte *data, size_t data_l)
+{
+ ut_ad(!block.page.zip.data);
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_ad(n_slots >= 2);
+ ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
+ ut_ad(&block.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end);
+ ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
+ page_end);
+ ut_ad(fil_page_index_page_check(block.frame));
+ ut_ad(hdr_l + hdr_c + data_l + data_c <=
+ static_cast<size_t>(page_end - &block.frame[PAGE_NEW_SUPREMUM_END]));
+ ut_ad(reuse || shift == 0);
+#ifdef UNIV_DEBUG
+ switch (~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_status) {
+ default:
+ ut_ad(0);
+ break;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!page_is_leaf(block.frame));
+ break;
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ ut_ad(page_is_leaf(block.frame));
+ }
+#endif
+
+ set_modified(block);
+
+ static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility");
+ static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility");
+ static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+
+ const size_t enc_hdr_l= hdr_l << 3 |
+ (info_status & REC_STATUS_INSTANT) | info_status >> 4;
+ size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+ static_assert(REC_MAX_N_FIELDS * 2 < MIN_3BYTE, "compatibility");
+ if (reuse)
+ {
+ if (shift < 0)
+ shift= -shift << 1 | 1;
+ else
+ shift<<= 1;
+ len+= static_cast<size_t>(shift) < MIN_2BYTE
+ ? 1 : static_cast<size_t>(shift) < MIN_3BYTE ? 2 : 3;
+ }
+ ut_ad(hdr_c + hdr_l <= REC_MAX_N_FIELDS * 2);
+ len+= hdr_c < MIN_2BYTE ? 1 : 2;
+ len+= enc_hdr_l < MIN_2BYTE ? 1 : enc_hdr_l < MIN_3BYTE ? 2 : 3;
+ len+= data_c < MIN_2BYTE ? 1 : data_c < MIN_3BYTE ? 2 : 3;
+ len+= hdr_l + data_l;
+
+ const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+ byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small);
+
+ if (UNIV_LIKELY(small))
+ {
+ ut_d(const byte * const end = l + len);
+ *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ if (reuse)
+ l= mlog_encode_varint(l, shift);
+ l= mlog_encode_varint(l, enc_hdr_l);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ::memcpy(l, hdr, hdr_l);
+ l+= hdr_l;
+ ::memcpy(l, data, data_l);
+ l+= data_l;
+ ut_ad(end == l);
+ m_log.close(l);
+ }
+ else
+ {
+ m_log.close(l);
+ l= m_log.open(len - hdr_l - data_l);
+ ut_d(const byte * const end = l + len - hdr_l - data_l);
+ *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC;
+ l= mlog_encode_varint(l, prev_rec);
+ if (reuse)
+ l= mlog_encode_varint(l, shift);
+ l= mlog_encode_varint(l, enc_hdr_l);
+ l= mlog_encode_varint(l, hdr_c);
+ l= mlog_encode_varint(l, data_c);
+ ut_ad(end == l);
+ m_log.close(l);
+ m_log.push(hdr, static_cast<uint32_t>(hdr_l));
+ m_log.push(data, static_cast<uint32_t>(data_l));
+ }
+
+ m_last_offset= FIL_PAGE_TYPE;
+}
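+/* The PAGE_FREE shift above is zig-zag encoded so that small
+displacements in either direction fit in one varint byte: for example,
+shift = -3 is logged as 3 << 1 | 1 = 7, and shift = +3 as 6. */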
+
+/***********************************************************//**
+Inserts a record next to the page cursor on an uncompressed page.
+Returns a pointer to the inserted record on success, i.e., when enough
+space is available; otherwise returns NULL. The cursor stays at the
+same position.
+@return pointer to record on success, NULL otherwise */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	const page_cur_t* cur,	/*!< in: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: record to insert after cur */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block= cur->block;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) > 0);
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+ ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets));
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) ==
+ index->id ||
+ mtr->is_inside_ibuf());
+ ut_ad(page_dir_get_n_slots(block->frame) >= 2);
+
+ ut_ad(!page_rec_is_supremum(cur->rec));
+
+ /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
+ ut_ad(mtr->get_log_mode() != MTR_LOG_ALL ||
+ !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
+
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+ {
+ const void *rec_start __attribute__((unused))=
+ rec - rec_offs_extra_size(offsets);
+ ulint extra_size __attribute__((unused))=
+ rec_offs_extra_size(offsets) -
+ (page_is_comp(block->frame)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+ /* All data bytes of the record must be valid. */
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ MEM_CHECK_DEFINED(rec_start, extra_size);
+ }
+#endif /* HAVE_MEM_CHECK */
+
+ /* 2. Try to find suitable space from page memory management */
+ bool reuse= false;
+ ssize_t free_offset= 0;
+ ulint heap_no;
+ byte *insert_buf;
+
+ const bool comp= page_is_comp(block->frame);
+ const ulint extra_size= rec_offs_extra_size(offsets);
+
+ if (rec_t* free_rec= page_header_get_ptr(block->frame, PAGE_FREE))
+ {
+ /* Try to reuse the head of PAGE_FREE. */
+ rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ rec_offs *foffsets= rec_get_offsets(free_rec, index, foffsets_,
+ page_is_leaf(block->frame)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ const ulint fextra_size= rec_offs_extra_size(foffsets);
+ insert_buf= free_rec - fextra_size;
+ const bool too_small= (fextra_size + rec_offs_data_size(foffsets)) <
+ rec_size;
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ if (too_small)
+ goto use_heap;
+
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->frame);
+ if (comp)
+ {
+ heap_no= rec_get_heap_no_new(free_rec);
+ uint16_t next= mach_read_from_2(free_rec - REC_NEXT);
+ mach_write_to_2(page_free, next
+ ? static_cast<uint16_t>(free_rec + next - block->frame)
+ : 0);
+ }
+ else
+ {
+ heap_no= rec_get_heap_no_old(free_rec);
+ memcpy(page_free, free_rec - REC_NEXT, 2);
+ }
+
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(page_garbage) >= rec_size);
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - rec_size);
+ reuse= true;
+ free_offset= extra_size - fextra_size;
+ }
+ else
+ {
+use_heap:
+ insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(!insert_buf))
+ return nullptr;
+ }
+
+ ut_ad(cur->rec != insert_buf + extra_size);
+
+ rec_t *next_rec= block->frame + rec_get_next_offs(cur->rec, comp);
+ ut_ad(next_rec != block->frame);
+
+ /* Update page header fields */
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block->frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ ut_ad(!last_insert || !comp ||
+ rec_get_node_ptr_flag(block->frame + last_insert) ==
+ rec_get_node_ptr_flag(rec));
+
+ /* Write PAGE_LAST_INSERT */
+ mach_write_to_2(page_last_insert, page_offset(insert_buf + extra_size));
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ if (block->frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block->frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n= my_assume_aligned<2>
+ (&block->frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n, 0, 2);
+ }
+ else if (block->frame + last_insert == cur->rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (next_rec == block->frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ /* Update the preceding record header, the 'owner' record and
+ prepare the record to insert. */
+ rec_t *insert_rec= insert_buf + extra_size;
+ const ulint data_size= rec_offs_data_size(offsets);
+ memcpy(insert_buf, rec - extra_size, extra_size + data_size);
+ size_t hdr_common= 0;
+ ulint n_owned;
+ const byte info_status= static_cast<byte>
+ (rec_get_info_and_status_bits(rec, comp));
+ ut_ad(!(rec_get_info_bits(rec, comp) &
+ ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG)));
+
+ if (comp)
+ {
+#ifdef UNIV_DEBUG
+ switch (rec_get_status(cur->rec)) {
+ case REC_STATUS_ORDINARY:
+ case REC_STATUS_NODE_PTR:
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_INFIMUM:
+ break;
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong status on cur->rec" == 0);
+ }
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_NODE_PTR:
+ ut_ad(!page_is_leaf(block->frame));
+ break;
+ case REC_STATUS_INSTANT:
+ ut_ad(index->is_instant());
+ ut_ad(page_is_leaf(block->frame));
+ if (!rec_is_metadata(rec, true))
+ break;
+ ut_ad(cur->rec == &block->frame[PAGE_NEW_INFIMUM]);
+ break;
+ case REC_STATUS_ORDINARY:
+ ut_ad(page_is_leaf(block->frame));
+ ut_ad(!(rec_get_info_bits(rec, true) & ~REC_INFO_DELETED_FLAG));
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad("wrong status on rec" == 0);
+ }
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+#endif
+
+ rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ insert_rec[-REC_NEW_STATUS]= rec[-REC_NEW_STATUS];
+ rec_set_bit_field_2(insert_rec, heap_no,
+ REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(insert_rec - REC_NEXT,
+ static_cast<uint16_t>(next_rec - insert_rec));
+ mach_write_to_2(cur->rec - REC_NEXT,
+ static_cast<uint16_t>(insert_rec - cur->rec));
+ while (!(n_owned= rec_get_n_owned_new(next_rec)))
+ {
+ next_rec= block->frame + rec_get_next_offs(next_rec, true);
+ ut_ad(next_rec != block->frame);
+ }
+ rec_set_bit_field_1(next_rec, n_owned + 1, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ mtr->set_modified(*block);
+ goto copied;
+ }
+
+ const byte * const c_start= cur->rec - extra_size;
+ if (extra_size > REC_N_NEW_EXTRA_BYTES &&
+ c_start >=
+ &block->frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES])
+ {
+ /* Find common header bytes with the preceding record. */
+ const byte *r= rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ for (const byte *c= cur->rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ *r == *c && c-- != c_start; r--);
+ hdr_common= static_cast<size_t>((rec - (REC_N_NEW_EXTRA_BYTES + 1)) - r);
+ ut_ad(hdr_common <= extra_size - REC_N_NEW_EXTRA_BYTES);
+ }
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(block->frame));
+ else if (rec_is_metadata(rec, false))
+ {
+ ut_ad(index->is_instant());
+ ut_ad(cur->rec == &block->frame[PAGE_OLD_INFIMUM]);
+ }
+#endif
+ rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(insert_rec, heap_no,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ memcpy(insert_rec - REC_NEXT, cur->rec - REC_NEXT, 2);
+ mach_write_to_2(cur->rec - REC_NEXT, page_offset(insert_rec));
+ while (!(n_owned= rec_get_n_owned_old(next_rec)))
+ {
+ next_rec= block->frame + rec_get_next_offs(next_rec, false);
+ ut_ad(next_rec != block->frame);
+ }
+ rec_set_bit_field_1(next_rec, n_owned + 1, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ mtr->set_modified(*block);
+ goto copied;
+ }
+
+ ut_ad(extra_size > REC_N_OLD_EXTRA_BYTES);
+ const byte * const c_start= cur->rec - extra_size;
+ if (c_start >=
+ &block->frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES])
+ {
+ /* Find common header bytes with the preceding record. */
+ const byte *r= rec - (REC_N_OLD_EXTRA_BYTES + 1);
+ for (const byte *c= cur->rec - (REC_N_OLD_EXTRA_BYTES + 1);
+ *r == *c && c-- != c_start; r--);
+ hdr_common= static_cast<size_t>((rec - (REC_N_OLD_EXTRA_BYTES + 1)) - r);
+ ut_ad(hdr_common <= extra_size - REC_N_OLD_EXTRA_BYTES);
+ }
+ }
+
+ /* Insert the record, possibly copying from the preceding record. */
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ {
+ const byte *r= rec;
+ const byte *c= cur->rec;
+ const byte *c_end= cur->rec + data_size;
+ if (c <= insert_buf && c_end > insert_buf)
+ c_end= insert_buf;
+ else
+ c_end= std::min<const byte*>(c_end, block->frame + srv_page_size -
+ PAGE_DIR - PAGE_DIR_SLOT_SIZE *
+ page_dir_get_n_slots(block->frame));
+ size_t data_common;
+ /* Copy common data bytes of the preceding record. */
+ for (; c != c_end && *r == *c; c++, r++);
+ data_common= static_cast<size_t>(r - rec);
+
+ if (comp)
+ mtr->page_insert(*block, reuse,
+ cur->rec - block->frame - PAGE_NEW_INFIMUM,
+ info_status, free_offset, hdr_common, data_common,
+ insert_buf,
+ extra_size - hdr_common - REC_N_NEW_EXTRA_BYTES,
+ r, data_size - data_common);
+ else
+ mtr->page_insert(*block, reuse,
+ cur->rec - block->frame - PAGE_OLD_INFIMUM,
+ info_status, rec_get_n_fields_old(insert_rec) << 1 |
+ rec_get_1byte_offs_flag(insert_rec),
+ hdr_common, data_common,
+ insert_buf,
+ extra_size - hdr_common - REC_N_OLD_EXTRA_BYTES,
+ r, data_size - data_common);
+ }
+
+copied:
+ ut_ad(!memcmp(insert_buf, rec - extra_size, extra_size -
+ (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)));
+ ut_ad(!memcmp(insert_rec, rec, data_size));
+ /* We have incremented the n_owned field of the owner record.
+ If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the
+ corresponding directory slot in two. */
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ {
+ const auto owner= page_dir_find_owner_slot(next_rec);
+ page_dir_split_slot(*block, page_dir_get_nth_slot(block->frame, owner));
+ }
+
+ rec_offs_make_valid(insert_buf + extra_size, index,
+ page_is_leaf(block->frame), offsets);
+ return insert_buf + extra_size;
+}
+
+/** Add a slot to the dense page directory.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] index the index that the page belongs to
+@param[in,out] mtr mini-transaction */
+static inline void page_zip_dir_add_slot(buf_block_t *block,
+ const dict_index_t *index, mtr_t *mtr)
+{
+ page_zip_des_t *page_zip= &block->page.zip;
+
+ ut_ad(page_is_comp(page_zip->data));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Read the old n_dense (n_heap has already been incremented). */
+ ulint n_dense= page_dir_get_n_heap(page_zip->data) - (PAGE_HEAP_NO_USER_LOW +
+ 1U);
+
+ byte *dir= page_zip->data + page_zip_get_size(page_zip) -
+ PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ byte *stored= dir;
+
+ if (!page_is_leaf(page_zip->data))
+ {
+ ut_ad(!page_zip->n_blobs);
+ stored-= n_dense * REC_NODE_PTR_SIZE;
+ }
+ else if (index->is_clust())
+ {
+ /* Move the BLOB pointer array backwards to make space for the
+ columns DB_TRX_ID,DB_ROLL_PTR and the dense directory slot. */
+
+ stored-= n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ byte *externs= stored - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ byte *dst= externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+ ut_ad(!memcmp(dst, field_ref_zero, PAGE_ZIP_CLUST_LEAF_SLOT_SIZE));
+ if (const ulint len = ulint(stored - externs))
+ {
+ memmove(dst, externs, len);
+ mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data,
+ len);
+ }
+ }
+ else
+ {
+ stored-= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(!memcmp(stored - PAGE_ZIP_DIR_SLOT_SIZE, field_ref_zero,
+ PAGE_ZIP_DIR_SLOT_SIZE));
+ }
+
+ /* Move the uncompressed area backwards to make space
+ for one directory slot. */
+ if (const ulint len = ulint(dir - stored))
+ {
+ byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE;
+ memmove(dst, stored, len);
+ mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len);
+ }
+}
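+/* Sketch of the trailer being adjusted above, going backwards from the
+end of page_zip->data: the dense directory of n_dense slots; below it,
+node pointers on non-leaf pages, or, on clustered-index leaf pages, the
+stored DB_TRX_ID,DB_ROLL_PTR columns followed by the BLOB pointer array.
+Allocating one more heap record therefore moves this uncompressed area
+backwards by PAGE_ZIP_DIR_SLOT_SIZE bytes. */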
+
+/***********************************************************//**
+Inserts a record next to the page cursor on a compressed and uncompressed
+page. Returns a pointer to the inserted record on success, i.e., when
+enough space is available; otherwise returns NULL.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor);
+ ut_ad(page_zip);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(index->table->not_redundant());
+ ut_ad(page_is_comp(cursor->block->frame));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX ||
+ fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE);
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) ==
+ index->id || mtr->is_inside_ibuf());
+ ut_ad(!page_get_instant(cursor->block->frame));
+ ut_ad(!page_cur_is_after_last(cursor));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, cursor->block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* 1. Get the size of the physical record in the page */
+ const ulint rec_size= rec_offs_size(offsets);
+
+#ifdef HAVE_MEM_CHECK
+ {
+ const void *rec_start __attribute__((unused))=
+ rec - rec_offs_extra_size(offsets);
+ ulint extra_size __attribute__((unused))=
+ rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES;
+ /* All data bytes of the record must be valid. */
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ MEM_CHECK_DEFINED(rec_start, extra_size);
+ }
+#endif /* HAVE_MEM_CHECK */
+ const bool reorg_before_insert= page_has_garbage(cursor->block->frame) &&
+ rec_size > page_get_max_insert_size(cursor->block->frame, 1) &&
+ rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame,
+ 1);
+ constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER;
+ byte* const page_free = my_assume_aligned<4>(page_free_f +
+ cursor->block->frame);
+ uint16_t free_rec= 0;
+
+ /* 2. Try to find suitable space from page memory management */
+ ulint heap_no;
+ byte *insert_buf;
+
+ if (reorg_before_insert ||
+ !page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ {
+ /* SET GLOBAL might be executed concurrently. Sample the value once. */
+ ulint level= page_zip_level;
+#ifdef UNIV_DEBUG
+ const rec_t * const cursor_rec= page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
+
+ if (page_is_empty(cursor->block->frame))
+ {
+ ut_ad(page_cur_is_before_first(cursor));
+
+ /* This is an empty page. Recreate to remove the modification log. */
+ page_create_zip(cursor->block, index,
+ page_header_get_field(cursor->block->frame, PAGE_LEVEL),
+ 0, mtr);
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+
+ /* The cursor should remain on the page infimum. */
+ return nullptr;
+ }
+
+ if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame))
+ {
+ ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+ if (!page_zip_reorganize(cursor->block, index, level, mtr, true))
+ {
+ ut_ad(cursor->rec == cursor_rec);
+ return nullptr;
+ }
+
+ if (pos)
+ cursor->rec= page_rec_get_nth(cursor->block->frame, pos);
+ else
+ ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame));
+
+ ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+
+ if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
+ goto use_heap;
+ }
+
+ /* Try compressing the whole page afterwards. */
+ const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE);
+ rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets,
+ mtr);
+ mtr->set_log_mode(log_mode);
+
+ if (insert_rec)
+ {
+ ulint pos= page_rec_get_n_recs_before(insert_rec);
+ ut_ad(pos > 0);
+
+ /* We are writing entire page images to the log. Reduce the redo
+ log volume by reorganizing the page at the same time. */
+ if (page_zip_reorganize(cursor->block, index, level, mtr))
+ {
+ /* The page was reorganized: Seek to pos. */
+ cursor->rec= pos > 1
+ ? page_rec_get_nth(cursor->block->frame, pos - 1)
+ : cursor->block->frame + PAGE_NEW_INFIMUM;
+ insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1);
+ rec_offs_make_valid(insert_rec, index,
+ page_is_leaf(cursor->block->frame), offsets);
+ return insert_rec;
+ }
+
+    /* Theoretically, we could try one last resort of
+    page_zip_reorganize() followed by page_zip_available(), but that
+    would be very unlikely to succeed. (If compressing the fully
+    reorganized page failed, why would compressing the page plus
+    logging the insert of this record succeed?) */
+
+ /* Out of space: restore the page */
+ if (!page_zip_decompress(page_zip, cursor->block->frame, false))
+ ut_error; /* Memory corrupted? */
+ ut_ad(page_validate(cursor->block->frame, index));
+ insert_rec= nullptr;
+ }
+ return insert_rec;
+ }
+
+ free_rec= mach_read_from_2(page_free);
+ if (free_rec)
+ {
+ /* Try to allocate from the head of the free list. */
+ rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t *heap= nullptr;
+
+ rec_offs_init(foffsets_);
+
+ rec_offs *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index,
+ foffsets_,
+ page_is_leaf(cursor->block->frame)
+ ? index->n_core_fields : 0,
+ ULINT_UNDEFINED, &heap);
+ insert_buf= cursor->block->frame + free_rec -
+ rec_offs_extra_size(foffsets);
+
+ if (rec_offs_size(foffsets) < rec_size)
+ {
+too_small:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ free_rec= 0;
+ goto use_heap;
+ }
+
+ /* On compressed pages, do not relocate records from
+ the free list. If extra_size would grow, use the heap. */
+ const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) -
+ rec_offs_extra_size(foffsets));
+
+ if (UNIV_UNLIKELY(extra_size_diff < 0))
+ {
+ /* Add an offset to the extra_size. */
+ if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff))
+ goto too_small;
+
+ insert_buf-= extra_size_diff;
+ }
+ else if (UNIV_UNLIKELY(extra_size_diff))
+ /* Do not allow extra_size to grow */
+ goto too_small;
+
+ byte *const free_rec_ptr= cursor->block->frame + free_rec;
+ heap_no= rec_get_heap_no_new(free_rec_ptr);
+ int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT);
+    /* With innodb_page_size=64k, int16_t would be unsafe to use here,
+    but that page size cannot be used with ROW_FORMAT=COMPRESSED. */
+ static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
+ if (next_rec)
+ {
+ next_rec= static_cast<int16_t>(next_rec + free_rec);
+ ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec);
+ ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+ }
+
+ byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]);
+ mach_write_to_2(hdr, static_cast<uint16_t>(next_rec));
+ const byte *const garbage= my_assume_aligned<2>(page_free + 2);
+ ut_ad(mach_read_from_2(garbage) >= rec_size);
+ mach_write_to_2(my_assume_aligned<2>(hdr + 2),
+ mach_read_from_2(garbage) - rec_size);
+ static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
+ mtr->memcpy(*cursor->block, page_free, hdr, 4);
+
+ if (!page_is_leaf(cursor->block->frame))
+ {
+ /* Zero out the node pointer of free_rec, in case it will not be
+ overwritten by insert_rec. */
+ ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+ if (rec_offs_size(foffsets) > rec_size)
+ memset(rec_get_end(free_rec_ptr, foffsets) -
+ REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE);
+ }
+ else if (index->is_clust())
+ {
+ /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will
+ not be overwritten by insert_rec. */
+
+ ulint len;
+ ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs +
+ rec_offs_extra_size(foffsets) > rec_size)
+ memset(free_rec_ptr + trx_id_offs, 0,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN ==
+ rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(),
+ &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+ else
+ {
+use_heap:
+ ut_ad(!free_rec);
+ insert_buf= page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(!insert_buf))
+ return insert_buf;
+
+ static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+ mtr->memcpy(*cursor->block, PAGE_HEAP_TOP + PAGE_HEADER, 4);
+ page_zip_dir_add_slot(cursor->block, index, mtr);
+ }
+
+ /* 3. Create the record */
+ byte *insert_rec= rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame),
+ offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(cursor->rec != insert_rec);
+
+ /* next record after current before the insertion */
+ const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE);
+ ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+ mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - insert_rec));
+ mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
+ (insert_rec - cursor->rec));
+ byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ cursor->block->frame);
+ mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
+ memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2);
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+ MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+
+ /* 6. Update the last insertion info in page header */
+ byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data);
+ const uint16_t last_insert_rec= mach_read_from_2(last_insert);
+ ut_ad(!last_insert_rec ||
+ rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) ==
+ rec_get_node_ptr_flag(insert_rec));
+ mach_write_to_2(last_insert, page_offset(insert_rec));
+
+ if (!index->is_spatial())
+ {
+ byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B];
+ ut_ad(!(*dir & ~((1U << 3) - 1)));
+ byte *n= my_assume_aligned<2>
+ (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]);
+ if (UNIV_UNLIKELY(!last_insert_rec))
+ {
+no_direction:
+ *dir= PAGE_NO_DIRECTION;
+ memset(n, 0, 2);
+ }
+ else if (*dir != PAGE_LEFT &&
+ cursor->block->frame + last_insert_rec == cursor->rec)
+ {
+ *dir= PAGE_RIGHT;
+inc_dir:
+ mach_write_to_2(n, mach_read_from_2(n) + 1);
+ }
+ else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) ==
+ cursor->block->frame + last_insert_rec)
+ {
+ *dir= PAGE_LEFT;
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+  /* Write the header fields in one mini-transaction log record. */
+ mtr->memcpy(*cursor->block,
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ cursor->block->frame),
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
+ page_zip->data),
+ PAGE_N_RECS - PAGE_LAST_INSERT + 2);
+
+ /* 7. It remains to update the owner record. */
+ ulint n_owned;
+
+ while (!(n_owned = rec_get_n_owned_new(next_rec)))
+ next_rec= page_rec_get_next_low(next_rec, true);
+
+ rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
+ REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ page_zip_dir_insert(cursor, free_rec, insert_rec, mtr);
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_zip_dir_split_slot(cursor->block,
+ page_dir_find_owner_slot(next_rec), mtr);
+
+ page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
+ return insert_rec;
+}
+
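+/* The free-list reuse check in step 2 of page_cur_insert_rec_zip()
+above boils down to a simple rule, sketched here in isolation (plain
+C++ with invented names; a summary under the assumption that the
+record origin stays fixed, not the actual implementation): the head
+of PAGE_FREE may be reused only if the new record's header does not
+grow and its payload fits, because records must never be relocated
+on a ROW_FORMAT=COMPRESSED page. */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstddef>
+
+static bool toy_can_reuse_free_rec(size_t free_extra, size_t free_data,
+                                   size_t extra, size_t data)
+{
+  /* The header grows backwards from the record origin and the
+  payload grows forwards, so each part must fit on its own side. */
+  return extra <= free_extra && data <= free_data;
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+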
+/** Prepend a record to the PAGE_FREE list, or shrink PAGE_HEAP_TOP.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] data_size record payload size, in bytes
+@param[in] extra_size record header size, in bytes */
+static void page_mem_free(const buf_block_t &block, rec_t *rec,
+ size_t data_size, size_t extra_size)
+{
+ ut_ad(page_align(rec) == block.frame);
+ ut_ad(!block.page.zip.data);
+ const rec_t *free= page_header_get_ptr(block.frame, PAGE_FREE);
+
+ const uint16_t n_heap= uint16_t(page_header_get_field(block.frame,
+ PAGE_N_HEAP) - 1);
+ ut_ad(page_get_n_recs(block.frame) < (n_heap & 0x7fff));
+ const bool deleting_top= n_heap == ((n_heap & 0x8000)
+ ? (rec_get_heap_no_new(rec) | 0x8000)
+ : rec_get_heap_no_old(rec));
+
+ if (deleting_top)
+ {
+ byte *page_heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
+ block.frame);
+ const uint16_t heap_top= mach_read_from_2(page_heap_top);
+ const size_t extra_savings= heap_top - page_offset(rec + data_size);
+ ut_ad(extra_savings < heap_top);
+
+ /* When deleting the last record, do not add it to the PAGE_FREE list.
+ Instead, decrement PAGE_HEAP_TOP and PAGE_N_HEAP. */
+ mach_write_to_2(page_heap_top, page_offset(rec - extra_size));
+ mach_write_to_2(my_assume_aligned<2>(page_heap_top + 2), n_heap);
+ static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility");
+ if (extra_savings)
+ {
+ byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block.frame);
+ uint16_t garbage= mach_read_from_2(page_garbage);
+ ut_ad(garbage >= extra_savings);
+ mach_write_to_2(page_garbage, garbage - extra_savings);
+ }
+ }
+ else
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block.frame);
+ mach_write_to_2(page_free, page_offset(rec));
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) +
+ extra_size + data_size);
+ }
+
+ memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.frame, 0, 2);
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) - 1);
+
+ const byte* const end= rec + data_size;
+
+ if (!deleting_top)
+ {
+ uint16_t next= free
+ ? ((n_heap & 0x8000)
+ ? static_cast<uint16_t>(free - rec)
+ : static_cast<uint16_t>(free - block.frame))
+ : uint16_t{0};
+ mach_write_to_2(rec - REC_NEXT, next);
+ }
+ else
+ rec-= extra_size;
+
+ memset(rec, 0, end - rec);
+}
+
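+/* A minimal sketch of the two cases above (plain C++ with invented
+names; the real function updates raw bytes in the page header):
+freeing the topmost heap allocation shrinks the heap top, while
+freeing anything else prepends the hole to a free list and counts
+its bytes as garbage. */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstddef>
+# include <forward_list>
+
+struct toy_page_mem
+{
+  size_t heap_top= 0;                  /* like PAGE_HEAP_TOP */
+  size_t garbage= 0;                   /* like PAGE_GARBAGE */
+  std::forward_list<size_t> free_list; /* like the PAGE_FREE chain */
+};
+
+static void toy_mem_free(toy_page_mem &p, size_t offs, size_t size)
+{
+  if (offs + size == p.heap_top)
+    /* Deleting the topmost allocation: lower the heap top instead of
+    growing the free list, so the space can serve any later insert. */
+    p.heap_top= offs;
+  else
+  {
+    /* Prepend the hole to the free list; a later insert may reuse it
+    if its record is small enough. */
+    p.free_list.push_front(offs);
+    p.garbage+= size;
+  }
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+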
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the next
+record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_dir_slot_t* cur_dir_slot;
+ rec_t* current_rec;
+ rec_t* prev_rec = NULL;
+ rec_t* next_rec;
+ ulint cur_slot_no;
+ ulint cur_n_owned;
+ rec_t* rec;
+
+ /* page_zip_validate() will fail here when
+ btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
+ Then, both "page_zip" and "block->frame" would have the min-rec-mark
+ set on the smallest user record, but "block->frame" would additionally
+ have it set on the smallest-but-one record. Because sloppy
+ page_zip_validate_low() only ignores min-rec-flag differences
+ in the smallest user record, it cannot be used here either. */
+
+ current_rec = cursor->rec;
+ buf_block_t* const block = cursor->block;
+ ut_ad(rec_offs_validate(current_rec, index, offsets));
+ ut_ad(!!page_is_comp(block->frame) == index->table->not_redundant());
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame)
+ == index->id
+ || mtr->is_inside_ibuf());
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* The record must not be the supremum or infimum record. */
+ ut_ad(page_rec_is_user_rec(current_rec));
+
+ if (page_get_n_recs(block->frame) == 1
+ && !rec_is_alter_metadata(current_rec, *index)) {
+ /* Empty the page. */
+ ut_ad(page_is_leaf(block->frame));
+ /* Usually, this should be the root page,
+ and the whole index tree should become empty.
+ However, this could also be a call in
+ btr_cur_pessimistic_update() to delete the only
+ record in the page and to insert another one. */
+ page_cur_move_to_next(cursor);
+ ut_ad(page_cur_is_after_last(cursor));
+ page_create_empty(page_cur_get_block(cursor),
+ const_cast<dict_index_t*>(index), mtr);
+ return;
+ }
+
+ /* Save to local variables some data associated with current_rec */
+ cur_slot_no = page_dir_find_owner_slot(current_rec);
+ ut_ad(cur_slot_no > 0);
+ cur_dir_slot = page_dir_get_nth_slot(block->frame, cur_slot_no);
+ cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
+
+	/* The page becomes invalid for btr_pcur_restore_pos().
+ We avoid invoking buf_block_modify_clock_inc(block) because its
+ consistency checks would fail for the dummy block that is being
+ used during IMPORT TABLESPACE. */
+ block->modify_clock++;
+
+ /* Find the next and the previous record. Note that the cursor is
+ left at the next record. */
+
+ rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(cur_dir_slot + PAGE_DIR_SLOT_SIZE));
+
+ /* rec now points to the record of the previous directory slot. Look
+ for the immediate predecessor of current_rec in a loop. */
+
+ while (current_rec != rec) {
+ prev_rec = rec;
+ rec = page_rec_get_next(rec);
+ }
+
+ page_cur_move_to_next(cursor);
+ next_rec = cursor->rec;
+
+ /* Remove the record from the linked list of records */
+ /* If the deleted record is pointed to by a dir slot, update the
+ record pointer in slot. In the following if-clause we assume that
+ prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
+ >= 2. */
+ /* Update the number of owned records of the slot */
+
+ compile_time_assert(PAGE_DIR_SLOT_MIN_N_OWNED >= 2);
+ ut_ad(cur_n_owned > 1);
+
+ rec_t* slot_rec = const_cast<rec_t*>
+ (page_dir_slot_get_rec(cur_dir_slot));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ ut_ad(page_is_comp(block->frame));
+ if (current_rec == slot_rec) {
+ page_zip_rec_set_owned(block, prev_rec, 1, mtr);
+ page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+ slot_rec = prev_rec;
+ mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+ } else if (cur_n_owned == 1
+ && !page_rec_is_supremum(slot_rec)) {
+ page_zip_rec_set_owned(block, slot_rec, 0, mtr);
+ }
+
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - prev_rec));
+ slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+
+ page_header_reset_last_insert(block, mtr);
+ page_zip_dir_delete(block, rec, index, offsets,
+ page_header_get_ptr(block->frame,
+ PAGE_FREE),
+ mtr);
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ page_zip_dir_balance_slot(block, cur_slot_no, mtr);
+ }
+ return;
+ }
+
+ if (current_rec == slot_rec) {
+ slot_rec = prev_rec;
+ mach_write_to_2(cur_dir_slot, page_offset(slot_rec));
+ }
+
+ const size_t data_size = rec_offs_data_size(offsets);
+ const size_t extra_size = rec_offs_extra_size(offsets);
+
+ if (page_is_comp(block->frame)) {
+ mtr->page_delete(*block, page_offset(prev_rec)
+ - PAGE_NEW_INFIMUM,
+ extra_size - REC_N_NEW_EXTRA_BYTES,
+ data_size);
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (next_rec - prev_rec));
+ slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+ } else {
+ mtr->page_delete(*block, page_offset(prev_rec)
+ - PAGE_OLD_INFIMUM);
+ memcpy(prev_rec - REC_NEXT, current_rec - REC_NEXT, 2);
+ slot_rec[-REC_OLD_N_OWNED] = static_cast<byte>(
+ (slot_rec[-REC_OLD_N_OWNED] & ~REC_N_OWNED_MASK)
+ | (cur_n_owned - 1) << REC_N_OWNED_SHIFT);
+ }
+
+ page_mem_free(*block, current_rec, data_size, extra_size);
+
+ /* Now we have decremented the number of owned records of the slot.
+ If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
+ slots. */
+
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ page_dir_balance_slot(*block, cur_slot_no);
+ }
+
+ ut_ad(page_is_comp(block->frame)
+ ? page_simple_validate_new(block->frame)
+ : page_simple_validate_old(block->frame));
+}
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr encoded fixed-size header bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+ ulint prev, ulint enc_hdr,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+ block.frame);
+ const uint16_t h= mach_read_from_2(page_n_heap);
+ const page_id_t id(block.page.id());
+ if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW ||
+ h >= srv_page_size / REC_N_OLD_EXTRA_BYTES ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_OLD_SUPREMUM - REC_NEXT +
+ block.frame))))
+ {
+corrupted:
+ ib::error() << (reuse
+ ? "Not applying INSERT_REUSE_REDUNDANT"
+ " due to corruption on "
+ : "Not applying INSERT_HEAP_REDUNDANT"
+ " due to corruption on ")
+ << id;
+ return true;
+ }
+
+ byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const page_heap_top= my_assume_aligned<2>
+ (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+ const byte *const heap_bot= &block.frame[PAGE_OLD_SUPREMUM_END];
+ byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_OLD_SUPREMUM))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ PAGE_OLD_INFIMUM))
+ goto corrupted;
+ rec_t * const prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ if (!prev);
+ else if (UNIV_UNLIKELY(heap_bot + (REC_N_OLD_EXTRA_BYTES + 1) > prev_rec ||
+ prev_rec > heap_top))
+ goto corrupted;
+ const ulint pn_fields= rec_get_bit_field_2(prev_rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ if (UNIV_UNLIKELY(pn_fields == 0 || pn_fields > REC_MAX_N_FIELDS))
+ goto corrupted;
+ const ulint pextra_size= REC_N_OLD_EXTRA_BYTES +
+ (rec_get_1byte_offs_flag(prev_rec) ? pn_fields : pn_fields * 2);
+ if (prev_rec == &block.frame[PAGE_OLD_INFIMUM]);
+ else if (UNIV_UNLIKELY(prev_rec - pextra_size < heap_bot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(hdr_c && prev_rec - hdr_c < heap_bot))
+ goto corrupted;
+ const ulint pdata_size= rec_get_data_size_old(prev_rec);
+ if (UNIV_UNLIKELY(prev_rec + pdata_size > heap_top))
+ goto corrupted;
+ rec_t * const next_rec= block.frame + mach_read_from_2(prev_rec - REC_NEXT);
+ if (next_rec == block.frame + PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > next_rec ||
+ next_rec > heap_top))
+ goto corrupted;
+ const bool is_short= (enc_hdr >> 2) & 1;
+ const ulint n_fields= (enc_hdr >> 3) + 1;
+ if (UNIV_UNLIKELY(n_fields > REC_MAX_N_FIELDS))
+ goto corrupted;
+ const ulint extra_size= REC_N_OLD_EXTRA_BYTES +
+ (is_short ? n_fields : n_fields * 2);
+ hdr_c+= REC_N_OLD_EXTRA_BYTES;
+ if (UNIV_UNLIKELY(hdr_c > extra_size))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size - hdr_c > data_len))
+ goto corrupted;
+  /* We buffer all changes to the record header locally, so that
+  we avoid modifying the page before all consistency checks
+  have passed. */
+ alignas(2) byte insert_buf[REC_N_OLD_EXTRA_BYTES + REC_MAX_N_FIELDS * 2];
+
+ ulint n_owned;
+ rec_t *owner_rec= next_rec;
+ for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+ !(n_owned= rec_get_n_owned_old(owner_rec)); )
+ {
+ owner_rec= block.frame + mach_read_from_2(owner_rec - REC_NEXT);
+ if (owner_rec == &block.frame[PAGE_OLD_SUPREMUM]);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > owner_rec ||
+ owner_rec > heap_top))
+ goto corrupted;
+ if (!ns--)
+ goto corrupted; /* Corrupted (cyclic?) next-record list */
+ }
+
+ page_dir_slot_t *owner_slot= last_slot;
+
+ if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+ goto corrupted;
+ else
+ {
+ mach_write_to_2(insert_buf, owner_rec - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ const page_dir_slot_t * const first_slot=
+ page_dir_get_nth_slot(block.frame, 0);
+
+ while (memcmp_aligned<2>(owner_slot, insert_buf, 2))
+ if ((owner_slot+= 2) == first_slot)
+ goto corrupted;
+ }
+
+ memcpy(insert_buf, data, extra_size - hdr_c);
+ byte *insert_rec= &insert_buf[extra_size];
+ memcpy(insert_rec - hdr_c, prev_rec - hdr_c, hdr_c);
+ rec_set_bit_field_1(insert_rec, (enc_hdr & 3) << 4,
+ REC_OLD_INFO_BITS, REC_INFO_BITS_MASK,
+ REC_INFO_BITS_SHIFT);
+ rec_set_1byte_offs_flag(insert_rec, is_short);
+ rec_set_n_fields_old(insert_rec, n_fields);
+ rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ const ulint data_size= rec_get_data_size_old(insert_rec);
+ if (UNIV_UNLIKELY(data_c > data_size))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size - hdr_c + data_size - data_c != data_len))
+ goto corrupted;
+
+ /* Perform final consistency checks and then apply the change to the page. */
+ byte *buf;
+ if (reuse)
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > free_rec ||
+ free_rec > heap_top))
+ goto corrupted;
+ const ulint fn_fields= rec_get_n_fields_old(free_rec);
+ const ulint fextra_size= REC_N_OLD_EXTRA_BYTES +
+ (rec_get_1byte_offs_flag(free_rec) ? fn_fields : fn_fields * 2);
+ if (UNIV_UNLIKELY(free_rec - fextra_size < heap_bot))
+ goto corrupted;
+ const ulint fdata_size= rec_get_data_size_old(free_rec);
+ if (UNIV_UNLIKELY(free_rec + fdata_size > heap_top))
+ goto corrupted;
+ if (UNIV_UNLIKELY(extra_size + data_size > fextra_size + fdata_size))
+ goto corrupted;
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) <
+ fextra_size + fdata_size))
+ goto corrupted;
+ buf= free_rec - fextra_size;
+ const rec_t *const next_free= block.frame +
+ mach_read_from_2(free_rec - REC_NEXT);
+ if (next_free == block.frame);
+ else if (UNIV_UNLIKELY(next_free < &heap_bot[REC_N_OLD_EXTRA_BYTES + 1] ||
+ heap_top < next_free))
+ goto corrupted;
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+ extra_size - data_size);
+ rec_set_bit_field_2(insert_rec, rec_get_heap_no_old(free_rec),
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ memcpy(page_free, free_rec - REC_NEXT, 2);
+ }
+ else
+ {
+ if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+ goto corrupted;
+ rec_set_bit_field_2(insert_rec, h,
+ REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+ mach_write_to_2(page_n_heap, h + 1);
+ mach_write_to_2(page_heap_top,
+ mach_read_from_2(page_heap_top) + extra_size + data_size);
+ buf= heap_top;
+ }
+
+ ut_ad(data_size - data_c == data_len - (extra_size - hdr_c));
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block.frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ memcpy(buf, insert_buf, extra_size);
+ buf+= extra_size;
+ mach_write_to_2(page_last_insert, buf - block.frame);
+ memcpy(prev_rec - REC_NEXT, page_last_insert, 2);
+ memcpy(buf, prev_rec, data_c);
+ memcpy(buf + data_c, static_cast<const byte*>(data) + (extra_size - hdr_c),
+ data_len - (extra_size - hdr_c));
+ rec_set_bit_field_1(owner_rec, n_owned + 1, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n_dir= my_assume_aligned<2>
+ (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n_dir, 0, 2);
+ }
+ else if (block.frame + last_insert == prev_rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+ }
+ else if (next_rec == block.frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_old(block.frame));
+ return false;
+}
+
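+/* The hdr_c/data_c parameters of the apply functions implement a
+simple prefix compression: the log stores only how many bytes the new
+record shares with its list predecessor, plus the remaining literal
+bytes. A minimal sketch of the reconstruction (plain C++ with
+invented names; assumes prev_rec.size() >= common and common +
+literal_len > 0): */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstring>
+# include <vector>
+
+static std::vector<unsigned char>
+toy_apply_insert(const std::vector<unsigned char> &prev_rec,
+                 size_t common, const unsigned char *literal,
+                 size_t literal_len)
+{
+  std::vector<unsigned char> rec(common + literal_len);
+  /* Copy the prefix shared with the predecessor record... */
+  memcpy(rec.data(), prev_rec.data(), common);
+  /* ...and append the differing bytes from the log payload. */
+  memcpy(rec.data() + common, literal, literal_len);
+  return rec;
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+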
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift only if reuse: number of bytes the PAGE_FREE record is moving
+@param enc_hdr_l number of copied record header bytes, plus record type bits
+@param hdr_c number of common record header bytes with prev
+@param data_c number of common data bytes with prev
+@param data literal header and data bytes
+@param data_len length of the literal data, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+ ulint prev, ulint shift, ulint enc_hdr_l,
+ size_t hdr_c, size_t data_c,
+ const void *data, size_t data_len)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
+ block.frame);
+ ulint h= mach_read_from_2(page_n_heap);
+ const page_id_t id(block.page.id());
+ if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) ||
+ (h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES ||
+ (h & 0x7fff) < n_slots ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_NEW_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ ((enc_hdr_l & REC_STATUS_INSTANT) &&
+ !page_is_leaf(block.frame)) ||
+ (enc_hdr_l >> 3) > data_len))
+ {
+corrupted:
+ ib::error() << (reuse
+ ? "Not applying INSERT_REUSE_DYNAMIC"
+ " due to corruption on "
+ : "Not applying INSERT_HEAP_DYNAMIC"
+ " due to corruption on ")
+ << id;
+ return true;
+ }
+
+ byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const page_heap_top= my_assume_aligned<2>
+ (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
+ const byte *const heap_bot= &block.frame[PAGE_NEW_SUPREMUM_END];
+ byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_NEW_SUPREMUM))
+ goto corrupted;
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ PAGE_NEW_INFIMUM))
+ goto corrupted;
+
+ uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+ rec_t *prev_rec= block.frame + n;
+ n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+ if (!prev);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > prev_rec ||
+ prev_rec > heap_top))
+ goto corrupted;
+
+ rec_t * const next_rec= block.frame + n;
+ if (next_rec == block.frame + PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > next_rec ||
+ next_rec > heap_top))
+ goto corrupted;
+
+ ulint n_owned;
+ rec_t *owner_rec= next_rec;
+ n= static_cast<uint16_t>(next_rec - block.frame);
+
+ for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
+ !(n_owned= rec_get_n_owned_new(owner_rec)); )
+ {
+ n= static_cast<uint16_t>(n + mach_read_from_2(owner_rec - REC_NEXT));
+ owner_rec= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > owner_rec ||
+ owner_rec > heap_top))
+ goto corrupted;
+ if (!ns--)
+ goto corrupted; /* Corrupted (cyclic?) next-record list */
+ }
+
+ page_dir_slot_t* owner_slot= last_slot;
+
+ if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)
+ goto corrupted;
+ else
+ {
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+ alignas(2) byte slot_buf[2];
+ mach_write_to_2(slot_buf, owner_rec - block.frame);
+ const page_dir_slot_t * const first_slot=
+ page_dir_get_nth_slot(block.frame, 0);
+
+ while (memcmp_aligned<2>(owner_slot, slot_buf, 2))
+ if ((owner_slot+= 2) == first_slot)
+ goto corrupted;
+ }
+
+ const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_c + (enc_hdr_l >> 3);
+ const ulint data_size= data_c + data_len - (enc_hdr_l >> 3);
+
+ /* Perform final consistency checks and then apply the change to the page. */
+ byte *buf;
+ if (reuse)
+ {
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block.frame);
+ rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > free_rec ||
+ free_rec > heap_top))
+ goto corrupted;
+ buf= free_rec - extra_size;
+ if (shift & 1)
+ buf-= shift >> 1;
+ else
+ buf+= shift >> 1;
+
+ if (UNIV_UNLIKELY(heap_bot > buf ||
+ &buf[extra_size + data_size] > heap_top))
+ goto corrupted;
+ byte *page_garbage= my_assume_aligned<2>(page_free + 2);
+ if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) < extra_size + data_size))
+ goto corrupted;
+ if ((n= mach_read_from_2(free_rec - REC_NEXT)) != 0)
+ {
+ n= static_cast<uint16_t>(n + free_rec - block.frame);
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ heap_top < block.frame + n))
+ goto corrupted;
+ }
+ mach_write_to_2(page_free, n);
+ mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) -
+ (extra_size + data_size));
+ h= rec_get_heap_no_new(free_rec);
+ }
+ else
+ {
+ if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot))
+ goto corrupted;
+ mach_write_to_2(page_n_heap, h + 1);
+ h&= 0x7fff;
+ mach_write_to_2(page_heap_top,
+ mach_read_from_2(page_heap_top) + extra_size + data_size);
+ buf= heap_top;
+ }
+
+ memcpy(buf, data, (enc_hdr_l >> 3));
+ buf+= enc_hdr_l >> 3;
+ data_len-= enc_hdr_l >> 3;
+ data= &static_cast<const byte*>(data)[enc_hdr_l >> 3];
+
+ memcpy(buf, prev_rec - REC_N_NEW_EXTRA_BYTES - hdr_c, hdr_c);
+ buf+= hdr_c;
+ *buf++= static_cast<byte>((enc_hdr_l & 3) << 4); /* info_bits; n_owned=0 */
+ *buf++= static_cast<byte>(h >> 5); /* MSB of heap number */
+ h= (h & ((1U << 5) - 1)) << 3;
+ static_assert(REC_STATUS_ORDINARY == 0, "compatibility");
+ static_assert(REC_STATUS_INSTANT == 4, "compatibility");
+ if (page_is_leaf(block.frame))
+ h|= enc_hdr_l & REC_STATUS_INSTANT;
+ else
+ {
+ ut_ad(!(enc_hdr_l & REC_STATUS_INSTANT)); /* Checked at the start */
+ h|= REC_STATUS_NODE_PTR;
+ }
+ *buf++= static_cast<byte>(h); /* LSB of heap number, and status */
+ static_assert(REC_NEXT == 2, "compatibility");
+ buf+= REC_NEXT;
+ mach_write_to_2(buf - REC_NEXT, static_cast<uint16_t>(next_rec - buf));
+ byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
+ block.frame);
+ const uint16_t last_insert= mach_read_from_2(page_last_insert);
+ mach_write_to_2(page_last_insert, buf - block.frame);
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(buf - prev_rec));
+ memcpy(buf, prev_rec, data_c);
+ buf+= data_c;
+ memcpy(buf, data, data_len);
+
+ rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+ /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
+ if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ {
+ byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n_dir= my_assume_aligned<2>
+ (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ if (UNIV_UNLIKELY(!last_insert))
+ {
+no_direction:
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
+ memset(n_dir, 0, 2);
+ }
+ else if (block.frame + last_insert == prev_rec &&
+ (*dir & ((1U << 3) - 1)) != PAGE_LEFT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
+inc_dir:
+ mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
+ }
+ else if (next_rec == block.frame + last_insert &&
+ (*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
+ {
+ *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
+ goto inc_dir;
+ }
+ else
+ goto no_direction;
+ }
+
+ /* Update PAGE_N_RECS. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block.frame);
+
+ mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
+ page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_new(block.frame));
+ return false;
+}
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ ulint n_recs= page_get_n_recs(block.frame);
+ const page_id_t id(block.page.id());
+
+ if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_OLD_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ page_is_comp(block.frame)))
+ {
+corrupted:
+ ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT"
+ " due to corruption on " << id;
+ return true;
+ }
+
+ byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ rec_t *prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ if (UNIV_UNLIKELY(prev_rec > slot))
+ goto corrupted;
+ uint16_t n= mach_read_from_2(prev_rec - REC_NEXT);
+ rec_t *rec= block.frame + n;
+ if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < rec))
+ goto corrupted;
+ const ulint extra_size= REC_N_OLD_EXTRA_BYTES + rec_get_n_fields_old(rec) *
+ (rec_get_1byte_offs_flag(rec) ? 1 : 2);
+ const ulint data_size= rec_get_data_size_old(rec);
+ if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + extra_size ||
+ slot < rec + data_size))
+ goto corrupted;
+
+ n= mach_read_from_2(rec - REC_NEXT);
+ rec_t *next= block.frame + n;
+ if (n == PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < next))
+ goto corrupted;
+
+ rec_t *s= rec;
+ ulint slot_owned;
+ for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_old(s)); )
+ {
+ n= mach_read_from_2(s - REC_NEXT);
+ s= block.frame + n;
+ if (n == PAGE_OLD_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
+ slot < s))
+ goto corrupted;
+ if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+ goto corrupted;
+ }
+ slot_owned--;
+
+ /* The first slot is always pointing to the infimum record.
+ Find the directory slot pointing to s. */
+ const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ alignas(2) byte slot_offs[2];
+ mach_write_to_2(slot_offs, s - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+
+ while (memcmp_aligned<2>(slot, slot_offs, 2))
+ if ((slot+= 2) == first_slot)
+ goto corrupted;
+
+ if (rec == s)
+ {
+ s= prev_rec;
+ mach_write_to_2(slot, s - block.frame);
+ }
+
+ memcpy(prev_rec - REC_NEXT, rec - REC_NEXT, 2);
+ s-= REC_OLD_N_OWNED;
+ *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+ slot_owned << REC_N_OWNED_SHIFT);
+ page_mem_free(block, rec, data_size, extra_size);
+
+ if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+ page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+ ut_ad(page_simple_validate_old(block.frame));
+ return false;
+}
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size data payload size, in bytes
+@return whether the operation failed (inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
+ size_t hdr_size, size_t data_size)
+{
+ const uint16_t n_slots= page_dir_get_n_slots(block.frame);
+ ulint n_recs= page_get_n_recs(block.frame);
+ const page_id_t id(block.page.id());
+
+ if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
+ !fil_page_index_page_check(block.frame) ||
+ page_get_page_no(block.frame) != id.page_no() ||
+ mach_read_from_2(my_assume_aligned<2>
+ (PAGE_NEW_SUPREMUM - REC_NEXT +
+ block.frame)) ||
+ !page_is_comp(block.frame)))
+ {
+corrupted:
+ ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC"
+ " due to corruption on " << id;
+ return true;
+ }
+
+ byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
+ rec_t *prev_rec= block.frame + n;
+ if (UNIV_UNLIKELY(prev_rec > slot))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
+ rec_t *rec= block.frame + n;
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < rec))
+ goto corrupted;
+ const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_size;
+ if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + extra_size ||
+ slot < rec + data_size))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + mach_read_from_2(rec - REC_NEXT));
+ rec_t *next= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < next))
+ goto corrupted;
+
+ rec_t *s= rec;
+ n= static_cast<uint16_t>(rec - block.frame);
+ ulint slot_owned;
+ for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_new(s)); )
+ {
+ const uint16_t next= mach_read_from_2(s - REC_NEXT);
+ if (UNIV_UNLIKELY(next < REC_N_NEW_EXTRA_BYTES ||
+ next > static_cast<uint16_t>(-REC_N_NEW_EXTRA_BYTES)))
+ goto corrupted;
+ n= static_cast<uint16_t>(n + next);
+ s= block.frame + n;
+ if (n == PAGE_NEW_SUPREMUM);
+ else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
+ slot < s))
+ goto corrupted;
+ if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */
+ goto corrupted;
+ }
+ slot_owned--;
+
+ /* The first slot is always pointing to the infimum record.
+ Find the directory slot pointing to s. */
+ const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ alignas(2) byte slot_offs[2];
+ mach_write_to_2(slot_offs, s - block.frame);
+ static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+
+ while (memcmp_aligned<2>(slot, slot_offs, 2))
+ if ((slot+= 2) == first_slot)
+ goto corrupted;
+
+ if (rec == s)
+ {
+ s= prev_rec;
+ mach_write_to_2(slot, s - block.frame);
+ }
+
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(next - prev_rec));
+ s-= REC_NEW_N_OWNED;
+ *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) |
+ slot_owned << REC_N_OWNED_SHIFT);
+ page_mem_free(block, rec, data_size, extra_size);
+
+ if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
+ page_dir_balance_slot(block, (first_slot - slot) / 2);
+
+ ut_ad(page_simple_validate_new(block.frame));
+ return false;
+}
+
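+/* Both apply functions above guard their next-record walks with the
+record count: instead of detecting a cycle explicitly, they follow at
+most n_recs links and treat running out of budget as corruption. The
+same idea in isolation (plain C++ with invented names): */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstddef>
+
+static bool toy_walk_is_corrupted(const size_t *next, size_t pos,
+                                  size_t end, size_t n_recs)
+{
+  for (size_t budget= n_recs; pos != end; pos= next[pos])
+    if (!budget--)
+      return true; /* cyclic or overlong chain */
+  return false;
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+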
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Print the first n numbers generated by ut_rnd_gen() to make sure
+(visually) that it works properly. */
+void
+test_ut_rnd_gen(
+ int n) /*!< in: print first n numbers */
+{
+ int i;
+ unsigned long long rnd;
+
+ for (i = 0; i < n; i++) {
+ rnd = ut_rnd_gen();
+ printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n",
+ rnd,
+ rnd % 2,
+ rnd % 3,
+ rnd % 5,
+ rnd % 7,
+ rnd % 11);
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
new file mode 100644
index 00000000..9b83470e
--- /dev/null
+++ b/storage/innobase/page/page0page.cc
@@ -0,0 +1,2499 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0page.cc
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "btr0btr.h"
+#include "srv0srv.h"
+#include "lock0lock.h"
+#include "fut0lst.h"
+#include "btr0sea.h"
+#include "trx0sys.h"
+#include <algorithm>
+
+/* THE INDEX PAGE
+ ==============
+
+The index page consists of a page header which contains the page's
+id and other information. After the header come the index records,
+stored in a heap and linked into a one-way linear list in alphabetical
+order.
+
+Just below the page end is an array of pointers, which we call the page
+directory, pointing to about every sixth record in the list. The pointers
+are placed in the directory in the alphabetical order of the records
+pointed to, enabling us to do a binary search over the array. Each slot I
+in the directory points to a record, in which a 4-bit field contains a
+count of those records which are in the linear list between pointer I and
+pointer I - 1 in the directory, including the record pointed to by
+pointer I and excluding the record pointed to by I - 1. We say that the
+record pointed to by slot I, or slot I itself, owns these records. The
+count is always kept in the range 4 to 8, with the exception that it is
+1 for the first slot, and 1--8 for the second slot.
+
+An essentially binary search can be performed in the list of index
+records, as we could do if we had a pointer to every record in the
+page directory. The data structure is, however, more efficient for
+inserts, because most inserts are just pushed onto a heap. Only every
+8th insert requires a block move in the directory pointer table, which
+itself is quite small. A record is deleted from the page by just taking
+it off the linear list, updating the number-of-owned-records field of
+the record which owns it, and updating the page directory if necessary.
+A special case is when a record owns itself. Because the overhead of
+inserts is so small, we may also increase the page size from the
+projected default of 8 kB to 64 kB without too much loss of insert
+efficiency. Bigger pages become practical as the disk transfer rate
+rises relative to seek and latency times. On the present system, the
+page size is set so that the page transfer time (3 ms) is 20 % of the
+disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 4 bytes = 200 bytes. */
+
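+/* To make the directory idea above concrete, here is a minimal,
+self-contained sketch (plain C++ with invented names, not the real
+on-page byte layout): records live in a sorted array, dir[i] is the
+index of the record owning group i, and a lookup binary-searches the
+sparse directory before linearly scanning the one owned group of at
+most PAGE_DIR_SLOT_MAX_N_OWNED records. */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstddef>
+# include <vector>
+
+struct toy_page_dir {
+	std::vector<int>	recs;	/* sorted "user records" */
+	std::vector<size_t>	dir;	/* index of each group owner;
+					the last owner is the last record */
+};
+
+/** Return the index of the smallest record >= key. */
+static size_t toy_page_dir_search(const toy_page_dir& p, int key)
+{
+	/* Binary search over the sparse directory: find the first
+	slot whose owner record is >= key. */
+	size_t lo = 0, hi = p.dir.size();
+	while (lo < hi) {
+		size_t mid = (lo + hi) / 2;
+		if (p.recs[p.dir[mid]] < key) {
+			lo = mid + 1;
+		} else {
+			hi = mid;
+		}
+	}
+	if (lo == p.dir.size()) {
+		return p.recs.size();	/* greater than every record */
+	}
+	/* Linear scan within the owned group (at most 8 records). */
+	size_t i = lo ? p.dir[lo - 1] + 1 : 0;
+	while (p.recs[i] < key) {
+		i++;
+	}
+	return i;
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+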
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ const page_t* page = page_align(rec);
+ const page_dir_slot_t* first_slot = page_dir_get_nth_slot(page, 0);
+ const page_dir_slot_t* slot = page_dir_get_nth_slot(
+ page, ulint(page_dir_get_n_slots(page)) - 1);
+ const rec_t* r = rec;
+
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(r) == 0) {
+ r = rec_get_next_ptr_const(r, TRUE);
+ ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+ ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ }
+ } else {
+ while (rec_get_n_owned_old(r) == 0) {
+ r = rec_get_next_ptr_const(r, FALSE);
+ ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+ ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ }
+ }
+
+ uint16 rec_offs_bytes = mach_encode_2(ulint(r - page));
+
+ while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+ if (UNIV_UNLIKELY(slot == first_slot)) {
+ ib::error() << "Probable data corruption on page "
+ << page_get_page_no(page)
+ << ". Original record on that page;";
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, rec);
+ }
+
+ ib::error() << "Cannot find the dir slot for this"
+ " record on that page;";
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, page
+ + mach_decode_2(rec_offs_bytes));
+ }
+
+ ut_error;
+ }
+
+ slot += PAGE_DIR_SLOT_SIZE;
+ }
+
+ return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/**************************************************************//**
+Used to check the consistency of a directory slot.
+@return TRUE on success */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+ const page_dir_slot_t* slot) /*!< in: slot */
+{
+ const page_t* page;
+ ulint n_slots;
+ ulint n_owned;
+
+ ut_a(slot);
+
+ page = page_align(slot);
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_a(slot <= page_dir_get_nth_slot(page, 0));
+ ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+ ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+ } else {
+ n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+ }
+
+ if (slot == page_dir_get_nth_slot(page, 0)) {
+ ut_a(n_owned == 1);
+ } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+ ut_a(n_owned >= 1);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ } else {
+ ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */
+{
+ ut_ad(!mtr || mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_zip || page_zip == &block->page.zip);
+ static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+ byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
+ PAGE_HEADER + block->frame);
+
+ mtr->write<8>(*block, max_trx_id, trx_id);
+ if (UNIV_LIKELY_NULL(page_zip))
+ memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER],
+ max_trx_id, 8);
+}
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out] block clustered index root page
+@param[in] autoinc next available AUTO_INCREMENT value
+@param[in,out] mtr mini-transaction
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+page_set_autoinc(
+ buf_block_t* block,
+ ib_uint64_t autoinc,
+ mtr_t* mtr,
+ bool reset)
+{
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
+
+ byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC +
+ block->frame);
+ ib_uint64_t old= mach_read_from_8(field);
+ if (old == autoinc || (old > autoinc && !reset))
+ return; /* nothing to update */
+
+ mtr->write<8>(*block, field, autoinc);
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data,
+ field, 8);
+}
+
+/** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */
+static const byte infimum_supremum_redundant[] = {
+ /* the infimum record */
+ 0x08/*end offset*/,
+ 0x01/*n_owned*/,
+ 0x00, 0x00/*heap_no=0*/,
+ 0x03/*n_fields=1, 1-byte offsets*/,
+ 0x00, 0x74/* pointer to supremum */,
+ 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0,
+ /* the supremum record */
+ 0x09/*end offset*/,
+ 0x01/*n_owned*/,
+ 0x00, 0x08/*heap_no=1*/,
+ 0x03/*n_fields=1, 1-byte offsets*/,
+ 0x00, 0x00/* end of record list */,
+ 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm', 0
+};
+
+/** The page infimum and supremum of an empty page in ROW_FORMAT=COMPACT */
+static const byte infimum_supremum_compact[] = {
+ /* the infimum record */
+ 0x01/*n_owned=1*/,
+ 0x00, 0x02/* heap_no=0, REC_STATUS_INFIMUM */,
+ 0x00, 0x0d/* pointer to supremum */,
+ 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0,
+ /* the supremum record */
+ 0x01/*n_owned=1*/,
+ 0x00, 0x0b/* heap_no=1, REC_STATUS_SUPREMUM */,
+ 0x00, 0x00/* end of record list */,
+ 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm'
+};
+
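+/* The heap_no/status annotations in the byte arrays above follow the
+compact record header layout: a 16-bit field holding the heap number
+in the upper 13 bits and the record status in the low 3 bits. A small
+sketch to decode the two bytes (assuming only that bit split):
+0x0002 -> heap_no=0, status=2 (infimum);
+0x000b -> heap_no=1, status=3 (supremum). */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+# include <cstdio>
+
+static void toy_decode_compact_hdr(unsigned b0, unsigned b1)
+{
+	unsigned v = (b0 << 8) | b1;
+	printf("heap_no=%u status=%u\n", v >> 3, v & 7);
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+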
+/** Create an index page.
+@param[in,out] block buffer block
+@param[in] comp nonzero=compact page format */
+void page_create_low(const buf_block_t* block, bool comp)
+{
+ page_t* page;
+
+ compile_time_assert(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE
+ <= PAGE_DATA);
+ compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE
+ <= PAGE_DATA);
+
+ page = buf_block_get_frame(block);
+
+ fil_page_set_type(page, FIL_PAGE_INDEX);
+
+ memset(page + PAGE_HEADER, 0, PAGE_HEADER_PRIV_END);
+ page[PAGE_HEADER + PAGE_N_DIR_SLOTS + 1] = 2;
+ page[PAGE_HEADER + PAGE_INSTANT] = 0;
+ page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION;
+
+ if (comp) {
+ page[PAGE_HEADER + PAGE_N_HEAP] = 0x80;/*page_is_comp()*/
+ page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW;
+ page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_NEW_SUPREMUM_END;
+ memcpy(page + PAGE_DATA, infimum_supremum_compact,
+ sizeof infimum_supremum_compact);
+ memset(page
+ + PAGE_NEW_SUPREMUM_END, 0,
+ srv_page_size - PAGE_DIR - PAGE_NEW_SUPREMUM_END);
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1]
+ = PAGE_NEW_SUPREMUM;
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1]
+ = PAGE_NEW_INFIMUM;
+ } else {
+ page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW;
+ page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_OLD_SUPREMUM_END;
+ memcpy(page + PAGE_DATA, infimum_supremum_redundant,
+ sizeof infimum_supremum_redundant);
+ memset(page
+ + PAGE_OLD_SUPREMUM_END, 0,
+ srv_page_size - PAGE_DIR - PAGE_OLD_SUPREMUM_END);
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1]
+ = PAGE_OLD_SUPREMUM;
+ page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1]
+ = PAGE_OLD_INFIMUM;
+ }
+}
+
+/** Create an uncompressed index page.
+@param[in,out] block buffer block
+@param[in,out] mtr mini-transaction
+@param[in] comp set unless ROW_FORMAT=REDUNDANT */
+void page_create(buf_block_t *block, mtr_t *mtr, bool comp)
+{
+ mtr->page_create(*block, comp);
+ buf_block_modify_clock_inc(block);
+ page_create_low(block, comp);
+}
+
+/**********************************************************//**
+Create a compressed B-tree index page. */
+void
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame
+ where the page is created */
+ dict_index_t* index, /*!< in: the index of the
+ page */
+ ulint level, /*!< in: the B-tree level
+ of the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr) /*!< in/out: mini-transaction
+ handle */
+{
+ ut_ad(block);
+ ut_ad(buf_block_get_page_zip(block));
+ ut_ad(dict_table_is_comp(index->table));
+
+ /* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for
+ temporary tables. */
+ ut_ad(max_trx_id == 0 || !index->table->is_temporary());
+ /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID
+ must be zero on non-leaf pages. max_trx_id can be 0 when the
+ index consists of an empty root (leaf) page. */
+ ut_ad(max_trx_id == 0
+ || level == 0
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary());
+ /* In the clustered index, PAGE_ROOT_AUTOINC or
+ PAGE_MAX_TRX_ID must be 0 on other pages than the root. */
+ ut_ad(level == 0 || max_trx_id == 0
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary());
+
+ buf_block_modify_clock_inc(block);
+ page_create_low(block, true);
+
+ if (index->is_spatial()) {
+ mach_write_to_2(FIL_PAGE_TYPE + block->frame, FIL_PAGE_RTREE);
+ memset(block->frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+ memset(block->page.zip.data + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+ }
+
+ mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->frame, level);
+ mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->frame,
+ max_trx_id);
+
+ if (!page_zip_compress(block, index, page_zip_level, mtr)) {
+ /* The compression of a newly created
+ page should always succeed. */
+ ut_error;
+ }
+}
+
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_id_t max_trx_id;
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(!index->is_dummy);
+ ut_ad(block->page.id().space() == index->table->space->id);
+
+	/* Multiple transactions cannot operate on the same
+	temp-table in parallel.
+	max_trx_id is ignored for temp tables because it is not
+	required for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()
+ && page_is_leaf(block->frame)) {
+ max_trx_id = page_get_max_trx_id(block->frame);
+ ut_ad(max_trx_id);
+ } else if (block->page.id().page_no() == index->page) {
+ /* Preserve PAGE_ROOT_AUTO_INC. */
+ max_trx_id = page_get_max_trx_id(block->frame);
+ } else {
+ max_trx_id = 0;
+ }
+
+ if (page_zip) {
+ ut_ad(!index->table->is_temporary());
+ page_create_zip(block, index,
+ page_header_get_field(block->frame,
+ PAGE_LEVEL),
+ max_trx_id, mtr);
+ } else {
+ page_create(block, mtr, index->table->not_redundant());
+ if (index->is_spatial()) {
+ static_assert(((FIL_PAGE_INDEX & 0xff00)
+ | byte(FIL_PAGE_RTREE))
+ == FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->frame
+ + FIL_RTREE_SPLIT_SEQ_NUM)) {
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+ 8, 0);
+ }
+ }
+
+ if (max_trx_id) {
+ mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+ + block->frame, max_trx_id);
+ }
+ }
+}
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end() in that this function does not
+touch the lock table, does not update the max trx id on the page, and
+does not compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_cur_t cur1;
+ page_cur_t cur2;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ btr_assert_not_corrupted(new_block, index);
+ ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+ ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint)
+ (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+ const ulint n_core = page_is_leaf(block->frame)
+ ? index->n_core_fields : 0;
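+	/* On leaf pages, records may carry instantly added columns,
+	so rec_get_offsets() must know index->n_core_fields; node
+	pointer records are always parsed with n_core = 0. */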
+
+ page_cur_set_before_first(new_block, &cur2);
+
+ /* Copy records from the original page to the new page */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ rec_t* ins_rec;
+ offsets = rec_get_offsets(cur1.rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ins_rec = page_cur_insert_rec_low(&cur2, index,
+ cur1.rec, offsets, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ ib::fatal() << "Rec offset " << page_offset(rec)
+ << ", cur1 offset " << page_offset(cur1.rec)
+ << ", cur2 offset " << page_offset(cur2.rec);
+ }
+
+ page_cur_move_to_next(&cur1);
+ ut_ad(!(rec_get_info_bits(cur1.rec, page_is_comp(new_page))
+ & REC_INFO_MIN_REC_FLAG));
+ cur2.rec = ins_rec;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*************************************************************//**
+Copies records from page to new_page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_t* page = block->frame;
+ rec_t* ret = page_rec_get_next(
+ page_get_infimum_rec(new_page));
+ ulint num_moved = 0;
+ rtr_rec_move_t* rec_move = NULL;
+ mem_heap_t* heap = NULL;
+ ut_ad(page_align(rec) == page);
+
+#ifdef UNIV_ZIP_DEBUG
+ if (new_page_zip) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+
+ /* Strict page_zip_validate() may fail here.
+ Furthermore, btr_compress() may set FIL_PAGE_PREV to
+ FIL_NULL on new_page while leaving it intact on
+ new_page_zip. So, we cannot validate new_page_zip. */
+ ut_a(page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(buf_block_get_frame(block) == page);
+ ut_ad(page_is_leaf(page) == page_is_leaf(new_page));
+ ut_ad(page_is_comp(page) == page_is_comp(new_page));
+ /* Here, "ret" may be pointing to a user record or the
+ predefined supremum record. */
+
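+	/* While copying onto a ROW_FORMAT=COMPRESSED page, redo
+	logging is disabled; the page_zip_compress() or
+	page_zip_reorganize() call below will log the changes
+	instead. */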
+ const mtr_log_t log_mode = new_page_zip
+ ? mtr->set_log_mode(MTR_LOG_NONE) : MTR_LOG_NONE;
+ const bool was_empty = page_dir_get_n_heap(new_page)
+ == PAGE_HEAP_NO_USER_LOW;
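+	/* Save the PAGE_LAST_INSERT..PAGE_N_DIRECTION header fields
+	of new_page. The record inserts performed while copying would
+	overwrite them; if new_page was empty, the saved bytes are
+	restored below, so that the bulk copy does not leave behind a
+	misleading last-insert position or insert direction. */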
+ alignas(2) byte h[PAGE_N_DIRECTION + 2 - PAGE_LAST_INSERT];
+ memcpy_aligned<2>(h, PAGE_HEADER + PAGE_LAST_INSERT + new_page,
+ sizeof h);
+
+ if (index->is_spatial()) {
+ ulint max_to_move = page_get_n_recs(
+ buf_block_get_frame(block));
+ heap = mem_heap_create(256);
+
+ rec_move = static_cast<rtr_rec_move_t*>(
+ mem_heap_alloc(heap, max_to_move * sizeof *rec_move));
+
+ /* For spatial index, we need to insert recs one by one
+ to keep recs ordered. */
+ rtr_page_copy_rec_list_end_no_locks(new_block,
+ block, rec, index,
+ heap, rec_move,
+ max_to_move,
+ &num_moved,
+ mtr);
+ } else {
+ page_copy_rec_list_end_no_locks(new_block, block, rec,
+ index, mtr);
+ if (was_empty) {
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+ + PAGE_LAST_INSERT
+ + new_page, h, sizeof h);
+ }
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below.
+ Multiple transactions cannot simultaneously operate on the
+ same temp-table in parallel.
+	max_trx_id is ignored for temp tables because it is not
+	required for MVCC. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page)
+ && !index->table->is_temporary()) {
+ ut_ad(!was_empty || page_dir_get_n_heap(new_page)
+ == PAGE_HEAP_NO_USER_LOW
+ + page_header_get_field(new_page, PAGE_N_RECS));
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(new_block, index,
+ page_zip_level, mtr)) {
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ulint ret_pos
+ = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the successor of
+ the predefined infimum record. It must still
+ have at least one predecessor (the predefined
+ infimum record, or a freshly copied record
+ that is smaller than "ret"). */
+ ut_a(ret_pos > 0);
+
+ if (!page_zip_reorganize(new_block, index,
+ page_zip_level, mtr)) {
+
+ if (!page_zip_decompress(new_page_zip,
+ new_page, FALSE)) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ return(NULL);
+ } else {
+ /* The page was reorganized:
+ Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ if (dict_table_is_locking_disabled(index->table)) {
+ } else if (rec_move && dict_index_is_spatial(index)) {
+ lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
+ } else {
+ lock_move_rec_list_end(new_block, block, rec);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ btr_search_move_or_delete_hash_entries(new_block, block);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Copies records from page to new_page, up to the given record,
+NOT including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_cur_t cur1;
+ page_cur_t cur2;
+ mem_heap_t* heap = NULL;
+ ulint num_moved = 0;
+ rtr_rec_move_t* rec_move = NULL;
+ rec_t* ret
+ = page_rec_get_prev(page_get_supremum_rec(new_page));
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ /* Here, "ret" may be pointing to a user record or the
+ predefined infimum record. */
+
+ if (page_rec_is_infimum(rec)) {
+ return(ret);
+ }
+
+ mtr_log_t log_mode = MTR_LOG_NONE;
+
+ if (new_page_zip) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ page_cur_position(ret, new_block, &cur2);
+
+ const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0;
+
+ /* Copy records from the original page to the new page */
+ if (index->is_spatial()) {
+ ut_ad(!index->is_instant());
+ ulint max_to_move = page_get_n_recs(
+ buf_block_get_frame(block));
+ heap = mem_heap_create(256);
+
+ rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
+ heap,
+ sizeof (*rec_move) * max_to_move));
+
+ /* For spatial index, we need to insert recs one by one
+ to keep recs ordered. */
+ rtr_page_copy_rec_list_start_no_locks(new_block,
+ block, rec, index, heap,
+ rec_move, max_to_move,
+ &num_moved, mtr);
+ } else {
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(cur1.rec, index, offsets,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+ cur2.rec = page_cur_insert_rec_low(&cur2, index,
+ cur1.rec, offsets,
+ mtr);
+ ut_a(cur2.rec);
+
+ page_cur_move_to_next(&cur1);
+ ut_ad(!(rec_get_info_bits(cur1.rec,
+ page_is_comp(new_page))
+ & REC_INFO_MIN_REC_FLAG));
+ }
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below.
+ Multiple transactions cannot simultaneously operate on the
+ same temp-table in parallel.
+	max_trx_id is ignored for temp tables because it is not
+	required for MVCC. */
+ if (n_core && dict_index_is_sec_or_ibuf(index)
+ && !index->table->is_temporary()) {
+ page_update_max_trx_id(new_block,
+ new_page_zip,
+ page_get_max_trx_id(block->frame),
+ mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
+ goto zip_reorganize;);
+
+ if (!page_zip_compress(new_block, index,
+ page_zip_level, mtr)) {
+ ulint ret_pos;
+#ifndef DBUG_OFF
+zip_reorganize:
+#endif /* DBUG_OFF */
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ret_pos = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the predecessor
+ of the predefined supremum record. If it was
+ the predefined infimum record, then it would
+ still be the infimum, and we would have
+ ret_pos == 0. */
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index,
+ page_zip_level, mtr))) {
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(NULL);
+ }
+
+ /* The page was reorganized: Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ if (dict_table_is_locking_disabled(index->table)) {
+ } else if (dict_index_is_spatial(index)) {
+ lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
+ } else {
+ lock_move_rec_list_start(new_block, block, rec, ret);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ btr_search_move_or_delete_hash_entries(new_block, block);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(size == ULINT_UNDEFINED || size < srv_page_size);
+ ut_ad(page_align(rec) == block->frame);
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!block->page.zip.data ||
+ page_zip_validate(&block->page.zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_supremum(rec))
+ {
+ ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+ /* Nothing to do, there are no records bigger than the page supremum. */
+ return;
+ }
+
+ if (page_rec_is_infimum(rec) || n_recs == page_get_n_recs(block->frame) ||
+ rec == (page_is_comp(block->frame)
+ ? page_rec_get_next_low(block->frame + PAGE_NEW_INFIMUM, 1)
+ : page_rec_get_next_low(block->frame + PAGE_OLD_INFIMUM, 0)))
+ {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ }
+
+#if 0 // FIXME: consider deleting the last record as a special case
+ if (page_rec_is_last(rec))
+ {
+ page_cur_t cursor= { index, rec, offsets, block };
+ page_cur_delete_rec(&cursor, index, offsets, mtr);
+ return;
+ }
+#endif
+
+ /* The page becomes invalid for optimistic searches */
+ buf_block_modify_clock_inc(block);
+
+ const ulint n_core= page_is_leaf(block->frame) ? index->n_core_fields : 0;
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+
+#if 1 // FIXME: remove this, and write a minimal amount of log!
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ ut_ad(page_is_comp(block->frame));
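+    /* A ROW_FORMAT=COMPRESSED page is emptied one record at a time,
+    so that page_cur_delete_rec() keeps the dense page_zip directory
+    and the modification log consistent after every step. */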
+ do
+ {
+ page_cur_t cur;
+ page_cur_position(rec, block, &cur);
+ offsets= rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ rec= rec_get_next_ptr(rec, TRUE);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&block->page.zip, block->frame, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&cur, index, offsets, mtr);
+ }
+ while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return;
+ }
+#endif
+
+ byte *prev_rec= page_rec_get_prev(rec);
+ byte *last_rec= page_rec_get_prev(page_get_supremum_rec(block->frame));
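+  /* After unlinking, prev_rec will point directly to the supremum,
+  and the deleted chain rec..last_rec will be prepended to the
+  PAGE_FREE list, with last_rec linked to the old free-list head. */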
+
+ // FIXME: consider a special case of shrinking PAGE_HEAP_TOP
+
+ const bool scrub= srv_immediate_scrub_data_uncompressed;
+ if (scrub || size == ULINT_UNDEFINED || n_recs == ULINT_UNDEFINED)
+ {
+ rec_t *rec2= rec;
+ /* Calculate the sum of sizes and the number of records */
+ size= 0;
+ n_recs= 0;
+
+ do
+ {
+ offsets = rec_get_offsets(rec2, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ ulint s= rec_offs_size(offsets);
+ ut_ad(ulint(rec2 - block->frame) + s - rec_offs_extra_size(offsets) <
+ srv_page_size);
+ ut_ad(size + s < srv_page_size);
+ size+= s;
+ n_recs++;
+
+ if (scrub)
+ mtr->memset(block, page_offset(rec2), rec_offs_data_size(offsets), 0);
+
+ rec2 = page_rec_get_next(rec2);
+ }
+ while (!page_rec_is_supremum(rec2));
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
+
+ ut_ad(size < srv_page_size);
+
+ ulint slot_index, n_owned;
+ {
+ const rec_t *owner_rec= rec;
+ ulint count= 0;
+
+ if (page_is_comp(block->frame))
+ while (!(n_owned= rec_get_n_owned_new(owner_rec)))
+ {
+ count++;
+ owner_rec= rec_get_next_ptr_const(owner_rec, TRUE);
+ }
+ else
+ while (!(n_owned= rec_get_n_owned_old(owner_rec)))
+ {
+ count++;
+ owner_rec= rec_get_next_ptr_const(owner_rec, FALSE);
+ }
+
+ ut_ad(n_owned > count);
+ n_owned-= count;
+ slot_index= page_dir_find_owner_slot(owner_rec);
+ ut_ad(slot_index > 0);
+ }
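+  /* Everything from rec to the end of the record list is deleted.
+  Out of owner_rec's group, count + 1 records (rec..owner_rec) are
+  removed; the supremum becomes the owner of the survivors plus
+  itself, that is, n_owned - (count + 1) + 1 = n_owned - count
+  records, which is the adjusted n_owned computed above. */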
+
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+ (PAGE_N_DIR_SLOTS + PAGE_HEADER +
+ block->frame), slot_index + 1);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
+ (PAGE_LAST_INSERT + PAGE_HEADER +
+ block->frame), 0U);
+ /* Catenate the deleted chain segment to the page free list */
+ alignas(4) byte page_header[4];
+ byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE +
+ block->frame);
+ const uint16_t free= page_header_get_field(block->frame, PAGE_FREE);
+ static_assert(PAGE_FREE + 2 == PAGE_GARBAGE, "compatibility");
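+  /* Because PAGE_FREE and PAGE_GARBAGE are adjacent 16-bit fields,
+  a single aligned 4-byte write both points PAGE_FREE at the first
+  deleted record and adds the freed bytes to PAGE_GARBAGE. */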
+
+ mach_write_to_2(page_header, page_offset(rec));
+ mach_write_to_2(my_assume_aligned<2>(page_header + 2),
+ mach_read_from_2(my_assume_aligned<2>(page_free + 2)) +
+ size);
+ mtr->memcpy(*block, page_free, page_header, 4);
+
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_n_recs,
+ ulint{mach_read_from_2(page_n_recs)} - n_recs);
+
+ /* Update the page directory; there is no need to balance the number
+ of the records owned by the supremum record, as it is allowed to be
+ less than PAGE_DIR_SLOT_MIN_N_OWNED */
+ page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, slot_index);
+
+ if (page_is_comp(block->frame))
+ {
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_NEW_SUPREMUM);
+ byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + block->frame;
+ byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+ n_owned << REC_N_OWNED_SHIFT);
+#if 0 // FIXME: implement minimal logging for ROW_FORMAT=COMPRESSED
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ *owned= new_owned;
+ memcpy_aligned<2>(PAGE_N_DIR_SLOTS + PAGE_HEADER + block->page.zip.data,
+ PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame,
+ PAGE_N_RECS + 2 - PAGE_N_DIR_SLOTS);
+ // TODO: the equivalent of page_zip_dir_delete() for all records
+ mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
+ (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+ mach_write_to_2(last_rec - REC_NEXT, free
+ ? static_cast<uint16_t>(free - page_offset(last_rec))
+ : 0U);
+ return;
+ }
+#endif
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+ mtr->write<2>(*block, prev_rec - REC_NEXT, static_cast<uint16_t>
+ (PAGE_NEW_SUPREMUM - page_offset(prev_rec)));
+ mtr->write<2>(*block, last_rec - REC_NEXT, free
+ ? static_cast<uint16_t>(free - page_offset(last_rec))
+ : 0U);
+ }
+ else
+ {
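+    /* In ROW_FORMAT=REDUNDANT, next-record pointers are absolute
+    offsets within the page (ROW_FORMAT=COMPACT stores them relative
+    to the current record), so plain offsets are written below. */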
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_OLD_SUPREMUM);
+ byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + block->frame;
+ byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
+ n_owned << REC_N_OWNED_SHIFT);
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
+ mtr->write<2>(*block, prev_rec - REC_NEXT, PAGE_OLD_SUPREMUM);
+ mtr->write<2>(*block, last_rec - REC_NEXT, free);
+ }
+}
+
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t cur1;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(page_align(rec) == block->frame);
+ ut_ad((ibool) !!page_rec_is_comp(rec)
+ == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+
+ /* page_zip_validate() would detect a min_rec_mark mismatch
+ in btr_page_split_and_insert()
+ between btr_attach_half_pages() and insert_page = ...
+ when btr_page_get_split_rec_to_left() holds
+ (direction == FSP_DOWN). */
+ ut_a(!page_zip
+ || page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+ return;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ const ulint n_core = page_rec_is_leaf(rec)
+ ? index->n_core_fields : 0;
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(page_cur_get_rec(&cur1), index,
+ offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ page_cur_delete_rec(&cur1, index, offsets, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ ulint old_data_size;
+ ulint new_data_size;
+ ulint old_n_recs;
+ ulint new_n_recs;
+
+ ut_ad(!dict_index_is_spatial(index));
+
+ old_data_size = page_get_data_size(new_page);
+ old_n_recs = page_get_n_recs(new_page);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!new_page_zip == !page_zip);
+ ut_a(!new_page_zip
+ || page_zip_validate(new_page_zip, new_page, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page_align(split_rec),
+ index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ new_data_size = page_get_data_size(new_page);
+ new_n_recs = page_get_n_recs(new_page);
+
+ ut_ad(new_data_size >= old_data_size);
+
+ page_delete_rec_list_end(split_rec, block, index,
+ new_n_recs - old_n_recs,
+ new_data_size - old_data_size, mtr);
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ page_delete_rec_list_start(split_rec, block, index, mtr);
+
+ return(TRUE);
+}
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ const page_dir_slot_t* slot;
+ ulint i;
+ ulint n_owned;
+ const rec_t* rec;
+
+ if (nth == 0) {
+ return(page_get_infimum_rec(page));
+ }
+
+ ut_ad(nth < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
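+	/* Scan the directory from slot 0 (which points to the
+	infimum), subtracting each slot's n_owned count until reaching
+	the slot whose group contains the nth record; then walk the
+	record list forward from the record owned by the previous
+	slot. */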
+
+ for (i = 0;; i++) {
+
+ slot = page_dir_get_nth_slot(page, i);
+ n_owned = page_dir_slot_get_n_owned(slot);
+
+ if (n_owned > nth) {
+ break;
+ } else {
+ nth -= n_owned;
+ }
+ }
+
+ ut_ad(i > 0);
+ slot = page_dir_get_nth_slot(page, i - 1);
+ rec = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ do {
+ rec = page_rec_get_next_low(rec, TRUE);
+ ut_ad(rec);
+ } while (nth--);
+ } else {
+ do {
+ rec = page_rec_get_next_low(rec, FALSE);
+ ut_ad(rec);
+ } while (nth--);
+ }
+
+ return(rec);
+}
+
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records.
+@return number of records */
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_dir_slot_t* slot;
+ const rec_t* slot_rec;
+ const page_t* page;
+ ulint i;
+ lint n = 0;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
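+	/* First walk forward to the record that owns rec (the first
+	record with a nonzero n_owned count), counting each step as -1.
+	Then sum the n_owned counts of the directory slots up to and
+	including the owner's slot; that sum counts every record from
+	the infimum through the owner. The net result, after the final
+	n--, is the number of records preceding rec, counting the
+	infimum. */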
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, TRUE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += lint(rec_get_n_owned_new(slot_rec));
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, FALSE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += lint(rec_get_n_owned_old(slot_rec));
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ }
+
+ n--;
+
+ ut_ad(n >= 0);
+ ut_ad((ulong) n < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
+
+ return((ulint) n);
+}
+
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: record descriptor */
+{
+ ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ rec_print_new(stderr, rec, offsets);
+ if (page_rec_is_comp(rec)) {
+ ib::info() << "n_owned: " << rec_get_n_owned_new(rec)
+ << "; heap_no: " << rec_get_heap_no_new(rec)
+ << "; next rec: " << rec_get_next_offs(rec, TRUE);
+ } else {
+ ib::info() << "n_owned: " << rec_get_n_owned_old(rec)
+ << "; heap_no: " << rec_get_heap_no_old(rec)
+ << "; next rec: " << rec_get_next_offs(rec, FALSE);
+ }
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+}
+
+#ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ ulint n;
+ ulint i;
+ page_dir_slot_t* slot;
+
+ n = page_dir_get_n_slots(page);
+
+ fprintf(stderr, "--------------------------------\n"
+ "PAGE DIRECTORY\n"
+ "Page address %p\n"
+ "Directory stack top at offs: %lu; number of slots: %lu\n",
+ page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)),
+ (ulong) n);
+ for (i = 0; i < n; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ if ((i == pr_n) && (i < n - pr_n)) {
+ fputs(" ... \n", stderr);
+ }
+ if ((i < pr_n) || (i >= n - pr_n)) {
+ fprintf(stderr,
+ "Contents of slot: %lu: n_owned: %lu,"
+ " rec offs: %lu\n",
+ (ulong) i,
+ (ulong) page_dir_slot_get_n_owned(slot),
+ (ulong)
+ page_offset(page_dir_slot_get_rec(slot)));
+ }
+ }
+ fprintf(stderr, "Total of %lu records\n"
+ "--------------------------------\n",
+ (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page)));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ page_t* page = block->frame;
+ page_cur_t cur;
+ ulint count;
+ ulint n_recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE RECORD LIST\n"
+ "Page address %p\n", page);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cur);
+ count = 0;
+ for (;;) {
+		offsets = rec_get_offsets(cur.rec, index, offsets,
+					  page_rec_is_leaf(cur.rec)
+					  ? index->n_core_fields : 0,
+					  ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+
+ if (count == pr_n) {
+ break;
+ }
+ if (page_cur_is_after_last(&cur)) {
+ break;
+ }
+ page_cur_move_to_next(&cur);
+ count++;
+ }
+
+ if (n_recs > 2 * pr_n) {
+ fputs(" ... \n", stderr);
+ }
+
+ while (!page_cur_is_after_last(&cur)) {
+ page_cur_move_to_next(&cur);
+
+ if (count + pr_n >= n_recs) {
+			offsets = rec_get_offsets(cur.rec, index, offsets,
+						  page_rec_is_leaf(cur.rec)
+						  ? index->n_core_fields
+						  : 0,
+						  ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+ }
+ count++;
+ }
+
+ fprintf(stderr,
+ "Total of %lu records \n"
+ "--------------------------------\n",
+ (ulong) (count + 1));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+ const page_t* page)
+{
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE HEADER INFO\n"
+ "Page address %p, n records %u (%s)\n"
+ "n dir slots %u, heap top %u\n"
+ "Page n heap %u, free %u, garbage %u\n"
+ "Page last insert %u, direction %u, n direction %u\n",
+ page, page_header_get_field(page, PAGE_N_RECS),
+ page_is_comp(page) ? "compact format" : "original format",
+ page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ page_header_get_field(page, PAGE_HEAP_TOP),
+ page_dir_get_n_heap(page),
+ page_header_get_field(page, PAGE_FREE),
+ page_header_get_field(page, PAGE_GARBAGE),
+ page_header_get_field(page, PAGE_LAST_INSERT),
+ page_get_direction(page),
+ page_header_get_field(page, PAGE_N_DIRECTION));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn) /*!< in: print rn first and last records
+ in directory */
+{
+ page_t* page = block->frame;
+
+ page_header_print(page);
+ page_dir_print(page, dn);
+ page_print_list(block, index, rn);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_owned;
+ ulint heap_no;
+ const page_t* page;
+
+ page = page_align(rec);
+ ut_a(!page_is_comp(page) == !rec_offs_comp(offsets));
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+
+ if (page_rec_is_comp(rec)) {
+ n_owned = rec_get_n_owned_new(rec);
+ heap_no = rec_get_heap_no_new(rec);
+ } else {
+ n_owned = rec_get_n_owned_old(rec);
+ heap_no = rec_get_heap_no_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) {
+ ib::warn() << "Dir slot of rec " << page_offset(rec)
+ << ", n owned too big " << n_owned;
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) {
+ ib::warn() << "Heap no of rec " << page_offset(rec)
+ << " too big " << heap_no << " "
+ << page_dir_get_n_heap(page);
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+void
+page_check_dir(
+/*===========*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint n_slots;
+ ulint infimum_offs;
+ ulint supremum_offs;
+
+ n_slots = page_dir_get_n_slots(page);
+ infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0));
+ supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) {
+
+ ib::fatal() << "Page directory corruption: infimum not"
+ " pointed to";
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) {
+
+ ib::fatal() << "Page directory corruption: supremum not"
+ " pointed to";
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(!page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) {
+ ib::error() << "Nonsensical number of page dir slots: "
+ << n_slots;
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+ ib::error()
+ << "Record heap and dir overlap on a page, heap top "
+ << page_header_get_field(page, PAGE_HEAP_TOP)
+ << ", dir "
+ << page_offset(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Record " << (rec - page)
+ << " is above rec heap top "
+ << (rec_heap_top - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) != 0)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec)
+ != own_count)) {
+
+ ib::error() << "Wrong owned count "
+ << rec_get_n_owned_old(rec)
+ << ", " << own_count << ", rec "
+ << (rec - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ ib::error() << "Dir slot does not point"
+ " to right rec " << (rec - page);
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, FALSE) >= srv_page_size)) {
+
+ ib::error() << "Next record offset nonsensical "
+ << rec_get_next_offs(rec, FALSE) << " for rec "
+ << (rec - page);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page record list appears"
+ " to be circular " << count;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+ ib::error() << "n owned is zero in a supremum rec";
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong "
+ << slot_no << ", " << (n_slots - 1);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+
+ goto func_exit;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + srv_page_size)) {
+ ib::error() << "Free list record has"
+ " a nonsensical offset " << (rec - page);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Free list record " << (rec - page)
+ << " is above rec heap top "
+ << (rec_heap_top - page);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page free list appears"
+ " to be circular " << count;
+ goto func_exit;
+ }
+
+ ulint offs = rec_get_next_offs(rec, FALSE);
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted " << count;
+ goto func_exit;
+ }
+
+ rec = page + offs;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << ", " << (count + 1);
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) {
+ ib::error() << "Nonsensical number of page dir slots: "
+ << n_slots;
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ ib::error() << "Record heap and dir overlap on a page,"
+ " heap top "
+ << page_header_get_field(page, PAGE_HEAP_TOP)
+ << ", dir " << page_offset(
+ page_dir_get_nth_slot(page, n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+
+ ib::error() << "Record " << page_offset(rec)
+ << " is above rec heap top "
+ << page_offset(rec_heap_top);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec)
+ != own_count)) {
+
+ ib::error() << "Wrong owned count "
+ << rec_get_n_owned_new(rec) << ", "
+ << own_count << ", rec "
+ << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ ib::error() << "Dir slot does not point"
+ " to right rec " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, TRUE) >= srv_page_size)) {
+
+ ib::error() << "Next record offset nonsensical "
+ << rec_get_next_offs(rec, TRUE)
+ << " for rec " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page record list appears to be"
+ " circular " << count;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+ ib::error() << "n owned is zero in a supremum rec";
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong " << slot_no << ", "
+ << (n_slots - 1);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+
+ goto func_exit;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + srv_page_size)) {
+
+ ib::error() << "Free list record has"
+ " a nonsensical offset " << page_offset(rec);
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ ib::error() << "Free list record " << page_offset(rec)
+ << " is above rec heap top "
+ << page_offset(rec_heap_top);
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > srv_page_size)) {
+ ib::error() << "Page free list appears to be"
+ " circular " << count;
+ goto func_exit;
+ }
+
+ const ulint offs = rec_get_next_offs(rec, TRUE);
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted " << count;
+ goto func_exit;
+ }
+
+ rec = page + offs;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << ", " << (count + 1);
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/** Check the consistency of an index page.
+@param[in] page index page
+@param[in] index B-tree or R-tree index
+@return whether the page is valid */
+bool page_validate(const page_t* page, const dict_index_t* index)
+{
+ const page_dir_slot_t* slot;
+ const rec_t* rec;
+ const rec_t* old_rec = NULL;
+ const rec_t* first_rec = NULL;
+ ulint offs = 0;
+ ulint n_slots;
+ ibool ret = TRUE;
+ ulint i;
+ rec_offs offsets_1[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets_2[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_1;
+ rec_offs* old_offsets = offsets_2;
+
+ rec_offs_init(offsets_1);
+ rec_offs_init(offsets_2);
+
+#ifdef UNIV_GIS_DEBUG
+ if (dict_index_is_spatial(index)) {
+ fprintf(stderr, "Page no: %lu\n", page_get_page_no(page));
+ }
+#endif /* UNIV_GIS_DEBUG */
+
+ if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
+ != dict_table_is_comp(index->table))) {
+ ib::error() << "'compact format' flag mismatch";
+func_exit2:
+ ib::error() << "Apparent corruption in space "
+ << page_get_space_id(page) << " page "
+ << page_get_page_no(page)
+ << " of index " << index->name
+ << " of table " << index->table->name;
+ return FALSE;
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(!page_simple_validate_new(page))) {
+ goto func_exit2;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!page_simple_validate_old(page))) {
+ goto func_exit2;
+ }
+ }
+
+ /* Multiple transactions cannot simultaneously operate on the
+ same temp-table in parallel.
+	max_trx_id is ignored for temp tables because it is not
+	required for MVCC. */
+ if (!page_is_leaf(page) || page_is_empty(page)
+ || !dict_index_is_sec_or_ibuf(index)
+ || index->table->is_temporary()) {
+ } else if (trx_id_t sys_max_trx_id = trx_sys.get_max_trx_id()) {
+ trx_id_t max_trx_id = page_get_max_trx_id(page);
+
+ if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) {
+ ib::error() << "PAGE_MAX_TRX_ID out of bounds: "
+ << max_trx_id << ", " << sys_max_trx_id;
+ ret = FALSE;
+ }
+ } else {
+ ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN);
+ }
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+ <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+ ib::warn() << "Record heap and directory overlap";
+ goto func_exit2;
+ }
+
+ switch (uint16_t type = fil_page_get_type(page)) {
+ case FIL_PAGE_RTREE:
+ if (!index->is_spatial()) {
+wrong_page_type:
+ ib::warn() << "Wrong page type " << type;
+ ret = FALSE;
+ }
+ break;
+ case FIL_PAGE_TYPE_INSTANT:
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ break;
+ }
+ goto wrong_page_type;
+ case FIL_PAGE_INDEX:
+ if (index->is_spatial()) {
+ goto wrong_page_type;
+ }
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ goto wrong_page_type;
+ }
+ break;
+ default:
+ goto wrong_page_type;
+ }
+
+ /* The following buffer is used to check that the
+ records in the page record heap do not overlap */
+	mem_heap_t* heap = mem_heap_create(srv_page_size + 200);
+ byte* buf = static_cast<byte*>(mem_heap_zalloc(heap, srv_page_size));
+
+ /* Validate the record list in a loop checking also that
+ it is consistent with the directory. */
+ ulint count = 0, data_size = 0, own_count = 1, slot_no = 0;
+ ulint info_bits;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+ for (;;) {
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_is_comp(page) && page_rec_is_user_rec(rec)
+ && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+ == page_is_leaf(page))) {
+ ib::error() << "'node_ptr' flag mismatch";
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ info_bits = rec_get_info_bits(rec, page_is_comp(page));
+ if (info_bits
+ & ~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) {
+ ib::error() << "info_bits has an incorrect value "
+ << info_bits;
+ ret = false;
+ }
+
+ if (rec == first_rec) {
+ if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ if (page_has_prev(page)) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set on non-left page";
+ ret = false;
+ } else if (!page_is_leaf(page)) {
+ /* leftmost node pointer page */
+ } else if (!index->is_instant()) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set in a leaf-page record";
+ ret = false;
+ } else if (!(info_bits & REC_INFO_DELETED_FLAG)
+ != !index->table->instant) {
+ ib::error() << (index->table->instant
+ ? "Metadata record "
+ "is not delete-marked"
+ : "Metadata record "
+ "is delete-marked");
+ ret = false;
+ }
+ } else if (!page_has_prev(page)
+ && index->is_instant()) {
+ ib::error() << "Metadata record is missing";
+ ret = false;
+ }
+ } else if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG record is not "
+ "first in page";
+ ret = false;
+ }
+
+ if (page_is_comp(page)) {
+ const rec_comp_status_t status = rec_get_status(rec);
+ if (status != REC_STATUS_ORDINARY
+ && status != REC_STATUS_NODE_PTR
+ && status != REC_STATUS_INFIMUM
+ && status != REC_STATUS_SUPREMUM
+ && status != REC_STATUS_INSTANT) {
+ ib::error() << "impossible record status "
+ << status;
+ ret = false;
+ } else if (page_rec_is_infimum(rec)) {
+ if (status != REC_STATUS_INFIMUM) {
+ ib::error()
+ << "infimum record has status "
+ << status;
+ ret = false;
+ }
+ } else if (page_rec_is_supremum(rec)) {
+ if (status != REC_STATUS_SUPREMUM) {
+ ib::error() << "supremum record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!page_is_leaf(page)) {
+ if (status != REC_STATUS_NODE_PTR) {
+ ib::error() << "node ptr record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!index->is_instant()
+ && status == REC_STATUS_INSTANT) {
+ ib::error() << "instantly added record in a "
+ "non-instant index";
+ ret = false;
+ }
+ }
+
+ /* Check that the records are in the ascending order */
+ if (count >= PAGE_HEAP_NO_USER_LOW
+ && !page_rec_is_supremum(rec)) {
+
+			/* Use a separate variable, so that the
+			function's return status "ret" is not
+			shadowed. */
+			int cmp = cmp_rec_rec(
+				rec, old_rec, offsets, old_offsets, index);
+
+			/* For spatial indexes, equal records are
+			allowed on non-leaf levels. */
+			if (cmp <= 0 && !(cmp == 0 && index->is_spatial()
+					  && !page_is_leaf(page))) {
+
+ ib::error() << "Records in wrong order";
+
+ fputs("\nInnoDB: previous record ", stderr);
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr,
+ old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ putc('\n', stderr);
+
+ } else {
+ rec_print_new(stderr, old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+
+ ret = FALSE;
+ }
+ }
+
+ if (page_rec_is_user_rec(rec)) {
+
+ data_size += rec_offs_size(offsets);
+
+#if defined(UNIV_GIS_DEBUG)
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_GIS_DEBUG */
+ }
+
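+		/* Mark in buf[] every byte occupied by this record.
+		A byte that is marked twice means that two records
+		overlap within the page heap. */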
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= srv_page_size)) {
+ ib::error() << "Record offset out of bounds: "
+ << offs << '+' << i;
+ ret = FALSE;
+ goto next_rec;
+ }
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ ib::error() << "Record overlaps another: "
+ << offs << '+' << i;
+ ret = FALSE;
+ break;
+ }
+ buf[offs + i] = 1;
+ }
+
+ if (ulint rec_own_count = page_is_comp(page)
+ ? rec_get_n_owned_new(rec)
+ : rec_get_n_owned_old(rec)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_own_count != own_count)) {
+ ib::error() << "Wrong owned count at " << offs
+ << ": " << rec_own_count
+ << ", " << own_count;
+ ret = FALSE;
+ }
+
+ if (page_dir_slot_get_rec(slot) != rec) {
+ ib::error() << "Dir slot does not"
+ " point to right rec at " << offs;
+ ret = FALSE;
+ }
+
+ if (ret) {
+ page_dir_slot_check(slot);
+ }
+
+ own_count = 0;
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+next_rec:
+ if (page_rec_is_supremum(rec)) {
+ break;
+ }
+
+ count++;
+ own_count++;
+ old_rec = rec;
+ rec = page_rec_get_next_const(rec);
+
+ if (page_rec_is_infimum(old_rec)
+ && page_rec_is_user_rec(rec)) {
+ first_rec = rec;
+ }
+
+ /* set old_offsets to offsets; recycle offsets */
+ std::swap(old_offsets, offsets);
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+
+ goto n_owned_zero;
+ }
+ } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+n_owned_zero:
+ ib::error() << "n owned is zero at " << offs;
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ ib::error() << "n slots wrong " << slot_no << " "
+ << (n_slots - 1);
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS))
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ ib::error() << "n recs wrong "
+ << page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW << " " << (count + 1);
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) {
+ ib::error() << "Summed data size " << data_size
+ << ", returned by func " << page_get_data_size(page);
+ ret = FALSE;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ ret = FALSE;
+next_free:
+ const ulint offs = rec_get_next_offs(
+ rec, page_is_comp(page));
+ if (!offs) {
+ break;
+ }
+ if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM
+ || offs >= srv_page_size)) {
+ ib::error() << "Page free list is corrupted";
+ ret = FALSE;
+ break;
+ }
+
+ rec = page + offs;
+ continue;
+ }
+
+ count++;
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= srv_page_size)) {
+ ib::error() << "Free record offset out of bounds: "
+ << offs << '+' << i;
+ ret = FALSE;
+ goto next_free;
+ }
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ ib::error() << "Free record overlaps another: "
+ << offs << '+' << i;
+ ret = FALSE;
+ break;
+ }
+ buf[offs + i] = 1;
+ }
+
+ goto next_free;
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+ ib::error() << "N heap is wrong "
+ << page_dir_get_n_heap(page) << " " << count + 1;
+ ret = FALSE;
+ }
+
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(!ret)) {
+ goto func_exit2;
+ }
+
+ return(ret);
+}
+
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no)/*!< in: heap number */
+{
+ const rec_t* rec;
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_new(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ }
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_old(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ }
+ }
+}
+
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+const rec_t*
+page_find_rec_max_not_deleted(
+ const page_t* page)
+{
+ const rec_t* rec = page_get_infimum_rec(page);
+ const rec_t* prev_rec = NULL; // remove warning
+
+ /* Because the page infimum is never delete-marked
+	and never the metadata pseudo-record (MIN_REC_FLAG),
+ prev_rec will always be assigned to it first. */
+ ut_ad(!rec_get_info_bits(rec, page_rec_is_comp(rec)));
+ ut_ad(page_is_leaf(page));
+
+ if (page_is_comp(page)) {
+ do {
+ if (!(rec[-REC_NEW_INFO_BITS]
+ & (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG))) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, true);
+ } while (rec != page + PAGE_NEW_SUPREMUM);
+ } else {
+ do {
+ if (!(rec[-REC_OLD_INFO_BITS]
+ & (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG))) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, false);
+ } while (rec != page + PAGE_OLD_SUPREMUM);
+ }
+ return(prev_rec);
+}
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
new file mode 100644
index 00000000..331ecbfb
--- /dev/null
+++ b/storage/innobase/page/page0zip.cc
@@ -0,0 +1,4713 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.cc
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "page0page.h"
+#include "buf0checksum.h"
+#include "ut0crc32.h"
+#include "zlib.h"
+#include "span.h"
+
+using st_::span;
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "log0recv.h"
+#include "row0row.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "lock0lock.h"
+#include "srv0srv.h"
+#include "buf0lru.h"
+#include "srv0mon.h"
+
+#include <map>
+#include <algorithm>
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+page_zip_stat_per_index_t page_zip_stat_per_index;
+
+/** Compression level to be used by zlib. Settable by user. */
+uint page_zip_level;
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compress, we compare that the records are there, and on uncompress we
+restore the records. */
+/** Extra bytes of an infimum record */
+static const byte infimum_extra[] = {
+ 0x01, /* info_bits=0, n_owned=1 */
+ 0x00, 0x02 /* heap_no=0, status=2 */
+ /* ?, ? */ /* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record */
+static const byte infimum_data[] = {
+ 0x69, 0x6e, 0x66, 0x69,
+ 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record */
+static const byte supremum_extra_data alignas(4) [] = {
+ /* 0x0?, */ /* info_bits=0, n_owned=1..8 */
+ 0x00, 0x0b, /* heap_no=1, status=3 */
+ 0x00, 0x00, /* next=0 */
+ 0x73, 0x75, 0x70, 0x72,
+ 0x65, 0x6d, 0x75, 0x6d /* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+@param b in: memory block
+@param s in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
+/** Assert that a BLOB pointer is filled with zero bytes.
+@param b in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
+
+/* Enable some extra debugging output. This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+MY_ATTRIBUTE((format (printf, 1, 2)))
+/**********************************************************************//**
+Report a failure to decompress or compress.
+@return number of characters printed */
+static
+int
+page_zip_fail_func(
+/*===============*/
+ const char* fmt, /*!< in: printf(3) format string */
+ ...) /*!< in: arguments corresponding to fmt */
+{
+ int res;
+ va_list ap;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ va_start(ap, fmt);
+ res = vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ return(res);
+}
+/** Wrapper for page_zip_fail_func()
+@param fmt_args in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func()
+@param fmt_args ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+{
+ ulint size = zip_size
+ /* subtract the page header and the longest
+ uncompressed data needed for one record */
+ - (PAGE_DATA
+ + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE
+ + 1/* encoded heap_no==2 in page_zip_write_rec() */
+ + 1/* end of modification log */
+ - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+ /* subtract the space for page_zip_fields_encode() */
+ - compressBound(static_cast<uLong>(2 * (n_fields + 1)));
+ return(lint(size) > 0 ? size : 0);
+}
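+
+/* Worked example (a sketch; the exact numbers depend on PAGE_DATA and
+on the zlib version's compressBound()): with zip_size = 8192 and
+n_fields = 5, the estimate is 8192 minus the page header overhead and
+one clustered-index leaf slot, minus compressBound(12) for the encoded
+field descriptors. If the subtraction underflows, 0 is returned. */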
+
+/** Check whether a tuple is too big for compressed table
+@param[in] index dict index object
+@param[in] entry entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+ const dict_index_t* index,
+ const dtuple_t* entry)
+{
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* Estimate the free space of an empty compressed page.
+ Subtract one byte for the encoded heap_no in the
+ modification log. */
+ ulint free_space_zip = page_zip_empty_size(
+ index->n_fields, zip_size);
+ ulint n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(zip_size);
+
+ if (free_space_zip == 0) {
+ return(true);
+ }
+
+ /* Subtract one byte for the encoded heap_no in the
+ modification log. */
+ free_space_zip--;
+
+ /* There should be enough room for two node pointer
+ records on an empty non-leaf page. This prevents
+ infinite page splits. */
+
+ if (entry->n_fields >= n_uniq
+ && (REC_NODE_PTR_SIZE
+ + rec_get_converted_size_comp_prefix(
+ index, entry->fields, n_uniq, NULL)
+ /* On a compressed page, there is
+ a two-byte entry in the dense
+ page directory for every record.
+ But there is no record header. */
+ - (REC_N_NEW_EXTRA_BYTES - 2)
+ > free_space_zip / 2)) {
+ return(true);
+ }
+
+ return(false);
+}
+
+/*************************************************************//**
+Gets the number of elements in the dense page directory,
+including deleted records (the free list).
+@return number of elements in the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_elems(
+/*===============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ return ulint(page_dir_get_n_heap(page_zip->data))
+ - PAGE_HEAP_NO_USER_LOW;
+}
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
+}
+
+/*************************************************************//**
+Gets an offset to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return offset of the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_start_offs(
+/*====================*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint n_dense) /*!< in: directory size */
+{
+ ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
+
+ return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@param[in] n_dense number of entries in the directory
+@return pointer to the dense page directory */
+#define page_zip_dir_start_low(page_zip, n_dense) \
+ ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@return pointer to the dense page directory */
+#define page_zip_dir_start(page_zip) \
+ page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
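+
+/* Trailer layout (a sketch): the dense page directory grows from the
+end of page_zip->data towards lower addresses. Slot 0 occupies the last
+PAGE_ZIP_DIR_SLOT_SIZE bytes of the page, slot 1 the bytes just before
+it, and so on; page_zip_dir_start() therefore points at the slot with
+the highest index. */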
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+only including user records (excluding the free list).
+@return length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * ulint(page_get_n_recs(page_zip->data));
+ ut_ad(size <= page_zip_dir_size(page_zip));
+ return(size);
+}
+
+/*************************************************************//**
+Find the slot of the given record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+ byte* slot, /*!< in: start of records */
+ byte* end, /*!< in: end of records */
+ ulint offset) /*!< in: offset of user record */
+{
+ ut_ad(slot <= end);
+
+ for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
+ if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
+ == offset) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
+ end,
+ offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+ end - page_zip_dir_user_size(page_zip),
+ offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint slot) /*!< in: slot
+ (0=first user record) */
+{
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+ return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
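+
+/* Example (a sketch): with page_zip_get_size() == 8192 and
+PAGE_ZIP_DIR_SLOT_SIZE == 2, slot 0 is read from bytes 8190..8191 of
+page_zip->data and slot 3 from bytes 8184..8185. The bits covered by
+PAGE_ZIP_DIR_SLOT_MASK are the record offset; the remaining high bits
+carry the PAGE_ZIP_DIR_SLOT_DEL and PAGE_ZIP_DIR_SLOT_OWNED flags. */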
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] offset byte offset from b.zip.data
+@param[in] len length of the data to write */
+inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
+ fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
+ ut_ad(page_zip_simple_validate(&b.page.zip));
+ ut_ad(offset + len <= page_zip_get_size(&b.page.zip));
+
+ memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] dest destination within b.zip.data
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
+{
+ byte *d= static_cast<byte*>(dest);
+ const byte *s= static_cast<const byte*>(str);
+ ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
+ if (w != FORCED)
+ {
+ ut_ad(len);
+ const byte *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ zmemcpy(b, d - b.page.zip.data, len);
+}
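+
+/* Example (a sketch): with w == MAYBE_NOP, if dest already holds the
+same bytes as str, nothing is copied and no redo log is written; if
+only the first k bytes match, both the memcpy() and the redo log record
+are shrunk to start at the first differing byte. With w == FORCED the
+whole range is copied and logged unconditionally. */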
+
+/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
+@param[in,out] block ROW_FORMAT=COMPRESSED index page
+@param[in] index the index that the block belongs to
+@param[in,out] mtr mini-transaction */
+static void page_zip_compress_write_log(buf_block_t *block,
+ dict_index_t *index, mtr_t *mtr)
+{
+ ut_ad(!index->is_ibuf());
+
+ if (mtr->get_log_mode() != MTR_LOG_ALL)
+ {
+ ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
+ mtr->get_log_mode() == MTR_LOG_NO_REDO);
+ return;
+ }
+
+ const page_t *page= block->frame;
+ const page_zip_des_t *page_zip= &block->page.zip;
+ /* Read the number of user records. */
+ ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
+ PAGE_HEAP_NO_USER_LOW;
+ /* Multiply by the number of uncompressed bytes stored per record */
+ if (!page_is_leaf(page))
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ else if (index->is_clust())
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
+ DATA_ROLL_PTR_LEN;
+ else
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
+ /* Add the space occupied by BLOB pointers. */
+ trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(page_zip->m_end > PAGE_DATA);
+ compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
+ ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+ mtr->init(block);
+ mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);
+
+ if (trailer_size)
+ mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
+ trailer_size);
+ block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
+}
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec. */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ const rec_t* rec, /*!< in: compact physical record
+ on a B-tree leaf page */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ const page_t* page = page_align(rec);
+ ulint n_ext = 0;
+ ulint i;
+ ulint left;
+ ulint heap_no;
+ ulint n_recs = page_get_n_recs(page_zip->data);
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_is_comp(page));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ left = heap_no - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(!left)) {
+ return(0);
+ }
+
+ for (i = 0; i < n_recs; i++) {
+ const rec_t* r = page + (page_zip_dir_get(page_zip, i)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+
+ if (rec_get_heap_no_new(r) < heap_no) {
+ n_ext += rec_get_n_extern_new(r, index,
+ ULINT_UNDEFINED);
+ if (!--left) {
+ break;
+ }
+ }
+ }
+
+ return(n_ext);
+}
+
+/**********************************************************************//**
+Encode the length of a fixed-length column.
+@return buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+ byte* buf, /*!< in: pointer to buffer where to write */
+ ulint val) /*!< in: value to write */
+{
+ ut_ad(val >= 2);
+
+ if (UNIV_LIKELY(val < 126)) {
+ /*
+ 0 = nullable variable field of at most 255 bytes length;
+ 1 = not null variable field of at most 255 bytes length;
+ 126 = nullable variable field with maximum length >255;
+ 127 = not null variable field with maximum length >255
+ */
+ *buf++ = (byte) val;
+ } else {
+ *buf++ = (byte) (0x80 | val >> 8);
+ *buf++ = (byte) val;
+ }
+
+ return(buf);
+}
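+
+/* Worked examples (a sketch of the encoding): a run of NOT NULL
+fixed-length columns totalling 10 bytes is passed in as
+val = 10 << 1 | 1 = 21 < 126 and becomes the single byte 0x15; a run
+totalling 100 bytes becomes val = 201 >= 126 and is encoded as the two
+bytes 0x80 | (201 >> 8) = 0x80 and (byte) 201 = 0xc9. */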
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields
+ to compress */
+ const dict_index_t* index, /*!< in: index comprising
+ at least n fields */
+ ulint trx_id_pos,
+ /*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */
+{
+ const byte* buf_start = buf;
+ ulint i;
+ ulint col;
+ ulint trx_id_col = 0;
+ /* sum of lengths of preceding non-nullable fixed fields, or 0 */
+ ulint fixed_sum = 0;
+
+ ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+ for (i = col = 0; i < n; i++) {
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+ ulint val;
+
+ if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+ val = 1; /* set the "not nullable" flag */
+ } else {
+ val = 0; /* nullable field */
+ }
+
+ if (!field->fixed_len) {
+ /* variable-length field */
+ const dict_col_t* column
+ = dict_field_get_col(field);
+
+ if (DATA_BIG_COL(column)) {
+ val |= 0x7e; /* max > 255 bytes */
+ }
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ *buf++ = (byte) val;
+ col++;
+ } else if (val) {
+ /* fixed-length non-nullable field */
+
+ if (fixed_sum && UNIV_UNLIKELY
+ (fixed_sum + field->fixed_len
+ > DICT_MAX_FIXED_COL_LEN)) {
+ /* Write out the length of the
+ preceding non-nullable fields,
+ to avoid exceeding the maximum
+ length of a fixed-length column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+ if (fixed_sum) {
+ /* Write out the length of any
+ preceding non-nullable fields,
+ and start a new trx_id column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ col++;
+ }
+
+ trx_id_col = col;
+ fixed_sum = field->fixed_len;
+ } else {
+ /* add to the sum */
+ fixed_sum += field->fixed_len;
+ }
+ } else {
+ /* fixed-length nullable field */
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ buf = page_zip_fixed_field_encode(
+ buf, ulint(field->fixed_len) << 1);
+ col++;
+ }
+ }
+
+ if (fixed_sum) {
+ /* Write out the lengths of last fixed-length columns. */
+ buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+ }
+
+ if (trx_id_pos != ULINT_UNDEFINED) {
+ /* Write out the position of the trx_id column */
+ i = trx_id_col;
+ } else {
+ /* Write out the number of nullable fields */
+ i = index->n_nullable;
+ }
+
+ if (i < 128) {
+ *buf++ = (byte) i;
+ } else {
+ *buf++ = (byte) (0x80 | i >> 8);
+ *buf++ = (byte) i;
+ }
+
+ ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+ return((ulint) (buf - buf_start));
+}
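+
+/* Hypothetical example (a sketch): for n = 3 with a 4-byte NOT NULL
+fixed-length column, a nullable variable-length column of at most 255
+bytes, and the trx_id column at trx_id_pos = 2 (DATA_TRX_ID_LEN = 6),
+the output would be 0x09 (fixed NOT NULL run of 4 bytes), 0x00
+(nullable variable-length field), 0x0d (fixed NOT NULL run of 6 bytes),
+and finally 0x02, the column number of the trx_id column. */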
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+ const page_t* page, /*!< in: compact page */
+ byte* buf, /*!< in: pointer to dense page directory[-1];
+ out: dense directory on compressed page */
+ const rec_t** recs) /*!< in: pointer to an array of 0, or NULL;
+ out: dense page directory sorted by ascending
+ address (and heap_no) */
+{
+ const byte* rec;
+ ulint status;
+ ulint min_mark;
+ ulint heap_no;
+ ulint i;
+ ulint n_heap;
+ ulint offs;
+
+ min_mark = 0;
+
+ if (page_is_leaf(page)) {
+ status = REC_STATUS_ORDINARY;
+ } else {
+ status = REC_STATUS_NODE_PTR;
+ if (UNIV_UNLIKELY(!page_has_prev(page))) {
+ min_mark = REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+ n_heap = page_dir_get_n_heap(page);
+
+ /* Traverse the list of stored records in the collation order,
+ starting from the first user record. */
+
+ rec = page + PAGE_NEW_INFIMUM;
+
+ i = 0;
+
+ for (;;) {
+ ulint info_bits;
+ offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+ break;
+ }
+ rec = page + offs;
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+ ut_a(offs < srv_page_size - PAGE_DIR);
+ ut_a(offs >= PAGE_ZIP_START);
+ compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK
+ & (PAGE_ZIP_DIR_SLOT_MASK + 1)));
+ compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK
+ >= UNIV_ZIP_SIZE_MAX - 1);
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
+ offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+ }
+
+ info_bits = rec_get_info_bits(rec, TRUE);
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ info_bits &= ~REC_INFO_DELETED_FLAG;
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+ ut_a(info_bits == min_mark);
+ /* Only the smallest user record can have
+ REC_INFO_MIN_REC_FLAG set. */
+ min_mark = 0;
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ ut_a(ulint(rec_get_status(rec)) == status);
+ }
+
+ offs = page_header_get_field(page, PAGE_FREE);
+
+ /* Traverse the free list (of deleted records). */
+ while (offs) {
+ ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+ rec = page + offs;
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+
+ ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+ ut_a(ulint(rec_get_status(rec)) == status);
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ offs = rec_get_next_offs(rec, TRUE);
+ }
+
+ /* Ensure that each heap no occurs at least once. */
+ ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
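+
+/* Example of a single dense directory entry (a sketch): a delete-marked
+user record at page offset 0x1a3 that owns a sparse directory slot is
+stored as 0x01a3 | PAGE_ZIP_DIR_SLOT_OWNED | PAGE_ZIP_DIR_SLOT_DEL,
+with the delete mark cleared from the info bits; the decompressor
+restores it in page_zip_set_extra_bytes(). */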
+
+extern "C" {
+
+/**********************************************************************//**
+Allocate memory for zlib. */
+static
+void*
+page_zip_zalloc(
+/*============*/
+ void* opaque, /*!< in/out: memory heap */
+ uInt items, /*!< in: number of items to allocate */
+ uInt size) /*!< in: size of an item in bytes */
+{
+ return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib. */
+static
+void
+page_zip_free(
+/*==========*/
+ void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */
+ void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */
+{
+}
+
+} /* extern "C" */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ z_stream* strm = static_cast<z_stream*>(stream);
+
+ strm->zalloc = page_zip_zalloc;
+ strm->zfree = page_zip_free;
+ strm->opaque = heap;
+}
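+
+/* Usage sketch (this mirrors what page_zip_compress() below does;
+"size", "level" and "window_bits" are placeholders, and the heap must
+be sized for zlib's worst-case allocations):
+
+ z_stream strm;
+ mem_heap_t* heap = mem_heap_create(size);
+ page_zip_set_alloc(&strm, heap);
+ deflateInit2(&strm, level, Z_DEFLATED, window_bits,
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ... deflate() calls ...
+ deflateEnd(&strm);
+ mem_heap_free(heap); // frees all zlib memory; page_zip_free() is a no-op
+*/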
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress(). */
+static bool page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator. */
+static unsigned page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+ FILE* logfile,/*!< in: log file, or NULL */
+ z_streamp strm, /*!< in/out: compressed stream for deflate() */
+ int flush) /*!< in: deflate() flushing method */
+{
+ int status;
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ut_print_buf(stderr, strm->next_in, strm->avail_in);
+ }
+ if (UNIV_LIKELY_NULL(logfile)) {
+ if (fwrite(strm->next_in, 1, strm->avail_in, logfile)
+ != strm->avail_in) {
+ perror("fwrite");
+ }
+ }
+ status = deflate(strm, flush);
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, " -> %d\n", status);
+ }
+ return(status);
+}
+
+/* Redefine deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+@param strm in/out: compressed stream
+@param flush in: flushing method
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Only leaf nodes may contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+
+ /* Compress the data bytes, except node_ptr. */
+ c_stream->next_in = (byte*) rec;
+ c_stream->avail_in = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in, REC_NODE_PTR_SIZE);
+ c_stream->next_in += REC_NODE_PTR_SIZE;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense) /*!< in: size of recs[] */
+{
+ int err = Z_OK;
+
+ ut_ad(n_dense > 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ /* Compress everything up to this record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ MEM_CHECK_DEFINED(c_stream->next_in,
+ c_stream->avail_in);
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ c_stream->next_in = (byte*) rec;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ byte** externs, /*!< in/out: pointer to the next
+ available BLOB pointer */
+ ulint* n_blobs) /*!< in/out: number of
+ externally stored columns */
+{
+ int err;
+ ulint i;
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ /* Store trx_id and roll_ptr
+ in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ i++;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ /* Reserve space for the data at
+ the end of the space reserved for
+ the compressed data and the page
+ modification log. */
+
+ if (UNIV_UNLIKELY
+ (c_stream->avail_out
+ <= BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* out of space */
+ return(Z_BUF_ERROR);
+ }
+
+ ut_ad(*externs == c_stream->next_out
+ + c_stream->avail_out
+ + 1/* end of modif. log */);
+
+ c_stream->next_in
+ += BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Skip deleted records. */
+ if (UNIV_LIKELY_NULL
+ (page_zip_dir_find_low(
+ storage, deleted,
+ page_offset(rec)))) {
+ continue;
+ }
+
+ (*n_blobs)++;
+ c_stream->avail_out
+ -= BTR_EXTERN_FIELD_REF_SIZE;
+ *externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Copy the BLOB pointer */
+ memcpy(*externs, c_stream->next_in
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* n_blobs, /*!< in: 0; out: number of
+ externally stored columns */
+ ulint trx_id_col, /*!< in: position of the trx_id column */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+ /* BTR_EXTERN_FIELD_REF storage */
+ byte* externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(*n_blobs == 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_n_fields(offsets)
+ == dict_index_get_n_fields(index));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Compress the data bytes. */
+
+ c_stream->next_in = (byte*) rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, store the
+ BTR_EXTERN_FIELD_REF separately. */
+ if (rec_offs_any_extern(offsets)) {
+ ut_ad(dict_index_is_clust(index));
+
+ err = page_zip_compress_clust_ext(
+ LOGFILE
+ c_stream, rec, offsets, trx_id_col,
+ deleted, storage, &externs, n_blobs);
+
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ } else {
+ ulint len;
+ const byte* src;
+
+ /* Store trx_id and roll_ptr in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+ }
+
+ /* Compress the last bytes of the record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec + rec_offs_data_size(offsets) - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ } while (--n_dense);
+
+func_exit:
+ return(err);
+}
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+ buf_block_t* block, /*!< in/out: buffer block */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: compression level */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ z_stream c_stream;
+ int err;
+ byte* fields; /*!< index field information */
+ byte* buf; /*!< compressed payload of the
+ page */
+ byte* buf_end; /* end of buf */
+ ulint n_dense;
+ ulint slot_size; /* number of uncompressed bytes
+ per record */
+ const rec_t** recs; /*!< dense page directory,
+ sorted by address */
+ mem_heap_t* heap;
+ ulint trx_id_col = ULINT_UNDEFINED;
+ ulint n_blobs = 0;
+ byte* storage; /* storage of uncompressed
+ columns */
+ const ulonglong ns = my_interval_timer();
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ FILE* logfile = NULL;
+#endif
+ /* A local copy of srv_cmp_per_index_enabled to avoid reading that
+ variable multiple times in this function since it can be changed at
+ any time. */
+ my_bool cmp_per_index_enabled;
+ cmp_per_index_enabled = srv_cmp_per_index_enabled;
+
+ page_t* page = block->frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_a(page_is_comp(page));
+ ut_a(fil_page_index_page_check(page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ /* Check the data that will be omitted. */
+ ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra));
+ ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+ infimum_data, sizeof infimum_data));
+ ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+ /* info_bits == 0, n_owned <= max */
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data));
+
+ if (page_is_empty(page)) {
+ ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+ == PAGE_NEW_SUPREMUM);
+ }
+
+ const ulint n_fields = page_is_leaf(page)
+ ? dict_index_get_n_fields(index)
+ : dict_index_get_n_unique_in_tree_nonleaf(index);
+ index_id_t ind_id = index->id;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ib::info() << "compress "
+ << static_cast<void*>(page_zip) << " "
+ << static_cast<const void*>(page) << " "
+ << page_is_leaf(page) << " "
+ << n_fields << " " << n_dense;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_compress_log)) {
+ /* Create a log file for every compression attempt. */
+ char logfilename[9];
+ snprintf(logfilename, sizeof logfilename,
+ "%08x", page_zip_compress_log++);
+ logfile = fopen(logfilename, "wb");
+
+ if (logfile) {
+ /* Write the uncompressed page to the log. */
+ if (fwrite(page, 1, srv_page_size, logfile)
+ != srv_page_size) {
+ perror("fwrite");
+ }
+ /* Record the compressed size as zero.
+ This will be overwritten at successful exit. */
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ }
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ page_zip_stat[page_zip->ssize - 1].compressed++;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed++;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+
+ goto err_exit;
+ }
+
+ MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
+ heap = mem_heap_create(page_zip_get_size(page_zip)
+ + n_fields * (2 + sizeof(ulint))
+ + REC_OFFS_HEADER_SIZE
+ + n_dense * ((sizeof *recs)
+ - PAGE_ZIP_DIR_SLOT_SIZE)
+ + srv_page_size * 4
+ + (512 << MAX_MEM_LEVEL));
+
+ recs = static_cast<const rec_t**>(
+ mem_heap_zalloc(heap, n_dense * sizeof *recs));
+
+ fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
+
+ buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+ /* Compress the data payload. */
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, static_cast<int>(level),
+ Z_DEFLATED, static_cast<int>(srv_page_size_shift),
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+
+ c_stream.next_out = buf;
+
+ /* Subtract the space reserved for uncompressed data. */
+ /* Page header and the end marker of the modification log */
+ c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);
+
+ /* Dense page directory and uncompressed columns, if any */
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ trx_id_col = index->db_trx_id();
+
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ } else {
+ /* Signal the absence of trx_id
+ in page_zip_fields_encode() */
+ trx_id_col = 0;
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ } else {
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+ + 6/* sizeof(zlib header and footer) */)) {
+ goto zlib_error;
+ }
+
+ c_stream.avail_out -= uInt(n_dense * slot_size);
+ c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index,
+ trx_id_col, fields));
+ c_stream.next_in = fields;
+
+ if (UNIV_LIKELY(!trx_id_col)) {
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FULL_FLUSH);
+ if (err != Z_OK) {
+ goto zlib_error;
+ }
+
+ ut_ad(!c_stream.avail_in);
+
+ page_zip_dir_encode(page, buf_end, recs);
+
+ c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+ storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ /* Compress the records in heap_no order. */
+ if (UNIV_UNLIKELY(!n_dense)) {
+ } else if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ err = page_zip_compress_node_ptrs(LOGFILE
+ &c_stream, recs, n_dense,
+ index, storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ err = page_zip_compress_sec(LOGFILE
+ &c_stream, recs, n_dense);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ err = page_zip_compress_clust(LOGFILE
+ &c_stream, recs, n_dense,
+ index, &n_blobs, trx_id_col,
+ buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page),
+ storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ }
+
+ /* Finish the compression. */
+ ut_ad(!c_stream.avail_in);
+ /* Compress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list,
+ or the data of the last record from page_zip_compress_sec(). */
+ c_stream.avail_in = static_cast<uInt>(
+ page_header_get_field(page, PAGE_HEAP_TOP)
+ - (c_stream.next_in - page));
+ ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR);
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FINISH);
+
+ if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error:
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+err_exit:
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ if (page_is_leaf(page)) {
+ dict_index_zip_failure(index);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec
+ += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_usec
+ += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+ return false;
+ }
+
+ err = deflateEnd(&c_stream);
+ ut_a(err == Z_OK);
+
+ ut_ad(buf + c_stream.total_out == c_stream.next_out);
+ ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+#if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
+ /* Valgrind believes that zlib does not initialize some bits
+ in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
+ MEM_MAKE_DEFINED(buf, c_stream.total_out);
+#endif /* HAVE_valgrind && !memory_sanitizer */
+
+ /* Zero out the area reserved for the modification log.
+ Space for the end marker of the modification log is not
+ included in avail_out. */
+ memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start =
+#endif /* UNIV_DEBUG */
+ page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
+ page_zip->m_nonempty = FALSE;
+ page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
+ /* Copy those header fields that will not be written
+ in buf_flush_init_for_writing() */
+ memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA);
+ /* Copy the rest of the compressed page */
+ memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+ mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_zip_compress_write_log(block, index, mtr);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ /* Record the compressed size of the block. */
+ byte sz[4];
+ mach_write_to_4(sz, c_stream.total_out);
+ fseek(logfile, srv_page_size, SEEK_SET);
+ if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) {
+ perror("fwrite");
+ }
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_ok++;
+ page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (page_is_leaf(page)) {
+ dict_index_zip_success(index);
+ }
+
+ return true;
+}
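+
+/* Usage sketch (assumptions: the caller holds an exclusive latch on
+block and is inside a mini-transaction):
+
+ if (!page_zip_compress(block, index, page_zip_level, mtr)) {
+ // block->page.zip is left intact; the caller would
+ // typically reorganize or split the page and retry
+ }
+*/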
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+ dict_index_t* index) /*!< in: dummy index to be freed */
+{
+ if (index) {
+ dict_table_t* table = index->table;
+ index->zip_pad.mutex.~mutex();
+ mem_heap_free(index->heap);
+
+ dict_mem_table_free(table);
+ }
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+ const byte* buf, /*!< in: index information */
+ const byte* end, /*!< in: end of buf */
+ ulint* trx_id_col,/*!< in: NULL for non-leaf pages;
+ for leaf pages, pointer to where to store
+ the position of the trx_id column */
+ bool is_spatial)/*!< in: whether the index is spatial */
+{
+ const byte* b;
+ ulint n;
+ ulint i;
+ ulint val;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ /* Determine the number of fields. */
+ for (b = buf, n = 0; b < end; n++) {
+ if (*b++ & 0x80) {
+ b++; /* skip the second byte */
+ }
+ }
+
+ n--; /* n_nullable or trx_id */
+
+ if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+ page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+ (ulong) n));
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(b > end)) {
+
+ page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+ (const void*) b, (const void*) end));
+ return(NULL);
+ }
+
+ table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
+ DICT_TF_COMPACT, 0);
+ index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
+ index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ /* Initialize the fields. */
+ for (b = buf, i = 0; i < n; i++) {
+ ulint mtype;
+ ulint len;
+
+ val = *b++;
+
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ /* fixed length > 62 bytes */
+ val = (val & 0x7f) << 8 | *b++;
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ } else if (UNIV_UNLIKELY(val >= 126)) {
+ /* variable length with max > 255 bytes */
+ len = 0x7fff;
+ mtype = DATA_BINARY;
+ } else if (val <= 1) {
+ /* variable length with max <= 255 bytes */
+ len = 0;
+ mtype = DATA_BINARY;
+ } else {
+ /* fixed length <= 62 bytes */
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ }
+
+ dict_mem_table_add_col(table, NULL, NULL, mtype,
+ val & 1 ? DATA_NOT_NULL : 0, len);
+ dict_index_add_col(index, table,
+ dict_table_get_nth_col(table, i), 0);
+ }
+
+ val = *b++;
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ val = (val & 0x7f) << 8 | *b++;
+ }
+
+ /* Decode the position of the trx_id column. */
+ if (trx_id_col) {
+ if (!val) {
+ val = ULINT_UNDEFINED;
+ } else if (UNIV_UNLIKELY(val >= n)) {
+fail:
+ page_zip_fields_free(index);
+ return NULL;
+ } else {
+ index->type = DICT_CLUSTERED;
+ }
+
+ *trx_id_col = val;
+ } else {
+ /* Decode the number of nullable fields. */
+ if (UNIV_UNLIKELY(index->n_nullable > val)) {
+ goto fail;
+ } else {
+ index->n_nullable = static_cast<unsigned>(val)
+ & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+
+ /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
+ index->n_core_fields = index->n_fields;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ ut_ad(b == end);
+
+ if (is_spatial) {
+ index->type |= DICT_SPATIAL;
+ }
+
+ return(index);
+}
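+
+/* Example (a sketch, decoding the hypothetical sequence shown after
+page_zip_fields_encode()): the bytes 0x09 0x00 0x0d 0x02 with a
+non-NULL trx_id_col yield a three-column ZIP_DUMMY index consisting of
+a 4-byte DATA_FIXBINARY NOT NULL column, a nullable DATA_BINARY column,
+and a 6-byte DATA_FIXBINARY NOT NULL column, with *trx_id_col = 2 and
+the index type set to DICT_CLUSTERED. */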
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return TRUE on success, FALSE on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ibool
+page_zip_dir_decode(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ page_t* page, /*!< in: compact page with valid header;
+ out: trailer and sparse page directory
+ filled in */
+ rec_t** recs, /*!< out: dense page directory sorted by
+ ascending address (and heap_no) */
+ ulint n_dense)/*!< in: number of user records, and
+ size of recs[] */
+{
+ ulint i;
+ ulint n_recs;
+ byte* slot;
+
+ n_recs = page_get_n_recs(page);
+
+ if (UNIV_UNLIKELY(n_recs > n_dense)) {
+ page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+ (ulong) n_recs, (ulong) n_dense));
+ return(FALSE);
+ }
+
+ /* Traverse the list of stored records in the sorting order,
+ starting from the first user record. */
+
+ slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+ UNIV_PREFETCH_RW(slot);
+
+ /* Zero out the page trailer. */
+ memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+ mach_write_to_2(slot, PAGE_NEW_INFIMUM);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+
+ /* Initialize the sparse directory and copy the dense directory. */
+ for (i = 0; i < n_recs; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+ mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+ }
+
+ if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+ < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_recs,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ {
+ const page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ page, page_dir_get_n_slots(page) - 1U);
+
+ if (UNIV_UNLIKELY(slot != last_slot)) {
+ page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+ (const void*) slot,
+ (const void*) last_slot));
+ return(FALSE);
+ }
+ }
+
+ /* Copy the rest of the dense directory. */
+ for (; i < n_dense; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+ page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_dense,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + offs;
+ }
+
+ std::sort(recs, recs + n_dense);
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ page_t* page, /*!< in/out: uncompressed page */
+ ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+ ulint n;
+ ulint i;
+ ulint n_owned = 1;
+ ulint offs;
+ rec_t* rec;
+
+ n = page_get_n_recs(page);
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (i = 0; i < n; i++) {
+ offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
+ info_bits |= REC_INFO_DELETED_FLAG;
+ }
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+ info_bits |= n_owned;
+ n_owned = 1;
+ } else {
+ n_owned++;
+ }
+ offs &= PAGE_ZIP_DIR_SLOT_MASK;
+ if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+ + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_set_extra_bytes 1:"
+ " %u %u %lx\n",
+ (unsigned) i, (unsigned) n,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec_set_next_offs_new(rec, offs);
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+ info_bits = 0;
+ }
+
+ /* Set the next pointer of the last user record. */
+ rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+ /* Set n_owned of the supremum record. */
+ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+
+ if (i >= n) {
+ if (UNIV_LIKELY(i == n)) {
+ return(TRUE);
+ }
+
+ page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+ (unsigned) i, (unsigned) n));
+ return(FALSE);
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+
+ /* Set the extra bytes of deleted records on the free list. */
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, ulint(dst - next_out));
+ data += ulint(dst - next_out);
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += ulint(dst - next_out)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = ulint(rec_get_end(rec, offsets) - next_out);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
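+
+/* Sketch of one modification log entry, as consumed by
+page_zip_apply_log() below: a first byte of 0 terminates the log.
+Otherwise val is one byte, or two bytes when the 0x80 bit is set
+(val = (byte0 & 0x7f) << 8 | byte1). val >> 1 is the 1-based dense
+directory number of the record (heap_no - 1), and the low bit, when
+set, means "clear the data bytes of an existing record". For example,
+the byte 0x05 clears the record with dense number 2. */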
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+@return pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+ const byte* data, /*!< in: modification log */
+ ulint size, /*!< in: maximum length of the log, in bytes */
+ rec_t** recs, /*!< in: dense page directory,
+ sorted by address (indexed by
+ heap_no - PAGE_HEAP_NO_USER_LOW) */
+ ulint n_dense,/*!< in: size of recs[] */
+ ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */
+ ulint trx_id_col,/*!< in: column number of trx_id in the index,
+ or ULINT_UNDEFINED if none */
+ ulint heap_status,
+ /*!< in: heap_no and status bits for
+ the next record to uncompress */
+ dict_index_t* index, /*!< in: index of the page */
+ rec_offs* offsets)/*!< in/out: work area for
+ rec_get_offsets_reverse() */
+{
+ const byte* const end = data + size;
+
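+ /* Each log entry starts with a 1- or 2-byte header: bit 0
+ set means "clear the data bytes of the record", and the
+ remaining bits encode heap_no - 1 (the 2-byte form, flagged
+ by 0x80 in the first byte, is used when heap_no - 1 >= 64).
+ A zero header byte terminates the log. */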
+ for (;;) {
+ ulint val;
+ rec_t* rec;
+ ulint len;
+ ulint hs;
+
+ val = *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ return(data - 1);
+ }
+ if (val & 0x80) {
+ val = (val & 0x7f) << 8 | *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " invalid val %x%x\n",
+ data[-2], data[-1]));
+ return(NULL);
+ }
+ }
+ if (UNIV_UNLIKELY(data >= end)) {
+ page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+ (const void*) data,
+ (const void*) end));
+ return(NULL);
+ }
+ if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+ page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+ (ulong) val, (ulong) n_dense));
+ return(NULL);
+ }
+
+ /* Determine the heap number and status bits of the record. */
+ rec = recs[(val >> 1) - 1];
+
+ hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+ hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+ /* This may either be an old record that is being
+ overwritten (updated in place, or allocated from
+ the free list), or a new record, with the next
+ available heap_no. */
+ if (UNIV_UNLIKELY(hs > heap_status)) {
+ page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+ (ulong) hs, (ulong) heap_status));
+ return(NULL);
+ } else if (hs == heap_status) {
+ /* A new record was allocated from the heap. */
+ if (UNIV_UNLIKELY(val & 1)) {
+ /* Only existing records may be cleared. */
+ page_zip_fail(("page_zip_apply_log:"
+ " attempting to create"
+ " deleted rec %lu\n",
+ (ulong) hs));
+ return(NULL);
+ }
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+ if (val & 1) {
+ /* Clear the data bytes of the record. */
+ mem_heap_t* heap = NULL;
+ rec_offs* offs;
+ offs = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ memset(rec, 0, rec_offs_data_size(offs));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ continue;
+ }
+
+ compile_time_assert(REC_STATUS_NODE_PTR == TRUE);
+ rec_get_offsets_reverse(data, index,
+ hs & REC_STATUS_NODE_PTR,
+ offsets);
+ /* Silence a debug assertion in rec_offs_make_valid().
+ This will be overwritten in page_zip_set_extra_bytes(),
+ called by page_zip_decompress_low(). */
+ ut_d(rec[-REC_NEW_INFO_BITS] = 0);
+ rec_offs_make_valid(rec, index, n_core != 0, offsets);
+
+ /* Copy the extra bytes (backwards). */
+ {
+ byte* start = rec_get_start(rec, offsets);
+ byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+ while (b != start) {
+ *--b = *data++;
+ }
+ }
+
+ /* Copy the data bytes. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ /* Non-leaf nodes should not contain any
+ externally stored columns. */
+ if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " %lu&REC_STATUS_NODE_PTR\n",
+ (ulong) hs));
+ return(NULL);
+ }
+
+ data = page_zip_apply_log_ext(
+ rec, offsets, trx_id_col, data, end);
+
+ if (UNIV_UNLIKELY(!data)) {
+ return(NULL);
+ }
+ } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ len = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ /* Copy the data bytes, except node_ptr. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " node_ptr %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(rec, data, len);
+ data += len;
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ len = rec_offs_data_size(offsets);
+
+ /* Copy all data bytes of
+ a record in a secondary index. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " sec %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(rec, data, len);
+ data += len;
+ } else {
+ /* Skip DB_TRX_ID and DB_ROLL_PTR. */
+ ulint l = rec_get_nth_field_offs(offsets,
+ trx_id_col, &len);
+ byte* b;
+
+ if (UNIV_UNLIKELY(data + l >= end)
+ || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN))) {
+ page_zip_fail(("page_zip_apply_log:"
+ " trx_id %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) l,
+ (const void*) end));
+ return(NULL);
+ }
+
+ /* Copy any preceding data bytes. */
+ memcpy(rec, data, l);
+ data += l;
+
+ /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ len = ulint(rec_get_end(rec, offsets) - b);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " clust %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(b, data, len);
+ data += len;
+ }
+ }
+}
+
+/**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream.
+@return TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ ulint& heap_status) /*!< in/out: heap_no and status bits */
+{
+ if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+ /* n_dense has grown since the page was last compressed. */
+ return(FALSE);
+ }
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ ulint heap_status = REC_STATUS_NODE_PTR
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+ const byte* storage;
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(
+ n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
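+ /* That space holds, for each of the n_dense records, a dense
+ directory slot and an uncompressed copy of the node pointer;
+ it lives in the page trailer, outside the compressed stream. */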
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Decompress the data bytes, except node_ptr. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ /* Clear the node pointer in case the record
+ will be deleted and the space will be reallocated
+ to a smaller record. */
+ memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+ d_stream->next_out += REC_NODE_PTR_SIZE;
+
+ ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
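+ /* The + 1 re-includes the byte reserved for the
+ terminating zero of the modification log, which was
+ excluded when avail_in was initialized in
+ page_zip_decompress_low(). */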
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense, 0,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index))
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " %lu + %lu >= %lu, %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, dict_index_is_clust(index)),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip),
+ (ulong) dict_index_is_clust(index)));
+ return(FALSE);
+ }
+
+ /* Restore the uncompressed columns in heap_no order. */
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ storage -= REC_NODE_PTR_SIZE;
+
+ memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+ storage, REC_NODE_PTR_SIZE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets) /*!< in/out: temporary offsets */
+{
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+
+ ut_a(!dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(
+ n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ /* Decompress everything up to this record. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ if (UNIV_LIKELY(d_stream->avail_out)) {
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+ }
+
+ /* Decompress the data of the last record and any trailing garbage,
+ in case the last record was allocated from an originally longer space
+ on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col) /*!< in: position of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(n_dense)
+ * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
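+ /* Each clustered-index leaf record reserves trailer space
+ for a dense directory slot and an uncompressed copy of
+ DB_TRX_ID,DB_ROLL_PTR; PAGE_ZIP_CLUST_LEAF_SLOT_SIZE is
+ assumed to cover both. */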
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_clust_ext(
+ d_stream, rec, offsets, trx_id_col))) {
+
+ goto zlib_error;
+ }
+ } else {
+ /* Skip trx_id and roll_ptr */
+ ulint len;
+ byte* dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " len = %lu\n", (ulong) len));
+ goto zlib_error;
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ }
+
+ /* Decompress the last bytes of the record. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec_get_end(rec, offsets) - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ trx_id_col, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, TRUE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
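+ /* The trailer is laid out downwards from the end of
+ page_zip->data: the dense directory slots at the highest
+ addresses, below them the DB_TRX_ID,DB_ROLL_PTR columns
+ (read downwards from "storage"), and below those the
+ BLOB pointers (read downwards from "externs"). */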
+
+ /* Restore the uncompressed columns in heap_no order. */
+
+ for (slot = 0; slot < n_dense; slot++) {
+ ulint i;
+ ulint len;
+ byte* dst;
+ rec_t* rec = recs[slot];
+ bool exists = !page_zip_dir_find_free(
+ page_zip, page_offset(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ memcpy(dst, storage,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Check if there are any externally stored
+ columns in this record. For each externally
+ stored column, restore or clear the
+ BTR_EXTERN_FIELD_REF. */
+ if (!rec_offs_any_extern(offsets)) {
+ continue;
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " %lu < 20\n",
+ (ulong) len));
+ return(FALSE);
+ }
+
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_LIKELY(exists)) {
+ /* Existing record:
+ restore the BLOB pointer */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY
+ (externs < page_zip->data
+ + page_zip->m_end)) {
+ page_zip_fail(("page_zip_"
+ "decompress_clust:"
+ " %p < %p + %lu\n",
+ (const void*) externs,
+ (const void*)
+ page_zip->data,
+ (ulong)
+ page_zip->m_end));
+ return(FALSE);
+ }
+
+ memcpy(dst, externs,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ page_zip->n_blobs++;
+ } else {
+ /* Deleted record:
+ clear the BLOB pointer */
+ memset(dst, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_low(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ z_stream d_stream;
+ dict_index_t* index = NULL;
+ rec_t** recs; /*!< dense page directory, sorted by address */
+ ulint n_dense;/* number of user records on the page */
+ ulint trx_id_col = ULINT_UNDEFINED;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+ (ulong) n_dense,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size);
+
+ recs = static_cast<rec_t**>(
+ mem_heap_alloc(heap, n_dense * sizeof *recs));
+
+ if (all) {
+ /* Copy the page header. */
+ memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
+ } else {
+ /* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(FIL_PAGE_TYPE + page,
+ FIL_PAGE_TYPE + page_zip->data,
+ PAGE_HEADER - FIL_PAGE_TYPE));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+ PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+ PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ /* Copy the mutable parts of the page header. */
+ memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
+ memcpy_aligned<2>(PAGE_HEADER + page,
+ PAGE_HEADER + page_zip->data,
+ PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the page headers match after copying. */
+ ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ /* Clear the uncompressed page, except the header. */
+ memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+ MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA);
+
+ /* Copy the page directory. */
+ if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+ n_dense))) {
+zlib_error:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ /* Copy the infimum and supremum records. */
+ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra);
+ if (page_is_empty(page)) {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ page_zip_dir_get(page_zip, 0)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+ memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+ memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
+ + page, supremum_extra_data,
+ sizeof supremum_extra_data);
+
+ page_zip_set_alloc(&d_stream, heap);
+
+ d_stream.next_in = page_zip->data + PAGE_DATA;
+ /* Subtract the space reserved for
+ the page header and the end marker of the modification log. */
+ d_stream.avail_in = static_cast<uInt>(
+ page_zip_get_size(page_zip) - (PAGE_DATA + 1));
+ d_stream.next_out = page + PAGE_ZIP_START;
+ d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);
+
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
+ != Z_OK)) {
+ ut_error;
+ }
+
+ /* Decode the zlib header and the index information. */
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ index = page_zip_fields_decode(
+ page + PAGE_ZIP_START, d_stream.next_out,
+ page_is_leaf(page) ? &trx_id_col : NULL,
+ fil_page_get_type(page) == FIL_PAGE_RTREE);
+
+ if (UNIV_UNLIKELY(!index)) {
+
+ goto zlib_error;
+ }
+
+ /* Decompress the user records. */
+ page_zip->n_blobs = 0;
+ d_stream.next_out = page + PAGE_ZIP_START;
+
+ {
+ /* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, n * sizeof(ulint)));
+
+ rec_offs_set_n_alloc(offsets, n);
+ }
+
+ /* Decompress the records in heap_no order. */
+ if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ ulint info_bits;
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+ recs, n_dense, index,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG;
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+ info_bits))) {
+ goto err_exit;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+ recs, n_dense,
+ index, offsets))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+err_exit:
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+ &d_stream, recs,
+ n_dense, index,
+ trx_id_col,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+ goto err_exit;
+ }
+ }
+
+ ut_a(page_is_comp(page));
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ const ulonglong ns = my_interval_timer();
+
+ if (!page_zip_decompress_low(page_zip, page, all)) {
+ return(FALSE);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].decompressed++;
+ page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (srv_cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index_id].decompressed++;
+ page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ /* Update the stat counter for LRU policy. */
+ buf_LRU_stat_inc_unzip();
+
+ MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Dump a block of memory on the standard error stream. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+ const char* name, /*!< in: name of the data structure */
+ const void* buf, /*!< in: data */
+ ulint size) /*!< in: length of the data, in bytes */
+{
+ const byte* s = static_cast<const byte*>(buf);
+ ulint addr;
+ const ulint width = 32; /* bytes per line */
+
+ fprintf(stderr, "%s:\n", name);
+
+ for (addr = 0; addr < size; addr += width) {
+ ulint i;
+
+ fprintf(stderr, "%04lx ", (ulong) addr);
+
+ i = ut_min(width, size - addr);
+
+ while (i--) {
+ fprintf(stderr, "%02x", *s++);
+ }
+
+ putc('\n', stderr);
+ }
+}
+
+/** Dump a block of memory on the standard error stream.
+@param buf in: data
+@param size in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
+
+/** Flag: make page_zip_validate() compare page headers only */
+bool page_zip_validate_header_only;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+{
+ page_zip_des_t temp_page_zip;
+ ibool valid;
+
+ if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV)
+ || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+ || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_ROOT_AUTO_INC)
+ /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch
+ on the clustered index root page (page number 3 in .ibd files).
+ That allows concurrent readers (holding buf_block_t::lock S-latch).
+ Because we do not know what type of a latch our caller is holding,
+ we will ignore the field on clustered index root pages in order
+ to avoid false positives. */
+ || (page_get_page_no(page) != 3/* clustered index root page */
+ && memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC],
+ &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8))
+ || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) {
+ page_zip_fail(("page_zip_validate: page header\n"));
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+
+ if (page_zip_validate_header_only) {
+ return(TRUE);
+ }
+
+ /* page_zip_decompress() expects the uncompressed page to be
+ srv_page_size aligned. */
+ page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ temp_page_zip = *page_zip;
+ valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
+ if (!valid) {
+ fputs("page_zip_validate(): failed to decompress\n", stderr);
+ goto func_exit;
+ }
+ if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+ page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+ page_zip->n_blobs, temp_page_zip.n_blobs));
+ valid = FALSE;
+ }
+#ifdef UNIV_DEBUG
+ if (page_zip->m_start != temp_page_zip.m_start) {
+ page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+ page_zip->m_start, temp_page_zip.m_start));
+ valid = FALSE;
+ }
+#endif /* UNIV_DEBUG */
+ if (page_zip->m_end != temp_page_zip.m_end) {
+ page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+ page_zip->m_end, temp_page_zip.m_end));
+ valid = FALSE;
+ }
+ if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+ page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+ page_zip->m_nonempty,
+ temp_page_zip.m_nonempty));
+ valid = FALSE;
+ }
+ if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+ /* In crash recovery, the "minimum record" flag may be
+ set incorrectly until the mini-transaction is
+ committed. Let us tolerate that difference when we
+ are performing a sloppy validation. */
+
+ rec_offs* offsets;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ const rec_t* trec;
+ byte info_bits_diff;
+ ulint offset
+ = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
+ ut_a(offset >= PAGE_NEW_SUPREMUM);
+ offset -= 5/*REC_NEW_INFO_BITS*/;
+
+ info_bits_diff = page[offset] ^ temp_page[offset];
+
+ if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+ temp_page[offset] = page[offset];
+
+ if (!memcmp(page + PAGE_HEADER,
+ temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER
+ - FIL_PAGE_DATA_END)) {
+
+ /* Only the minimum record flag
+ differed. Let us ignore it. */
+ page_zip_fail(("page_zip_validate:"
+ " min_rec_flag"
+ " (%s" ULINTPF "," ULINTPF
+ ",0x%02x)\n",
+ sloppy ? "ignored, " : "",
+ page_get_space_id(page),
+ page_get_page_no(page),
+ page[offset]));
+ /* We don't check for spatial index, since
+ the "minimum record" could be deleted when
+ doing rtr_update_mbr_field.
+ GIS_FIXME: need to validate why
+ rtr_update_mbr_field() could affect this */
+ if (index && dict_index_is_spatial(index)) {
+ valid = true;
+ } else {
+ valid = sloppy;
+ }
+ goto func_exit;
+ }
+ }
+
+ /* Compare the pointers in the PAGE_FREE list. */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+ trec = page_header_get_ptr(temp_page, PAGE_FREE);
+
+ while (rec || trec) {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " PAGE_FREE list: %u!=%u\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ }
+
+ /* Compare the records. */
+ heap = NULL;
+ offsets = NULL;
+ rec = page_rec_get_next_low(
+ page + PAGE_NEW_INFIMUM, TRUE);
+ trec = page_rec_get_next_low(
+ temp_page + PAGE_NEW_INFIMUM, TRUE);
+ const ulint n_core = page_is_leaf(page) ? index->n_fields : 0;
+
+ do {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " record list: 0x%02x!=0x%02x\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ break;
+ }
+
+ if (index) {
+ /* Compare the data. */
+ offsets = rec_get_offsets(
+ rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (memcmp(rec - rec_offs_extra_size(offsets),
+ trec - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets))) {
+ page_zip_fail(
+ ("page_zip_validate:"
+ " record content: 0x%02x",
+ (unsigned) page_offset(rec)));
+ valid = FALSE;
+ break;
+ }
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ } while (rec || trec);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ }
+
+func_exit:
+ if (!valid) {
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ page_zip_hexdump(temp_page, srv_page_size);
+ }
+ aligned_free(temp_page);
+ return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+{
+ return(page_zip_validate_low(page_zip, page, index,
+ recv_recovery_is_on()));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+@return TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const byte* page) /*!< in: uncompressed page */
+{
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns. The data must already have been written to the uncompressed page.
+@return end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ const dict_index_t*index, /*!< in: record descriptor */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ulint create, /*!< in: nonzero=insert, zero=update */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ ulint heap_no, /*!< in: heap number of rec */
+ byte* storage, /*!< in: end of dense page directory */
+ byte* data, /*!< in: end of modification log */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* start = rec;
+ ulint i;
+ ulint len;
+ byte* externs = storage;
+ ulint n_ext = rec_offs_n_extern(offsets);
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+ /* Note that this will not take into account
+ the BLOB columns of rec if create==TRUE. */
+ ut_ad(data + rec_offs_data_size(offsets)
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ - n_ext * FIELD_REF_SIZE
+ < externs - FIELD_REF_SIZE * page_zip->n_blobs);
+
+ if (n_ext) {
+ ulint blob_no = page_zip_get_n_prev_extern(
+ page_zip, rec, index);
+ byte* ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
+ ut_ad(blob_no <= page_zip->n_blobs);
+ externs -= blob_no * FIELD_REF_SIZE;
+
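+ /* On insert, shift the BLOB pointers of the records
+ that follow rec down by n_ext slots, opening a gap in
+ the externs area for this record's pointers; an update
+ reuses the existing slots. */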
+ if (create) {
+ page_zip->n_blobs = (page_zip->n_blobs + n_ext)
+ & ((1U << 12) - 1);
+ ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
+ if (ulint len = ulint(externs - ext_end)) {
+ byte* ext_start = ext_end
+ - n_ext * FIELD_REF_SIZE;
+ memmove(ext_start, ext_end, len);
+ mtr->memmove(*block,
+ ext_start - page_zip->data,
+ ext_end - page_zip->data, len);
+ }
+ }
+
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i));
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i + 1));
+ /* Locate trx_id and roll_ptr. */
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len = DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+ i++; /* skip also roll_ptr */
+ mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(len >= FIELD_REF_SIZE);
+ src += len - FIELD_REF_SIZE;
+
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + FIELD_REF_SIZE;
+
+ /* Store the BLOB pointer. */
+ externs -= FIELD_REF_SIZE;
+ ut_ad(data < externs);
+ memcpy(externs, src, FIELD_REF_SIZE);
+ mtr->zmemcpy(*block, externs - page_zip->data,
+ FIELD_REF_SIZE);
+ }
+ }
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets) - ulint(start - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, start, len);
+ data += len;
+
+ return(data);
+}
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] rec record in the uncompressed page
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] create nonzero=insert, zero=update
+@param[in,out] mtr mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ ulint create, mtr_t *mtr)
+{
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ byte* data;
+ byte* storage;
+ ulint heap_no;
+ byte* slot;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+
+ ut_ad(page_zip_header_cmp(page_zip, page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ /* Copy the delete mark. */
+ if (rec_get_deleted_flag(rec, TRUE)) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record.
+ On non-leaf pages, the delete-mark flag is garbage. */
+ ut_ad(!index->is_primary() || !page_is_leaf(page)
+ || row_get_rec_trx_id(rec, index, offsets));
+ *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+ } else {
+ *slot &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ }
+
+ ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+ ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size
+ - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+ * page_dir_get_n_slots(page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+ ut_ad(heap_no < page_dir_get_n_heap(page));
+
+ /* Append to the modification log. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+
+ /* Identify the record by writing its heap number - 1.
+ 0 is reserved to indicate the end of the modification log. */
+
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1);
+ ut_ad(!*data);
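+ /* The low bit of the header is left clear: the record data
+ follows. page_zip_apply_log() takes a set low bit to mean
+ "clear the record" instead. */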
+
+ {
+ const byte* start = rec - rec_offs_extra_size(offsets);
+ const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+
+ /* Write the extra bytes backwards, so that
+ rec_offs_extra_size() can be easily computed in
+ page_zip_apply_log() by invoking
+ rec_get_offsets_reverse(). */
+
+ while (b != start) {
+ *data++ = *--b;
+ ut_ad(!*data);
+ }
+ }
+
+ /* Write the data bytes. Store the uncompressed bytes separately. */
+ storage = page_zip_dir_start(page_zip);
+
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ /* Store separately trx_id, roll_ptr and
+ the BTR_EXTERN_FIELD_REF of each BLOB column. */
+ if (rec_offs_any_extern(offsets)) {
+ data = page_zip_write_rec_ext(
+ block,
+ rec, index, offsets, create,
+ index->db_trx_id(), heap_no,
+ storage, data, mtr);
+ } else {
+ /* Locate trx_id and roll_ptr. */
+ ulint len;
+ const byte* src
+ = rec_get_nth_field(rec, offsets,
+ index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ index->db_roll_ptr(), &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - rec);
+ memcpy(data, rec, ulint(src - rec));
+ data += src - rec;
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len
+ = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+
+ src += sys_len;
+ mtr->zmemcpy(*block, sys - page_zip->data,
+ sys_len);
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets)
+ - ulint(src - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, src, len);
+ data += len;
+ }
+ } else {
+ /* Leaf page of a secondary index:
+ no externally stored columns */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Log the entire record. */
+ ulint len = rec_offs_data_size(offsets);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+ }
+ } else {
+ /* This is a node pointer page. */
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Copy the data bytes, except node_ptr. */
+ ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+ ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+
+ /* Copy the node pointer to the uncompressed area. */
+ byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
+ rec + len, REC_NODE_PTR_SIZE);
+ }
+
+ ut_a(!*data);
+ ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+ mtr->zmemcpy(*block, page_zip->m_end,
+ data - page_zip->data - page_zip->m_end);
+ page_zip->m_end = uint16_t(data - page_zip->data);
+ page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec), index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* field;
+ byte* externs;
+ const page_t* const page = block->frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ ulint blob_no;
+ ulint len;
+
+ ut_ad(page_align(rec) == page);
+ ut_ad(index != NULL);
+ ut_ad(offsets != NULL);
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(rec_offs_nth_extern(offsets, n));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+ + rec_get_n_extern_new(rec, index, n);
+ ut_a(blob_no < page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+
+ field = rec_get_nth_field(rec, offsets, n, &len);
+
+ externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+ field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ byte* field;
+ byte* storage;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->frame);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(page_rec_is_comp(rec));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(!page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, size);
+
+ storage = page_zip_dir_start(page_zip)
+ - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+ field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ compile_time_assert(REC_NODE_PTR_SIZE == 4);
+ mach_write_to_4(field, ptr);
+ mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
+}
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in,out] rec record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx_id_col field number of DB_TRX_ID (number of PK fields)
+@param[in] trx_id DB_TRX_ID value (transaction identifier)
+@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer)
+@param[in,out] mtr mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+ buf_block_t* block,
+ byte* rec,
+ const rec_offs* offsets,
+ ulint trx_id_col,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ mtr_t* mtr)
+{
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->frame);
+ ut_ad(page_align(rec) == page);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ const ulint heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ ulint len;
+ byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(field + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ mach_write_to_6(field, trx_id);
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+ len = 0;
+ if (heap_no > PAGE_HEAP_NO_USER_LOW) {
+ byte* prev = storage + sys_len;
+ for (; len < sys_len && prev[len] == field[len]; len++);
+ if (len > 4) {
+ /* We save space by replacing a single record
+
+ WRITE,offset(storage),byte[13]
+
+ with up to two records:
+
+ MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
+ WRITE|0x80,0,byte[13-len]
+
+ The single WRITE record would be x+13 bytes long (x>2).
+ The MEMMOVE record would be x+1+1 = x+2 bytes, and
+ the second WRITE would be 1+1+13-len = 15-len bytes.
+
+ The total size is: x+13 versus x+2+15-len = x+17-len.
+ To save space, we must have len>4. */
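+		/* For example, with x=3: at len=5 a single WRITE would
+		occupy x+13=16 bytes, while MEMMOVE (x+2=5 bytes) plus
+		the shortened WRITE (15-len=10 bytes) occupy 15 bytes in
+		total; at len=4 both encodings would take 16 bytes, hence
+		the strict len>4 test that guards this branch. */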
+ memcpy(storage, prev, len);
+ mtr->memmove(*block, ulint(storage - page_zip->data),
+ ulint(storage - page_zip->data) + sys_len,
+ len);
+ storage += len;
+ field += len;
+ if (UNIV_LIKELY(len < sys_len)) {
+ goto write;
+ }
+ } else {
+ len = 0;
+ goto write;
+ }
+ } else {
+write:
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
+ sys_len - len);
+ }
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage - len, field - len, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page.
+Do not clear the data payload, as that would grow the modification log. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ const dict_index_t* index, /*!< in: index of rec */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint heap_no;
+ byte* storage;
+ byte* field;
+ ulint len;
+
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ /* page_zip_validate() would fail here if a record
+ containing externally stored columns is being deleted. */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+ ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+ ut_ad(page_zip_header_cmp(page_zip, block->frame));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ if (!page_is_leaf(block->frame)) {
+ /* Clear node_ptr. On the compressed page,
+ there is an array of node_ptr immediately before the
+ dense page directory, at the very end of the page. */
+ storage = page_zip_dir_start(page_zip);
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) ==
+ rec_offs_n_fields(offsets) - 1);
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1,
+ &len);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+ ut_ad(!rec_offs_any_extern(offsets));
+ memset(field, 0, REC_NODE_PTR_SIZE);
+ storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
+ len = REC_NODE_PTR_SIZE;
+clear_page_zip:
+ memset(storage, 0, len);
+ mtr->memset(*block, storage - page_zip->data, len, 0);
+ } else if (index->is_clust()) {
+ /* Clear trx_id and roll_ptr. On the compressed page,
+ there is an array of these fields immediately before the
+ dense page directory, at the very end of the page. */
+ const ulint trx_id_pos
+ = dict_col_get_clust_pos(
+ dict_table_get_sys_col(
+ index->table, DATA_TRX_ID), index);
+ field = rec_get_nth_field(rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ /* Clear all BLOB pointers in order to make
+ page_zip_validate() pass. */
+ if (rec_offs_nth_extern(offsets, i)) {
+ field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ ut_ad(len
+ == BTR_EXTERN_FIELD_REF_SIZE);
+ memset(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ storage = page_zip_dir_start(page_zip)
+ - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ goto clear_page_zip;
+ } else {
+ ut_ad(!rec_offs_any_extern(offsets));
+ }
+}
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->frame);
+ byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
+ byte b= *slot;
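+  /* Both flags reside in the most significant byte of the 2-byte dense
+  directory slot (PAGE_ZIP_DIR_SLOT_DEL == 0x8000), so updating the
+  first byte of the slot suffices. */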
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+ byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
+}
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
+ byte* rec, /*!< in: record to insert */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(cursor->rec) == cursor->block->frame);
+ ut_ad(page_align(rec) == cursor->block->frame);
+ page_zip_des_t *const page_zip= &cursor->block->page.zip;
+
+ ulint n_dense;
+ byte* slot_rec;
+ byte* slot_free;
+
+ ut_ad(cursor->rec != rec);
+ ut_ad(page_rec_get_next_const(cursor->rec) == rec);
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ if (page_rec_is_infimum(cursor->rec)) {
+ /* Use the first slot. */
+ slot_rec = page_zip->data + page_zip_get_size(page_zip);
+ } else {
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+ byte* start = end - page_zip_dir_user_size(page_zip);
+
+ if (UNIV_LIKELY(!free_rec)) {
+ /* PAGE_N_RECS was already incremented
+ in page_cur_insert_rec_zip(), but the
+ dense directory slot at that position
+ contains garbage. Skip it. */
+ start += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ slot_rec = page_zip_dir_find_low(start, end,
+ page_offset(cursor->rec));
+ ut_a(slot_rec);
+ }
+
+ /* Read the old n_dense (n_heap may have been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1U);
+
+ if (UNIV_UNLIKELY(free_rec)) {
+ /* The record was allocated from the free list.
+ Shift the dense directory only up to that slot.
+ Note that in this case, n_dense is actually
+ off by one, because page_cur_insert_rec_zip()
+ did not increment n_heap. */
+ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ + PAGE_HEAP_NO_USER_LOW);
+ ut_ad(page_offset(rec) >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, free_rec);
+ ut_ad(slot_free);
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ } else {
+ /* The record was allocated from the heap.
+ Shift the entire dense directory. */
+ ut_ad(rec_get_heap_no_new(rec) == n_dense
+ + PAGE_HEAP_NO_USER_LOW);
+
+ /* Shift to the end of the dense page directory. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ }
+
+ if (const ulint slot_len = ulint(slot_rec - slot_free)) {
+ /* Shift the dense directory to allocate place for rec. */
+ memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free, slot_len);
+ mtr->memmove(*cursor->block, (slot_free - page_zip->data)
+ - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the inserted record.
+ The "owned" and "deleted" flags must be zero. */
+ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
+ mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
+ - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] free previous start of the free list
+@param[in,out] mtr mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ const byte *free, mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
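+  /* In ROW_FORMAT=COMPRESSED, as in COMPACT, REC_NEXT holds a relative
+  offset; link rec in front of the old head of the free list. */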
+ mach_write_to_2(rec - REC_NEXT,
+ free ? static_cast<uint16_t>(free - rec) : 0);
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_free, page_offset(rec));
+ byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
+ mach_read_from_2(garbage));
+ compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
+ memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
+ byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot_rec);
+ uint16_t n_recs= page_get_n_recs(block->frame);
+ ut_ad(n_recs);
+ ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
+  /* Decrementing PAGE_N_RECS could not be done before the
+  page_zip_dir_find() call above, because that lookup only searches
+  the dense directory slots of the current PAGE_N_RECS user records. */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->frame);
+ mtr->write<2>(*block, page_n_recs, n_recs - 1U);
+ memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
+ 2);
+
+ byte *slot_free;
+
+ if (UNIV_UNLIKELY(!free))
+ /* Make the last slot the start of the free list. */
+ slot_free= page_zip->data + page_zip_get_size(page_zip) -
+ PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
+ PAGE_HEAP_NO_USER_LOW);
+ else
+ {
+ slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
+ ut_a(slot_free < slot_rec);
+ /* Grow the free list by one slot by moving the start. */
+ slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
+ if (slot_len)
+ {
+ memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+ slot_len);
+ mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the deleted record.
+ The "owned" and "deleted" flags will be cleared. */
+ mach_write_to_2(slot_free, page_offset(rec));
+ mtr->zmemcpy(*block, slot_free - page_zip->data, 2);
+
+ if (const ulint n_ext= rec_offs_n_extern(offsets))
+ {
+ ut_ad(index->is_primary());
+ ut_ad(page_is_leaf(block->frame));
+
+ /* Shift and zero fill the array of BLOB pointers. */
+ ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
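+    /* The BLOB pointer array resides below the "dense slots" (directory
+    entry plus uncompressed DB_TRX_ID,DB_ROLL_PTR columns, that is,
+    PAGE_ZIP_CLUST_LEAF_SLOT_SIZE bytes per heap number) at the end of
+    the compressed page. */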
+ byte *externs= page_zip->data + page_zip_get_size(page_zip) -
+ (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
+ PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+ byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
+
+    /* Shift the surviving BLOB pointers towards the directory, then
+    zero fill the vacated entries at the end of the array. */
+ if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
+ BTR_EXTERN_FIELD_REF_SIZE)
+ {
+ memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
+ mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
+ ext_end - page_zip->data, ext_len);
+ }
+ memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
+ mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
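+    /* page_zip_des_t::n_blobs is a 12-bit bitfield; the mask keeps the
+    decremented value within its declared range. */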
+ page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
+ }
+
+ /* The compression algorithm expects info_bits and n_owned
+ to be 0 for deleted records. */
+ rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */
+
+ page_zip_clear_rec(block, rec, index, offsets, mtr);
+}
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@retval true on success
+@retval false on failure; if restore was requested, the block will be
+left intact */
+bool
+page_zip_reorganize(
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint z_level,/*!< in: compression level */
+ mtr_t* mtr, /*!< in: mini-transaction */
+ bool restore)/*!< whether to restore on failure */
+{
+ page_t* page = buf_block_get_frame(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(page));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+ /* Note that page_zip_validate(page_zip, page, index) may fail here. */
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
+ page_zip_get_size(buf_block_get_page_zip(block)));
+
+ /* Disable logging */
+ mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ temp_block = buf_block_alloc();
+ btr_search_drop_page_hash_index(block);
+ temp_page = temp_block->frame;
+
+ /* Copy the old page to temporary space */
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
+ srv_page_size);
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, true);
+ if (index->is_spatial()) {
+ mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
+ memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
+ page + FIL_PAGE_TYPE, 2);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
+ }
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+ memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
+ /* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
+ ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
+ || page_get_max_trx_id(page) != 0);
+ /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+ clustered index root pages. */
+ ut_ad(page_get_max_trx_id(page) == 0
+ || (dict_index_is_clust(index)
+ ? !page_has_siblings(temp_page)
+ : page_is_leaf(temp_page)));
+
+ /* Restore logging. */
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(block, index, z_level, mtr)) {
+ if (restore) {
+ /* Restore the old page and exit. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the bytes that we skip are identical. */
+ ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+ PAGE_HEADER + PAGE_N_RECS + temp_page,
+ PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+ ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
+ srv_page_size - FIL_PAGE_DATA_END
+ + temp_page,
+ FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+ PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+ memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+ srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(page, temp_page, srv_page_size));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+ buf_block_free(temp_block);
+ return false;
+ }
+
+ lock_move_reorganize_page(block, temp_block);
+
+ buf_block_free(temp_block);
+ return true;
+}
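+/* A minimal illustrative call pattern (hypothetical caller; it assumes
+the global compression level variable page_zip_level):
+
+	if (!page_zip_reorganize(block, index, page_zip_level, mtr, true)) {
+		// Failure: the page contents were restored from the
+		// temporary copy. With restore=false, the block would
+		// be left in its reorganized, uncompressed state.
+	}
+*/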
+
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+ buf_block_t* block, /*!< in/out: buffer block */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ page_t* page = block->frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+#ifdef UNIV_ZIP_DEBUG
+ /* The B-tree operations that call this function may set
+ FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+ mismatch. A strict page_zip_validate() will be executed later
+ during the B-tree operations. */
+ ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+ if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+ ut_a(page_is_leaf(src));
+ ut_a(dict_index_is_clust(index));
+ }
+
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(src, srv_page_size);
+ MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip));
+
+ /* Copy those B-tree page header fields that are related to
+ the records stored in the page. Also copy the field
+ PAGE_MAX_TRX_ID. Skip the rest of the page header and
+ trailer. On the compressed page, there is no trailer. */
+ compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
+ srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
+ memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
+ PAGE_HEADER + src_zip->data,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page_zip->data,
+ PAGE_DATA + src_zip->data,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+
+ if (dict_index_is_clust(index)) {
+ /* Reset the PAGE_ROOT_AUTO_INC field when copying
+ from a root page. */
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page, 0, 8);
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page_zip->data, 0, 8);
+ } else {
+ /* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
+ of secondary indexes, and 0 on others. */
+ ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src));
+ }
+
+ /* Copy all fields of src_zip to page_zip, except the pointer
+ to the compressed data page. */
+ {
+ page_zip_t* data = page_zip->data;
+ memcpy(page_zip, src_zip, sizeof *page_zip);
+ page_zip->data = data;
+ }
+ ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
+ + page_zip->m_end < page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(src)
+ && UNIV_UNLIKELY(!page_has_prev(src))
+ && UNIV_LIKELY(page_has_prev(page))) {
+ /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+ ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+ rec_t* rec = page + offs;
+ ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+ & REC_INFO_MIN_REC_FLAG);
+ rec[-REC_N_NEW_EXTRA_BYTES]
+ &= byte(~REC_INFO_MIN_REC_FLAG);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip_compress_write_log(block, index, mtr);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param[in] data compressed page
+@param[in] size size of compressed page
+@param[in] algo algorithm to use
+@return page checksum */
+uint32_t
+page_zip_calc_checksum(
+ const void* data,
+ ulint size,
+ srv_checksum_algorithm_t algo)
+{
+ uLong adler;
+ const Bytef* s = static_cast<const byte*>(data);
+
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+
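+	/* With the standard FIL_PAGE_* offsets this covers bytes [4, 16)
+	(page number, FIL_PAGE_PREV, FIL_PAGE_NEXT), [24, 26) (page type)
+	and [34, size), skipping [0, 4), [16, 24) and [26, 34). */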
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ return ut_crc32(s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+ ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(
+ adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ static_cast<uInt>(size)
+ - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return(uint32_t(adler));
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(BUF_NO_CHECKSUM_MAGIC);
+	/* no default so the compiler will emit a warning if a new enum
+ is added and not handled here */
+ }
+
+ ut_error;
+ return(0);
+}
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum is valid according to
+innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size)
+{
+ const srv_checksum_algorithm_t curr_algo =
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+ if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return true;
+ }
+
+ if (buf_is_zeroes(span<const byte>(data, size))) {
+ return true;
+ }
+
+ const uint32_t stored = mach_read_from_4(
+ data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ uint32_t calc = page_zip_calc_checksum(data, size, curr_algo);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " %s checksum: calculated = " UINT32PF ";"
+ " recorded = " UINT32PF "\n", cur_page_num,
+ buf_checksum_algorithm_name(
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm)),
+ calc, stored);
+ }
+
+ if (!strict_verify) {
+ const uint32_t crc32 = page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ": crc32 checksum:"
+ " calculated = " UINT32PF "; recorded = " UINT32PF "\n",
+ cur_page_num, crc32, stored);
+ fprintf(log_file, "page::" UINT32PF ": none checksum:"
+ " calculated = %lu; recorded = " UINT32PF "\n",
+ cur_page_num, BUF_NO_CHECKSUM_MAGIC, stored);
+ }
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (stored == calc) {
+		return true;
+ }
+
+ switch (curr_algo) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return false;
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return true;
+ }
+
+ return stored == page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return true;
+ }
+
+ return stored == page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+		return true;
+ }
+
+	return false;
+}