author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
commit     3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree       e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/page
parent     Initial commit. (diff)
download   mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz
           mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip
Adding upstream version 1:10.11.6. (upstream/1%10.11.6, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/page')
-rw-r--r--  storage/innobase/page/page0cur.cc   3097
-rw-r--r--  storage/innobase/page/page0page.cc  2523
-rw-r--r--  storage/innobase/page/page0zip.cc   4666
3 files changed, 10286 insertions(+), 0 deletions(-)
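
The bulk of the new code is page0cur.cc ("The page cursor"), whose search routines position a cursor by a two-phase lookup: first a binary search through the sparse page directory, then a linear scan through the records owned by the upper directory slot (see page_cur_search_with_match() in the diff below). The following is a minimal, self-contained sketch of that idea only; all names are hypothetical and a plain sorted int array stands in for page records, so it is not the InnoDB API.

    #include <cstddef>
    #include <vector>

    // recs is sorted ascending; dir holds ascending indices into recs, with
    // dir.front() == 0 playing the role of the infimum slot.
    // Returns the index of the greatest record <= key, or recs.size() if none.
    std::size_t cursor_le(const std::vector<int>& recs,
                          const std::vector<std::size_t>& dir,
                          int key)
    {
        if (recs.empty() || recs.front() > key)
            return recs.size();            // no record is <= key

        // Phase 1: binary search over the directory slots until the lower
        // and upper slots are adjacent (compare against each slot's record).
        std::size_t low = 0, up = dir.size() - 1;
        while (up - low > 1) {
            const std::size_t mid = (low + up) / 2;
            (recs[dir[mid]] <= key ? low : up) = mid;
        }

        // Phase 2: linear scan from the lower slot's record until the next
        // record would exceed the key.
        std::size_t pos = dir[low];
        while (pos + 1 < recs.size() && recs[pos + 1] <= key)
            ++pos;
        return pos;
    }

For example, with recs = {10, 20, 30, 40, 50}, dir = {0, 2, 4} and key = 35, the cursor lands on index 2 (value 30), which corresponds to PAGE_CUR_LE positioning; the PAGE_CUR_G/GE/L modes in the real code differ mainly in which of the two bracketing records the cursor is finally placed on.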
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc new file mode 100644 index 00000000..b019694b --- /dev/null +++ b/storage/innobase/page/page0cur.cc @@ -0,0 +1,3097 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file page/page0cur.cc +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0cur.h" +#include "page0zip.h" +#include "btr0btr.h" +#include "mtr0log.h" +#include "log0recv.h" +#include "rem0cmp.h" +#include "gis0rtree.h" + +#include <algorithm> + +#ifdef BTR_CUR_HASH_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT +static ulint page_cur_short_succ; +# endif /* UNIV_SEARCH_PERF_STAT */ + +/** Try a search shortcut based on the last insert. +@param[in] block index page +@param[in] index index tree +@param[in] tuple search key +@param[in,out] iup_matched_fields already matched fields in the +upper limit record +@param[in,out] ilow_matched_fields already matched fields in the +lower limit record +@param[out] cursor page cursor +@return true on success */ +UNIV_INLINE +bool +page_cur_try_search_shortcut( + const buf_block_t* block, + const dict_index_t* index, + const dtuple_t* tuple, + ulint* iup_matched_fields, + ulint* ilow_matched_fields, + page_cur_t* cursor) +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint up_match; + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(page_is_leaf(page)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + + low_match = up_match = std::min(*ilow_matched_fields, + *iup_matched_fields); + + if (cmp_dtuple_rec_with_match(tuple, rec, index, offsets, + &low_match) < 0) { + goto exit_func; + } + + if (!(next_rec = page_rec_get_next_const(rec))) { + goto exit_func; + } + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + dtuple_get_n_fields(tuple), &heap); + + if (cmp_dtuple_rec_with_match(tuple, next_rec, index, offsets, + &up_match) >= 0) { + goto exit_func; + } + + *iup_matched_fields = up_match; + } + + page_cur_position(rec, block, cursor); + + *ilow_matched_fields = low_match; + +#ifdef 
UNIV_SEARCH_PERF_STAT + page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +/** Try a search shortcut based on the last insert. +@param[in] block index page +@param[in] index index tree +@param[in] tuple search key +@param[in,out] iup_matched_fields already matched fields in the +upper limit record +@param[in,out] iup_matched_bytes already matched bytes in the +first partially matched field in the upper limit record +@param[in,out] ilow_matched_fields already matched fields in the +lower limit record +@param[in,out] ilow_matched_bytes already matched bytes in the +first partially matched field in the lower limit record +@param[out] cursor page cursor +@return true on success */ +UNIV_INLINE +bool +page_cur_try_search_shortcut_bytes( + const buf_block_t* block, + const dict_index_t* index, + const dtuple_t* tuple, + ulint* iup_matched_fields, + ulint* iup_matched_bytes, + ulint* ilow_matched_fields, + ulint* ilow_matched_bytes, + page_cur_t* cursor) +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint low_bytes; + ulint up_match; + ulint up_bytes; + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(page_is_leaf(page)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + if (ut_pair_cmp(*ilow_matched_fields, *ilow_matched_bytes, + *iup_matched_fields, *iup_matched_bytes) < 0) { + up_match = low_match = *ilow_matched_fields; + up_bytes = low_bytes = *ilow_matched_bytes; + } else { + up_match = low_match = *iup_matched_fields; + up_bytes = low_bytes = *iup_matched_bytes; + } + + if (cmp_dtuple_rec_with_match_bytes( + tuple, rec, index, offsets, &low_match, &low_bytes) < 0) { + goto exit_func; + } + + if (!(next_rec = page_rec_get_next_const(rec))) { + goto exit_func; + } + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, index, offsets, + index->n_core_fields, + dtuple_get_n_fields(tuple), &heap); + + if (cmp_dtuple_rec_with_match_bytes( + tuple, next_rec, index, offsets, + &up_match, &up_bytes) + >= 0) { + goto exit_func; + } + + *iup_matched_fields = up_match; + *iup_matched_bytes = up_bytes; + } + + page_cur_position(rec, block, cursor); + + *ilow_matched_fields = low_match; + *ilow_matched_bytes = low_bytes; + +#ifdef UNIV_SEARCH_PERF_STAT + page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef PAGE_CUR_LE_OR_EXTENDS +/****************************************************************//** +Checks if the nth field in a record is a character type field which extends +the nth field in tuple, i.e., the field is longer or equal in length and has +common first characters. 
+@return TRUE if rec field extends tuple field */ +static +ibool +page_cur_rec_field_extends( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: compare nth field */ +{ + const dtype_t* type; + const dfield_t* dfield; + const byte* rec_f; + ulint rec_f_len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + dfield = dtuple_get_nth_field(tuple, n); + + type = dfield_get_type(dfield); + + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); + + if (type->mtype == DATA_VARCHAR + || type->mtype == DATA_CHAR + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_BINARY + || type->mtype == DATA_BLOB + || DATA_GEOMETRY_MTYPE(type->mtype) + || type->mtype == DATA_VARMYSQL + || type->mtype == DATA_MYSQL) { + + if (dfield_get_len(dfield) != UNIV_SQL_NULL + && rec_f_len != UNIV_SQL_NULL + && rec_f_len >= dfield_get_len(dfield) + && !cmp_data_data(type->mtype, type->prtype, + dfield_get_data(dfield), + dfield_get_len(dfield), + rec_f, dfield_get_len(dfield))) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + +/****************************************************************//** +Searches the right position for a page cursor. */ +bool +page_cur_search_with_match( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + page_cur_t* cursor, /*!< out: page cursor */ + rtr_info_t* rtr_info)/*!< in/out: rtree search stack */ +{ + ulint up; + ulint low; + ulint mid; + const page_t* page; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint low_matched_fields; + ulint cur_matched_fields; + int cmp; + const dict_index_t* const index = cursor->index; + const buf_block_t* const block = cursor->block; +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_validate(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE + || dict_index_is_spatial(index)); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_d(page_check_dir(page)); + const ulint n_core = page_is_leaf(page) ? 
index->n_core_fields : 0; + +#ifdef BTR_CUR_HASH_ADAPT + if (n_core + && page_get_direction(page) == PAGE_RIGHT + && page_header_get_offs(page, PAGE_LAST_INSERT) + && mode == PAGE_CUR_LE + && !index->is_spatial() + && page_header_get_field(page, PAGE_N_DIRECTION) > 3 + && page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, ilow_matched_fields, cursor)) { + return false; + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif /* BTR_CUR_HASH_ADAPT */ + + /* If the mode is for R-tree indexes, use the special MBR + related compare functions */ + if (index->is_spatial() && mode > PAGE_CUR_LE) { + /* For leaf level insert, we still use the traditional + compare function for now */ + if (mode == PAGE_CUR_RTREE_INSERT && n_core) { + mode = PAGE_CUR_LE; + } else { + return rtr_cur_search_with_match( + block, (dict_index_t*)index, tuple, mode, + cursor, rtr_info); + } + } + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + low_matched_fields = *ilow_matched_fields; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. */ + + low = 0; + up = ulint(page_dir_get_n_slots(page)) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + + while (up - low > 1) { + mid = (low + up) / 2; + const page_dir_slot_t* slot = page_dir_get_nth_slot(page, mid); + if (UNIV_UNLIKELY(!(mid_rec + = page_dir_slot_get_rec_validate(slot)))) { + goto corrupted; + } + cur_matched_fields = std::min(low_matched_fields, + up_matched_fields); + + offsets = offsets_; + offsets = rec_get_offsets( + mid_rec, index, offsets, n_core, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match( + tuple, mid_rec, index, offsets, &cur_matched_fields); + + if (cmp > 0) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + + } else if (cmp) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + low_rec = page_dir_slot_get_rec_validate( + page_dir_get_nth_slot(page, low)); + up_rec = page_dir_slot_get_rec_validate( + page_dir_get_nth_slot(page, up)); + if (UNIV_UNLIKELY(!low_rec || !up_rec)) { +corrupted: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return true; + } + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
*/ + + for (;;) { + if (const rec_t* next = page_rec_get_next_const(low_rec)) { + if (next == up_rec) { + break; + } + mid_rec = next; + } else { + goto corrupted; + } + cur_matched_fields = std::min(low_matched_fields, + up_matched_fields); + + offsets = offsets_; + offsets = rec_get_offsets( + mid_rec, index, offsets, n_core, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match( + tuple, mid_rec, index, offsets, &cur_matched_fields); + + if (cmp > 0) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + + } else if (cmp) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + if (!cmp && !cur_matched_fields) { +#ifdef UNIV_DEBUG + mtr_t mtr; + mtr_start(&mtr); + + /* We got a match, but cur_matched_fields is + 0, it must have REC_INFO_MIN_REC_FLAG */ + ulint rec_info = rec_get_info_bits(mid_rec, + rec_offs_comp(offsets)); + ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); + ut_ad(!page_has_prev(page)); + mtr_commit(&mtr); +#endif + + cur_matched_fields = dtuple_get_n_fields_cmp(tuple); + } + + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *ilow_matched_fields = low_matched_fields; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return false; +} + +#ifdef BTR_CUR_HASH_ADAPT +/** Search the right position for a page cursor. 
+@param[in] block buffer block +@param[in] index index tree +@param[in] tuple key to be searched for +@param[in] mode search mode +@param[in,out] iup_matched_fields already matched fields in the +upper limit record +@param[in,out] iup_matched_bytes already matched bytes in the +first partially matched field in the upper limit record +@param[in,out] ilow_matched_fields already matched fields in the +lower limit record +@param[in,out] ilow_matched_bytes already matched bytes in the +first partially matched field in the lower limit record +@param[out] cursor page cursor */ +bool +page_cur_search_with_match_bytes( + const dtuple_t* tuple, + page_cur_mode_t mode, + ulint* iup_matched_fields, + ulint* iup_matched_bytes, + ulint* ilow_matched_fields, + ulint* ilow_matched_bytes, + page_cur_t* cursor) +{ + ulint up; + ulint low; + const page_t* page; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint up_matched_bytes; + ulint low_matched_fields; + ulint low_matched_bytes; + ulint cur_matched_fields; + ulint cur_matched_bytes; + int cmp; + const dict_index_t* const index = cursor->index; + const buf_block_t* const block = cursor->block; +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_validate(tuple)); + ut_ad(!(tuple->info_bits & REC_INFO_MIN_REC_FLAG)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_d(page_check_dir(page)); + +#ifdef BTR_CUR_HASH_ADAPT + if (page_is_leaf(page) + && page_get_direction(page) == PAGE_RIGHT + && page_header_get_offs(page, PAGE_LAST_INSERT) + && mode == PAGE_CUR_LE + && page_header_get_field(page, PAGE_N_DIRECTION) > 3 + && page_cur_try_search_shortcut_bytes( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return false; + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif /* BTR_CUR_HASH_ADAPT */ + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + up_matched_bytes = *iup_matched_bytes; + low_matched_fields = *ilow_matched_fields; + low_matched_bytes = *ilow_matched_bytes; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. 
*/ + + low = 0; + up = ulint(page_dir_get_n_slots(page)) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0; + + while (up - low > 1) { + const ulint mid = (low + up) / 2; + mid_rec = page_dir_slot_get_rec_validate( + page_dir_get_nth_slot(page, mid)); + if (UNIV_UNLIKELY(!mid_rec)) { + goto corrupted; + } + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets( + mid_rec, index, offsets_, n_core, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match_bytes( + tuple, mid_rec, index, offsets, + &cur_matched_fields, &cur_matched_bytes); + + if (cmp > 0) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (cmp) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + low_rec = page_dir_slot_get_rec_validate( + page_dir_get_nth_slot(page, low)); + up_rec = page_dir_slot_get_rec_validate( + page_dir_get_nth_slot(page, up)); + if (UNIV_UNLIKELY(!low_rec || !up_rec)) { +corrupted: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return true; + } + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
*/ + + for (;;) { + if (const rec_t* next = page_rec_get_next_const(low_rec)) { + if (next == up_rec) { + break; + } + mid_rec = next; + } else { + goto corrupted; + } + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + if (UNIV_UNLIKELY(rec_get_info_bits( + mid_rec, + dict_table_is_comp(index->table)) + & REC_INFO_MIN_REC_FLAG)) { + ut_ad(!page_has_prev(page_align(mid_rec))); + ut_ad(!page_rec_is_leaf(mid_rec) + || rec_is_metadata(mid_rec, *index)); + cmp = 1; + goto low_rec_match; + } + + offsets = rec_get_offsets( + mid_rec, index, offsets_, n_core, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match_bytes( + tuple, mid_rec, index, offsets, + &cur_matched_fields, &cur_matched_bytes); + + if (cmp > 0) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (cmp) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *iup_matched_bytes = up_matched_bytes; + *ilow_matched_fields = low_matched_fields; + *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return false; +} +#endif /* BTR_CUR_HASH_ADAPT */ + +/***********************************************************//** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. */ +void page_cur_open_on_rnd_user_rec(page_cur_t *cursor) +{ + if (const ulint n_recs= page_get_n_recs(cursor->block->page.frame)) + if ((cursor->rec= page_rec_get_nth(cursor->block->page.frame, + ut_rnd_interval(n_recs) + 1))) + return; + cursor->rec= page_get_infimum_rec(cursor->block->page.frame); +} + +/** +Set the number of owned records. +@param[in,out] rec record in block.frame +@param[in] n_owned number of records skipped in the sparse page directory +@param[in] comp whether ROW_FORMAT is COMPACT or DYNAMIC */ +static void page_rec_set_n_owned(rec_t *rec, ulint n_owned, bool comp) +{ + rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED; + *rec= static_cast<byte>((*rec & ~REC_N_OWNED_MASK) | + (n_owned << REC_N_OWNED_SHIFT)); +} + +/** +Split a directory slot which owns too many records. +@param[in,out] block index page +@param[in,out] slot the slot that needs to be split */ +static bool page_dir_split_slot(const buf_block_t &block, + page_dir_slot_t *slot) +{ + ut_ad(slot <= &block.page.frame[srv_page_size - PAGE_EMPTY_DIR_START]); + slot= my_assume_aligned<2>(slot); + + const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1; + + ut_ad(page_dir_slot_get_n_owned(slot) == n_owned); + static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >= + PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility"); + + /* Find a record approximately in the middle. 
*/ + const rec_t *rec= page_dir_slot_get_rec_validate(slot + PAGE_DIR_SLOT_SIZE); + + for (ulint i= n_owned / 2; i--; ) + { + if (UNIV_UNLIKELY(!rec)) + return true; + rec= page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(!rec)) + return true; + + /* Add a directory slot immediately below this one. */ + constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.page.frame); + const uint16_t n_slots= mach_read_from_2(n_slots_p); + + page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*> + (block.page.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) - + n_slots * PAGE_DIR_SLOT_SIZE); + + if (UNIV_UNLIKELY(slot < last_slot)) + return true; + + memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE, + slot - last_slot); + + const ulint half_owned= n_owned / 2; + + mach_write_to_2(n_slots_p, n_slots + 1); + + mach_write_to_2(slot, rec - block.page.frame); + const bool comp= page_is_comp(block.page.frame) != 0; + page_rec_set_n_owned(page_dir_slot_get_rec(slot), half_owned, comp); + page_rec_set_n_owned(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE), + n_owned - half_owned, comp); + return false; +} + +/** +Split a directory slot which owns too many records. +@param[in,out] block index page (ROW_FORMAT=COMPRESSED) +@param[in] s the slot that needs to be split +@param[in,out] mtr mini-transaction */ +static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr) +{ + ut_ad(block->page.zip.data); + ut_ad(page_is_comp(block->page.frame)); + ut_ad(s); + + page_dir_slot_t *slot= page_dir_get_nth_slot(block->page.frame, s); + const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1; + + ut_ad(page_dir_slot_get_n_owned(slot) == n_owned); + static_assert((PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 >= + PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility"); + + /* 1. We loop to find a record approximately in the middle of the + records owned by the slot. */ + + const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE); + + /* We do not try to prevent crash on corruption here. + For ROW_FORMAT=COMPRESSED pages, the next-record links should + be validated in page_zip_decompress(). Corruption should only + be possible here if the buffer pool was corrupted later. */ + for (ulint i= n_owned / 2; i--; ) + rec= page_rec_get_next_const(rec); + + /* Add a directory slot immediately below this one. */ + constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->page.frame); + const uint16_t n_slots= mach_read_from_2(n_slots_p); + + page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*> + (block->page.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) - + n_slots * PAGE_DIR_SLOT_SIZE); + memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE, + slot - last_slot); + + const ulint half_owned= n_owned / 2; + + mtr->write<2>(*block, n_slots_p, 1U + n_slots); + + /* Log changes to the compressed page header and the dense page directory. */ + memcpy_aligned<2>(&block->page.zip.data[n_slots_f], n_slots_p, 2); + mach_write_to_2(slot, page_offset(rec)); + page_rec_set_n_owned<true>(block, page_dir_slot_get_rec(slot), half_owned, + true, mtr); + page_rec_set_n_owned<true>(block, + page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE), + n_owned - half_owned, true, mtr); +} + +/** +Try to balance an underfilled directory slot with an adjacent one, +so that there are at least the minimum number of records owned by the slot; +this may result in merging the two slots. 
+@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in] s the slot to be balanced +@param[in,out] mtr mini-transaction */ +static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr) +{ + ut_ad(block->page.zip.data); + ut_ad(page_is_comp(block->page.frame)); + ut_ad(s > 0); + + const ulint n_slots = page_dir_get_n_slots(block->page.frame); + + if (UNIV_UNLIKELY(s + 1 == n_slots)) { + /* The last directory slot cannot be balanced. */ + return; + } + + ut_ad(s < n_slots); + + page_dir_slot_t* slot = page_dir_get_nth_slot(block->page.frame, s); + rec_t* const up_rec = const_cast<rec_t*> + (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE)); + rec_t* const slot_rec = const_cast<rec_t*> + (page_dir_slot_get_rec(slot)); + const ulint up_n_owned = rec_get_n_owned_new(up_rec); + + ut_ad(rec_get_n_owned_new(page_dir_slot_get_rec(slot)) + == PAGE_DIR_SLOT_MIN_N_OWNED - 1); + + if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 + <= PAGE_DIR_SLOT_MAX_N_OWNED); + /* Merge the slots. */ + page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr); + page_rec_set_n_owned<true>(block, up_rec, up_n_owned + + (PAGE_DIR_SLOT_MIN_N_OWNED - 1), + true, mtr); + /* Shift the slots */ + page_dir_slot_t* last_slot = page_dir_get_nth_slot( + block->page.frame, n_slots - 1); + memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot, + slot - last_slot); + constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2> + (n_slots_f + block->page.frame); + mtr->write<2>(*block, n_slots_p, n_slots - 1); + memcpy_aligned<2>(n_slots_f + block->page.zip.data, + n_slots_p, 2); + memset_aligned<2>(last_slot, 0, 2); + return; + } + + /* Transfer one record to the underfilled slot */ + page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr); + const rec_t* new_rec = page_rec_get_next_low(slot_rec, TRUE); + /* We do not try to prevent crash on corruption here. + For ROW_FORMAT=COMPRESSED pages, the next-record links should + be validated in page_zip_decompress(). Corruption should only + be possible here if the buffer pool was corrupted later. */ + page_rec_set_n_owned<true>(block, const_cast<rec_t*>(new_rec), + PAGE_DIR_SLOT_MIN_N_OWNED, + true, mtr); + mach_write_to_2(slot, page_offset(new_rec)); + page_rec_set_n_owned(up_rec, up_n_owned - 1, true); +} + +/** +Try to balance an underfilled directory slot with an adjacent one, +so that there are at least the minimum number of records owned by the slot; +this may result in merging the two slots. +@param[in,out] block index page +@param[in] s the slot to be balanced */ +static void page_dir_balance_slot(const buf_block_t &block, ulint s) +{ + const bool comp= page_is_comp(block.page.frame); + ut_ad(!block.page.zip.data); + ut_ad(s > 0); + + const ulint n_slots = page_dir_get_n_slots(block.page.frame); + + if (UNIV_UNLIKELY(s + 1 == n_slots)) { + /* The last directory slot cannot be balanced. */ + return; + } + + ut_ad(s < n_slots); + + page_dir_slot_t* slot = page_dir_get_nth_slot(block.page.frame, s); + rec_t* const up_rec = const_cast<rec_t*> + (page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE)); + rec_t* const slot_rec = const_cast<rec_t*> + (page_dir_slot_get_rec(slot)); + const ulint up_n_owned = comp + ? 
rec_get_n_owned_new(up_rec) + : rec_get_n_owned_old(up_rec); + + ut_ad(page_dir_slot_get_n_owned(slot) + == PAGE_DIR_SLOT_MIN_N_OWNED - 1); + + if (up_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + compile_time_assert(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 + <= PAGE_DIR_SLOT_MAX_N_OWNED); + /* Merge the slots. */ + page_rec_set_n_owned(slot_rec, 0, comp); + page_rec_set_n_owned(up_rec, up_n_owned + + (PAGE_DIR_SLOT_MIN_N_OWNED - 1), comp); + /* Shift the slots */ + page_dir_slot_t* last_slot = page_dir_get_nth_slot( + block.page.frame, n_slots - 1); + memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot, + slot - last_slot); + memset_aligned<2>(last_slot, 0, 2); + constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER; + byte *n_slots_p= my_assume_aligned<2> + (n_slots_f + block.page.frame); + mach_write_to_2(n_slots_p, n_slots - 1); + return; + } + + /* Transfer one record to the underfilled slot */ + const rec_t* new_rec; + + if (comp) { + if (UNIV_UNLIKELY(!(new_rec = + page_rec_get_next_low(slot_rec, true)))) { + ut_ad("corrupted page" == 0); + return; + } + page_rec_set_n_owned(slot_rec, 0, true); + page_rec_set_n_owned(const_cast<rec_t*>(new_rec), + PAGE_DIR_SLOT_MIN_N_OWNED, true); + page_rec_set_n_owned(up_rec, up_n_owned - 1, true); + } else { + if (UNIV_UNLIKELY(!(new_rec = + page_rec_get_next_low(slot_rec, false)))) { + ut_ad("corrupted page" == 0); + return; + } + page_rec_set_n_owned(slot_rec, 0, false); + page_rec_set_n_owned(const_cast<rec_t*>(new_rec), + PAGE_DIR_SLOT_MIN_N_OWNED, false); + page_rec_set_n_owned(up_rec, up_n_owned - 1, false); + } + + mach_write_to_2(slot, page_offset(new_rec)); +} + +/** Allocate space for inserting an index record. +@tparam compressed whether to update the ROW_FORMAT=COMPRESSED +@param[in,out] block index page +@param[in] need number of bytes needed +@param[out] heap_no record heap number +@return pointer to the start of the allocated buffer +@retval NULL if allocation fails */ +template<bool compressed=false> +static byte* page_mem_alloc_heap(buf_block_t *block, ulint need, + ulint *heap_no) +{ + ut_ad(!compressed || block->page.zip.data); + + byte *heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER + + block->page.frame); + + const uint16_t top= mach_read_from_2(heap_top); + + if (need > page_get_max_insert_size(block->page.frame, 1)) + return NULL; + + byte *n_heap= my_assume_aligned<2> + (PAGE_N_HEAP + PAGE_HEADER + block->page.frame); + + const uint16_t h= mach_read_from_2(n_heap); + if (UNIV_UNLIKELY((h + 1) & 0x6000)) + { + /* At the minimum record size of 5+2 bytes, we can only reach this + condition when using innodb_page_size=64k. */ + ut_ad((h & 0x7fff) == 8191); + ut_ad(srv_page_size == 65536); + return NULL; + } + + *heap_no= h & 0x7fff; + ut_ad(*heap_no < srv_page_size / REC_N_NEW_EXTRA_BYTES); + compile_time_assert(UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES < 0x3fff); + + mach_write_to_2(heap_top, top + need); + mach_write_to_2(n_heap, h + 1); + + if (compressed) + { + ut_ad(h & 0x8000); + memcpy_aligned<4>(&block->page.zip.data[PAGE_HEAP_TOP + PAGE_HEADER], + heap_top, 4); + } + + return &block->page.frame[top]; +} + +/** Write log for inserting a B-tree or R-tree record in +ROW_FORMAT=REDUNDANT. 
+@param block B-tree or R-tree page +@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE +@param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_OLD_INFIMUM +@param info_bits info_bits of the record +@param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag() +@param hdr_c number of common record header bytes with prev_rec +@param data_c number of common data bytes with prev_rec +@param hdr record header bytes to copy to the log +@param hdr_l number of copied record header bytes +@param data record payload bytes to copy to the log +@param data_l number of copied record data bytes */ +inline void mtr_t::page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_bits, + ulint n_fields_s, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l) +{ + ut_ad(!block.page.zip.data); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_d(ulint n_slots= page_dir_get_n_slots(block.page.frame)); + ut_ad(n_slots >= 2); + ut_d(const byte *page_end= + page_dir_get_nth_slot(block.page.frame, n_slots - 1)); + ut_ad(&block.page.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end); + ut_ad(block.page.frame + + page_header_get_offs(block.page.frame, PAGE_HEAP_TOP) <= page_end); + ut_ad(fil_page_index_page_check(block.page.frame)); + ut_ad(!(~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_bits)); + ut_ad(n_fields_s >= 2); + ut_ad((n_fields_s >> 1) <= REC_MAX_N_FIELDS); + ut_ad(data_l + data_c <= REDUNDANT_REC_MAX_DATA_SIZE); + + set_modified(block); + + static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility"); + static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility"); + n_fields_s= (n_fields_s - 2) << 2 | info_bits >> 4; + + size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4; + static_assert((REC_MAX_N_FIELDS << 1 | 1) <= MIN_3BYTE, "compatibility"); + len+= n_fields_s < MIN_2BYTE ? 1 : 2; + len+= hdr_c < MIN_2BYTE ? 1 : 2; + static_assert(REDUNDANT_REC_MAX_DATA_SIZE <= MIN_3BYTE, "compatibility"); + len+= data_c < MIN_2BYTE ? 1 : 2; + len+= hdr_l + data_l; + + const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small); + + if (UNIV_LIKELY(small)) + { + ut_d(const byte * const end = l + len); + *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT; + l= mlog_encode_varint(l, prev_rec); + l= mlog_encode_varint(l, n_fields_s); + l= mlog_encode_varint(l, hdr_c); + l= mlog_encode_varint(l, data_c); + ::memcpy(l, hdr, hdr_l); + l+= hdr_l; + ::memcpy(l, data, data_l); + l+= data_l; + ut_ad(end == l); + m_log.close(l); + } + else + { + m_log.close(l); + l= m_log.open(len - hdr_l - data_l); + ut_d(const byte * const end = l + len - hdr_l - data_l); + *l++= reuse ? INSERT_REUSE_REDUNDANT : INSERT_HEAP_REDUNDANT; + l= mlog_encode_varint(l, prev_rec); + l= mlog_encode_varint(l, n_fields_s); + l= mlog_encode_varint(l, hdr_c); + l= mlog_encode_varint(l, data_c); + ut_ad(end == l); + m_log.close(l); + m_log.push(hdr, static_cast<uint32_t>(hdr_l)); + m_log.push(data, static_cast<uint32_t>(data_l)); + } + + m_last_offset= FIL_PAGE_TYPE; +} + +/** Write log for inserting a B-tree or R-tree record in +ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC. 
+@param block B-tree or R-tree page +@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE +@param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_NEW_INFIMUM +@param info_status rec_get_info_and_status_bits() +@param shift unless !reuse: number of bytes the PAGE_FREE is moving +@param hdr_c number of common record header bytes with prev_rec +@param data_c number of common data bytes with prev_rec +@param hdr record header bytes to copy to the log +@param hdr_l number of copied record header bytes +@param data record payload bytes to copy to the log +@param data_l number of copied record data bytes */ +inline void mtr_t::page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_status, + ssize_t shift, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l) +{ + ut_ad(!block.page.zip.data); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_d(ulint n_slots= page_dir_get_n_slots(block.page.frame)); + ut_ad(n_slots >= 2); + ut_d(const byte *page_end= page_dir_get_nth_slot(block.page.frame, + n_slots - 1)); + ut_ad(&block.page.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end); + ut_ad(block.page.frame + + page_header_get_offs(block.page.frame, PAGE_HEAP_TOP) <= page_end); + ut_ad(fil_page_index_page_check(block.page.frame)); + ut_ad(hdr_l + hdr_c + data_l + data_c <= static_cast<size_t> + (page_end - &block.page.frame[PAGE_NEW_SUPREMUM_END])); + ut_ad(reuse || shift == 0); +#ifdef UNIV_DEBUG + switch (~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_status) { + default: + ut_ad(0); + break; + case REC_STATUS_NODE_PTR: + ut_ad(!page_is_leaf(block.page.frame)); + break; + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + ut_ad(page_is_leaf(block.page.frame)); + } +#endif + + set_modified(block); + + static_assert(REC_INFO_MIN_REC_FLAG == 0x10, "compatibility"); + static_assert(REC_INFO_DELETED_FLAG == 0x20, "compatibility"); + static_assert(REC_STATUS_INSTANT == 4, "compatibility"); + + const size_t enc_hdr_l= hdr_l << 3 | + (info_status & REC_STATUS_INSTANT) | info_status >> 4; + size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4; + static_assert(REC_MAX_N_FIELDS * 2 < MIN_3BYTE, "compatibility"); + if (reuse) + { + if (shift < 0) + shift= -shift << 1 | 1; + else + shift<<= 1; + len+= static_cast<size_t>(shift) < MIN_2BYTE + ? 1 : static_cast<size_t>(shift) < MIN_3BYTE ? 2 : 3; + } + ut_ad(hdr_c + hdr_l <= REC_MAX_N_FIELDS * 2); + len+= hdr_c < MIN_2BYTE ? 1 : 2; + len+= enc_hdr_l < MIN_2BYTE ? 1 : enc_hdr_l < MIN_3BYTE ? 2 : 3; + len+= data_c < MIN_2BYTE ? 1 : data_c < MIN_3BYTE ? 2 : 3; + len+= hdr_l + data_l; + + const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small); + + if (UNIV_LIKELY(small)) + { + ut_d(const byte * const end = l + len); + *l++= reuse ? INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC; + l= mlog_encode_varint(l, prev_rec); + if (reuse) + l= mlog_encode_varint(l, shift); + l= mlog_encode_varint(l, enc_hdr_l); + l= mlog_encode_varint(l, hdr_c); + l= mlog_encode_varint(l, data_c); + ::memcpy(l, hdr, hdr_l); + l+= hdr_l; + ::memcpy(l, data, data_l); + l+= data_l; + ut_ad(end == l); + m_log.close(l); + } + else + { + m_log.close(l); + l= m_log.open(len - hdr_l - data_l); + ut_d(const byte * const end = l + len - hdr_l - data_l); + *l++= reuse ? 
INSERT_REUSE_DYNAMIC : INSERT_HEAP_DYNAMIC; + l= mlog_encode_varint(l, prev_rec); + if (reuse) + l= mlog_encode_varint(l, shift); + l= mlog_encode_varint(l, enc_hdr_l); + l= mlog_encode_varint(l, hdr_c); + l= mlog_encode_varint(l, data_c); + ut_ad(end == l); + m_log.close(l); + m_log.push(hdr, static_cast<uint32_t>(hdr_l)); + m_log.push(data, static_cast<uint32_t>(data_l)); + } + + m_last_offset= FIL_PAGE_TYPE; +} + +/** Report page directory corruption. +@param block index page +@param index index tree +*/ +ATTRIBUTE_COLD +static void page_cur_directory_corrupted(const buf_block_t &block, + const dict_index_t &index) +{ + ib::error() << "Directory of " << block.page.id() + << " of index " << index.name + << " in table " << index.table->name + << " is corrupted"; +} + +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. +@return pointer to record +@retval nullptr if not enough space was available */ +rec_t* +page_cur_insert_rec_low( +/*====================*/ + const page_cur_t*cur, /*!< in: page cursor */ + const rec_t* rec, /*!< in: record to insert after cur */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t *block= cur->block; + dict_index_t * const index= cur->index; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) > 0); + ut_ad(index->table->not_redundant() == !!page_is_comp(block->page.frame)); + ut_ad(!!page_is_comp(block->page.frame) == !!rec_offs_comp(offsets)); + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame) == + index->id || + mtr->is_inside_ibuf()); + ut_ad(page_dir_get_n_slots(block->page.frame) >= 2); + + ut_ad(!page_rec_is_supremum(cur->rec)); + + /* We should not write log for ROW_FORMAT=COMPRESSED pages here. */ + ut_ad(!mtr->is_logged() || + !(index->table->flags & DICT_TF_MASK_ZIP_SSIZE)); + + /* 1. Get the size of the physical record in the page */ + const ulint rec_size= rec_offs_size(offsets); + +#ifdef HAVE_MEM_CHECK + { + const void *rec_start __attribute__((unused))= + rec - rec_offs_extra_size(offsets); + ulint extra_size __attribute__((unused))= + rec_offs_extra_size(offsets) - + (page_is_comp(block->page.frame) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + /* All data bytes of the record must be valid. */ + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + MEM_CHECK_DEFINED(rec_start, extra_size); + } +#endif /* HAVE_MEM_CHECK */ + + /* 2. Try to find suitable space from page memory management */ + bool reuse= false; + ssize_t free_offset= 0; + ulint heap_no; + byte *insert_buf; + + const bool comp= page_is_comp(block->page.frame); + const ulint extra_size= rec_offs_extra_size(offsets); + + if (rec_t* free_rec= page_header_get_ptr(block->page.frame, PAGE_FREE)) + { + /* Try to reuse the head of PAGE_FREE. */ + rec_offs foffsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t *heap= nullptr; + + rec_offs_init(foffsets_); + + rec_offs *foffsets= rec_get_offsets(free_rec, index, foffsets_, + page_is_leaf(block->page.frame) + ? 
index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const ulint fextra_size= rec_offs_extra_size(foffsets); + insert_buf= free_rec - fextra_size; + const bool too_small= (fextra_size + rec_offs_data_size(foffsets)) < + rec_size; + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + if (too_small) + goto use_heap; + + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + block->page.frame); + if (comp) + { + heap_no= rec_get_heap_no_new(free_rec); + uint16_t next= mach_read_from_2(free_rec - REC_NEXT); + mach_write_to_2(page_free, next + ? static_cast<uint16_t>(free_rec + next - + block->page.frame) + : 0); + } + else + { + heap_no= rec_get_heap_no_old(free_rec); + memcpy(page_free, free_rec - REC_NEXT, 2); + } + + static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility"); + + byte *page_garbage= my_assume_aligned<2>(page_free + 2); + ut_ad(mach_read_from_2(page_garbage) >= rec_size); + mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - rec_size); + reuse= true; + free_offset= extra_size - fextra_size; + } + else + { +use_heap: + insert_buf= page_mem_alloc_heap(block, rec_size, &heap_no); + + if (UNIV_UNLIKELY(!insert_buf)) + return nullptr; + } + + ut_ad(cur->rec != insert_buf + extra_size); + + rec_t *next_rec= block->page.frame + rec_get_next_offs(cur->rec, comp); + ut_ad(next_rec != block->page.frame); + + /* Update page header fields */ + byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + + block->page.frame); + const uint16_t last_insert= mach_read_from_2(page_last_insert); + ut_ad(!last_insert || !comp || + rec_get_node_ptr_flag(block->page.frame + last_insert) == + rec_get_node_ptr_flag(rec)); + + /* Write PAGE_LAST_INSERT */ + mach_write_to_2(page_last_insert, page_offset(insert_buf + extra_size)); + + /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */ + if (block->page.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE)) + { + byte *dir= &block->page.frame[PAGE_DIRECTION_B + PAGE_HEADER]; + byte *n= my_assume_aligned<2> + (&block->page.frame[PAGE_N_DIRECTION + PAGE_HEADER]); + if (UNIV_UNLIKELY(!last_insert)) + { +no_direction: + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION); + memset(n, 0, 2); + } + else if (block->page.frame + last_insert == cur->rec && + (*dir & ((1U << 3) - 1)) != PAGE_LEFT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT); +inc_dir: + mach_write_to_2(n, mach_read_from_2(n) + 1); + } + else if (next_rec == block->page.frame + last_insert && + (*dir & ((1U << 3) - 1)) != PAGE_RIGHT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT); + goto inc_dir; + } + else + goto no_direction; + } + + /* Update PAGE_N_RECS. */ + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + + block->page.frame); + + mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1); + + /* Update the preceding record header, the 'owner' record and + prepare the record to insert. 
*/ + rec_t *insert_rec= insert_buf + extra_size; + const ulint data_size= rec_offs_data_size(offsets); + memcpy(insert_buf, rec - extra_size, extra_size + data_size); + size_t hdr_common= 0; + ulint n_owned; + const byte info_status= static_cast<byte> + (rec_get_info_and_status_bits(rec, comp)); + ut_ad(!(rec_get_info_bits(rec, comp) & + ~(REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))); + + if (comp) + { +#ifdef UNIV_DEBUG + switch (rec_get_status(cur->rec)) { + case REC_STATUS_ORDINARY: + case REC_STATUS_NODE_PTR: + case REC_STATUS_INSTANT: + case REC_STATUS_INFIMUM: + break; + case REC_STATUS_SUPREMUM: + ut_ad("wrong status on cur->rec" == 0); + } + switch (rec_get_status(rec)) { + case REC_STATUS_NODE_PTR: + ut_ad(!page_is_leaf(block->page.frame)); + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + ut_ad(page_is_leaf(block->page.frame)); + if (!rec_is_metadata(rec, true)) + break; + ut_ad(cur->rec == &block->page.frame[PAGE_NEW_INFIMUM]); + break; + case REC_STATUS_ORDINARY: + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(!(rec_get_info_bits(rec, true) & ~REC_INFO_DELETED_FLAG)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong status on rec" == 0); + } + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); +#endif + + rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + insert_rec[-REC_NEW_STATUS]= rec[-REC_NEW_STATUS]; + rec_set_bit_field_2(insert_rec, heap_no, + REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + mach_write_to_2(insert_rec - REC_NEXT, + static_cast<uint16_t>(next_rec - insert_rec)); + mach_write_to_2(cur->rec - REC_NEXT, + static_cast<uint16_t>(insert_rec - cur->rec)); + while (!(n_owned= rec_get_n_owned_new(next_rec))) + { + next_rec= block->page.frame + rec_get_next_offs(next_rec, true); + ut_ad(next_rec != block->page.frame); + } + rec_set_bit_field_1(next_rec, n_owned + 1, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (!mtr->is_logged()) + { + mtr->set_modified(*block); + goto copied; + } + + const byte * const c_start= cur->rec - extra_size; + if (extra_size > REC_N_NEW_EXTRA_BYTES && + c_start >= + &block->page.frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES]) + { + /* Find common header bytes with the preceding record. 
*/ + const byte *r= rec - (REC_N_NEW_EXTRA_BYTES + 1); + for (const byte *c= cur->rec - (REC_N_NEW_EXTRA_BYTES + 1); + *r == *c && c-- != c_start; r--); + hdr_common= static_cast<size_t>((rec - (REC_N_NEW_EXTRA_BYTES + 1)) - r); + ut_ad(hdr_common <= extra_size - REC_N_NEW_EXTRA_BYTES); + } + } + else + { +#ifdef UNIV_DEBUG + if (!page_is_leaf(block->page.frame)); + else if (rec_is_metadata(rec, false)) + { + ut_ad(index->is_instant()); + ut_ad(cur->rec == &block->page.frame[PAGE_OLD_INFIMUM]); + } +#endif + rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(insert_rec, heap_no, + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + memcpy(insert_rec - REC_NEXT, cur->rec - REC_NEXT, 2); + mach_write_to_2(cur->rec - REC_NEXT, page_offset(insert_rec)); + while (!(n_owned= rec_get_n_owned_old(next_rec))) + { + next_rec= block->page.frame + rec_get_next_offs(next_rec, false); + ut_ad(next_rec != block->page.frame); + } + rec_set_bit_field_1(next_rec, n_owned + 1, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (!mtr->is_logged()) + { + mtr->set_modified(*block); + goto copied; + } + + ut_ad(extra_size > REC_N_OLD_EXTRA_BYTES); + const byte * const c_start= cur->rec - extra_size; + if (c_start >= + &block->page.frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES]) + { + /* Find common header bytes with the preceding record. */ + const byte *r= rec - (REC_N_OLD_EXTRA_BYTES + 1); + for (const byte *c= cur->rec - (REC_N_OLD_EXTRA_BYTES + 1); + *r == *c && c-- != c_start; r--); + hdr_common= static_cast<size_t>((rec - (REC_N_OLD_EXTRA_BYTES + 1)) - r); + ut_ad(hdr_common <= extra_size - REC_N_OLD_EXTRA_BYTES); + } + } + + /* Insert the record, possibly copying from the preceding record. */ + ut_ad(mtr->is_logged()); + + { + const byte *r= rec; + const byte *c= cur->rec; + const byte *c_end= c + data_size; + if (page_rec_is_infimum(c) && data_size > 8) + c_end= c + 8; + static_assert(REC_N_OLD_EXTRA_BYTES == REC_N_NEW_EXTRA_BYTES + 1, ""); + if (c <= insert_buf && c_end > insert_buf) + c_end= insert_buf; + else if (c_end < next_rec && + c_end >= next_rec - REC_N_OLD_EXTRA_BYTES + comp) + c_end= next_rec - REC_N_OLD_EXTRA_BYTES + comp; + else + c_end= std::min<const byte*>(c_end, block->page.frame + srv_page_size - + PAGE_DIR - PAGE_DIR_SLOT_SIZE * + page_dir_get_n_slots(block->page.frame)); + size_t data_common; + /* Copy common data bytes of the preceding record. */ + for (; c != c_end && *r == *c; c++, r++); + data_common= static_cast<size_t>(r - rec); + + if (comp) + mtr->page_insert(*block, reuse, + cur->rec - block->page.frame - PAGE_NEW_INFIMUM, + info_status, free_offset, hdr_common, data_common, + insert_buf, + extra_size - hdr_common - REC_N_NEW_EXTRA_BYTES, + r, data_size - data_common); + else + mtr->page_insert(*block, reuse, + cur->rec - block->page.frame - PAGE_OLD_INFIMUM, + info_status, rec_get_n_fields_old(insert_rec) << 1 | + rec_get_1byte_offs_flag(insert_rec), + hdr_common, data_common, + insert_buf, + extra_size - hdr_common - REC_N_OLD_EXTRA_BYTES, + r, data_size - data_common); + } + +copied: + ut_ad(!memcmp(insert_buf, rec - extra_size, extra_size - + (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES))); + ut_ad(!memcmp(insert_rec, rec, data_size)); + /* We have incremented the n_owned field of the owner record. + If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, we have to split the + corresponding directory slot in two. 
*/ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + { + const ulint owner= page_dir_find_owner_slot(next_rec); + if (UNIV_UNLIKELY(owner == ULINT_UNDEFINED)) + { + page_cur_directory_corrupted(*block, *index); + return nullptr; + } + + if (page_dir_split_slot(*block, page_dir_get_nth_slot(block->page.frame, + owner))) + return nullptr; + } + + rec_offs_make_valid(insert_buf + extra_size, index, + page_is_leaf(block->page.frame), offsets); + return insert_buf + extra_size; +} + +/** Add a slot to the dense page directory. +@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in] index the index that the page belongs to +@param[in,out] mtr mini-transaction */ +static inline void page_zip_dir_add_slot(buf_block_t *block, + const dict_index_t *index, mtr_t *mtr) +{ + page_zip_des_t *page_zip= &block->page.zip; + + ut_ad(page_is_comp(page_zip->data)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + /* Read the old n_dense (n_heap has already been incremented). */ + ulint n_dense= page_dir_get_n_heap(page_zip->data) - (PAGE_HEAP_NO_USER_LOW + + 1U); + + byte *dir= page_zip->data + page_zip_get_size(page_zip) - + PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + byte *stored= dir; + + if (!page_is_leaf(page_zip->data)) + { + ut_ad(!page_zip->n_blobs); + stored-= n_dense * REC_NODE_PTR_SIZE; + } + else if (index->is_clust()) + { + /* Move the BLOB pointer array backwards to make space for the + columns DB_TRX_ID,DB_ROLL_PTR and the dense directory slot. */ + + stored-= n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + byte *externs= stored - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + byte *dst= externs - PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; + ut_ad(!memcmp(dst, field_ref_zero, PAGE_ZIP_CLUST_LEAF_SLOT_SIZE)); + if (const ulint len = ulint(stored - externs)) + { + memmove(dst, externs, len); + mtr->memmove(*block, dst - page_zip->data, externs - page_zip->data, + len); + } + } + else + { + stored-= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(!memcmp(stored - PAGE_ZIP_DIR_SLOT_SIZE, field_ref_zero, + PAGE_ZIP_DIR_SLOT_SIZE)); + } + + /* Move the uncompressed area backwards to make space + for one directory slot. */ + if (const ulint len = ulint(dir - stored)) + { + byte* dst = stored - PAGE_ZIP_DIR_SLOT_SIZE; + memmove(dst, stored, len); + mtr->memmove(*block, dst - page_zip->data, stored - page_zip->data, len); + } +} + +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
+ +@return pointer to inserted record +@return nullptr on failure */ +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + page_cur_t* cursor, /*!< in/out: page cursor, + logical position unchanged */ + const rec_t* rec, /*!< in: pointer to a physical record */ + rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor); + page_t * const page= cursor->block->page.frame; + dict_index_t * const index = cursor->index; + + ut_ad(page_zip); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(index->table->not_redundant()); + ut_ad(page_is_comp(page)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX || + fil_page_get_type(page) == FIL_PAGE_RTREE); + ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + page) == + index->id || mtr->is_inside_ibuf()); + ut_ad(!page_get_instant(page)); + ut_ad(!page_cur_is_after_last(cursor)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + const ulint rec_size= rec_offs_size(offsets); + +#ifdef HAVE_MEM_CHECK + { + const void *rec_start __attribute__((unused))= + rec - rec_offs_extra_size(offsets); + ulint extra_size __attribute__((unused))= + rec_offs_extra_size(offsets) - REC_N_NEW_EXTRA_BYTES; + /* All data bytes of the record must be valid. */ + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + MEM_CHECK_DEFINED(rec_start, extra_size); + } +#endif /* HAVE_MEM_CHECK */ + const bool reorg_before_insert= page_has_garbage(page) && + rec_size > page_get_max_insert_size(page, 1) && + rec_size <= page_get_max_insert_size_after_reorganize(page, 1); + constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER; + byte* const page_free = my_assume_aligned<4>(page_free_f + page); + uint16_t free_rec= 0; + + /* 2. Try to find suitable space from page memory management */ + ulint heap_no; + byte *insert_buf; + + if (reorg_before_insert || + !page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + { + /* SET GLOBAL might be executed concurrently. Sample the value once. */ + ulint level= page_zip_level; +#ifdef UNIV_DEBUG + const rec_t * const cursor_rec= page_cur_get_rec(cursor); +#endif /* UNIV_DEBUG */ + + if (page_is_empty(page)) + { + ut_ad(page_cur_is_before_first(cursor)); + + /* This is an empty page. Recreate to remove the modification log. */ + page_create_zip(cursor->block, index, + page_header_get_field(page, PAGE_LEVEL), 0, mtr); + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + goto use_heap; + + /* The cursor should remain on the page infimum. 
*/ + return nullptr; + } + + if (page_zip->m_nonempty || page_has_garbage(page)) + { + ulint pos= page_rec_get_n_recs_before(cursor->rec); + + if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) + return nullptr; + + switch (page_zip_reorganize(cursor->block, index, level, mtr, true)) { + case DB_FAIL: + ut_ad(cursor->rec == cursor_rec); + return nullptr; + case DB_SUCCESS: + break; + default: + return nullptr; + } + + if (!pos) + ut_ad(cursor->rec == page + PAGE_NEW_INFIMUM); + else if (!(cursor->rec= page_rec_get_nth(page, pos))) + { + cursor->rec= page + PAGE_NEW_SUPREMUM; + return nullptr; + } + + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available(page_zip, index->is_clust(), rec_size, 1)) + goto use_heap; + } + + /* Try compressing the whole page afterwards. */ + const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE); + rec_t *insert_rec= page_cur_insert_rec_low(cursor, rec, offsets, mtr); + mtr->set_log_mode(log_mode); + + if (insert_rec) + { + ulint pos= page_rec_get_n_recs_before(insert_rec); + if (UNIV_UNLIKELY(!pos || pos == ULINT_UNDEFINED)) + return nullptr; + + /* We are writing entire page images to the log. Reduce the redo + log volume by reorganizing the page at the same time. */ + switch (page_zip_reorganize(cursor->block, index, level, mtr)) { + case DB_SUCCESS: + /* The page was reorganized: Seek to pos. */ + if (pos <= 1) + cursor->rec= page + PAGE_NEW_INFIMUM; + else if (!(cursor->rec= page_rec_get_nth(page, pos - 1))) + { + cursor->rec= page + PAGE_NEW_INFIMUM; + return nullptr; + } + insert_rec= page + rec_get_next_offs(cursor->rec, 1); + rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets); + break; + case DB_FAIL: + /* Theoretically, we could try one last resort of + page_zip_reorganize() followed by page_zip_available(), but that + would be very unlikely to succeed. (If the full reorganized page + failed to compress, why would it succeed to compress the page, + plus log the insert of this record?) */ + + /* Out of space: restore the page */ + if (!page_zip_decompress(page_zip, page, false)) + ut_error; /* Memory corrupted? */ + ut_ad(page_validate(page, index)); + /* fall through */ + default: + insert_rec= nullptr; + } + } + return insert_rec; + } + + free_rec= mach_read_from_2(page_free); + if (free_rec) + { + /* Try to allocate from the head of the free list. */ + rec_offs foffsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t *heap= nullptr; + + rec_offs_init(foffsets_); + + rec_offs *foffsets= rec_get_offsets(page + free_rec, index, foffsets_, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + insert_buf= page + free_rec - rec_offs_extra_size(foffsets); + + if (rec_offs_size(foffsets) < rec_size) + { +too_small: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + free_rec= 0; + goto use_heap; + } + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + const ssize_t extra_size_diff= lint(rec_offs_extra_size(offsets) - + rec_offs_extra_size(foffsets)); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) + { + /* Add an offset to the extra_size. 
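   For illustration with hypothetical sizes (not taken from the source): if
   the freed record carried a 10-byte header and the new record needs only 7,
   extra_size_diff is -3, so insert_buf is advanced by 3 bytes and
   insert_buf + rec_offs_extra_size(offsets) still lands on the old record
   origin page + free_rec. The 3 unused leading header bytes are simply
   wasted, which is why the guard below additionally requires the freed slot
   to be at least that much larger than rec_size; keeping the origin fixed is
   what the "do not relocate records from the free list" rule above demands.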
*/ + if (rec_offs_size(foffsets) < rec_size - ssize_t(extra_size_diff)) + goto too_small; + + insert_buf-= extra_size_diff; + } + else if (UNIV_UNLIKELY(extra_size_diff)) + /* Do not allow extra_size to grow */ + goto too_small; + + byte *const free_rec_ptr= page + free_rec; + heap_no= rec_get_heap_no_new(free_rec_ptr); + int16_t next_free= mach_read_from_2(free_rec_ptr - REC_NEXT); + /* With innodb_page_size=64k, int16_t would be unsafe to use here, + but that cannot be used with ROW_FORMAT=COMPRESSED. */ + static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility"); + if (next_free) + { + next_free= static_cast<int16_t>(next_free + free_rec); + if (UNIV_UNLIKELY(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} > + next_free || + uint16_t(next_free) >= srv_page_size)) + { + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return nullptr; + } + } + + byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]); + mach_write_to_2(hdr, static_cast<uint16_t>(next_free)); + const byte *const garbage= my_assume_aligned<2>(page_free + 2); + ut_ad(mach_read_from_2(garbage) >= rec_size); + mach_write_to_2(my_assume_aligned<2>(hdr + 2), + mach_read_from_2(garbage) - rec_size); + static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility"); + mtr->memcpy(*cursor->block, page_free, hdr, 4); + + if (!page_is_leaf(page)) + { + /* Zero out the node pointer of free_rec, in case it will not be + overwritten by insert_rec. */ + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_size(foffsets) > rec_size) + memset(rec_get_end(free_rec_ptr, foffsets) - + REC_NODE_PTR_SIZE, 0, REC_NODE_PTR_SIZE); + } + else if (index->is_clust()) + { + /* Zero out DB_TRX_ID,DB_ROLL_PTR in free_rec, in case they will + not be overwritten by insert_rec. */ + + ulint len; + ulint trx_id_offs= rec_get_nth_field_offs(foffsets, index->db_trx_id(), + &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) + memset(free_rec_ptr + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(free_rec_ptr + trx_id_offs + DATA_TRX_ID_LEN == + rec_get_nth_field(free_rec_ptr, foffsets, index->db_roll_ptr(), + &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } + else + { +use_heap: + ut_ad(!free_rec); + insert_buf= page_mem_alloc_heap<true>(cursor->block, rec_size, &heap_no); + + if (UNIV_UNLIKELY(!insert_buf)) + return insert_buf; + + static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility"); + mtr->memcpy(*cursor->block, PAGE_HEAP_TOP + PAGE_HEADER, 4); + page_zip_dir_add_slot(cursor->block, index, mtr); + } + + /* next record after current before the insertion */ + const rec_t *next_rec = page_rec_get_next_low(cursor->rec, TRUE); + if (UNIV_UNLIKELY(!next_rec || + rec_get_status(next_rec) == REC_STATUS_INFIMUM || + rec_get_status(cursor->rec) > REC_STATUS_INFIMUM)) + return nullptr; + + /* 3. Create the record */ + byte *insert_rec= rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(cursor->rec != insert_rec); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + + mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t> + (next_rec - insert_rec)); + mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t> + (insert_rec - cursor->rec)); + byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page); + mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs)); + memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_bit_field_1(insert_rec, 0, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + rec_set_bit_field_2(insert_rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + + MEM_CHECK_DEFINED(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + /* 6. Update the last insertion info in page header */ + byte *last_insert= my_assume_aligned<4>(PAGE_LAST_INSERT + PAGE_HEADER + + page_zip->data); + const uint16_t last_insert_rec= mach_read_from_2(last_insert); + ut_ad(!last_insert_rec || + rec_get_node_ptr_flag(page + last_insert_rec) == + rec_get_node_ptr_flag(insert_rec)); + mach_write_to_2(last_insert, page_offset(insert_rec)); + + if (!index->is_spatial()) + { + byte *dir= &page_zip->data[PAGE_HEADER + PAGE_DIRECTION_B]; + ut_ad(!(*dir & ~((1U << 3) - 1))); + byte *n= my_assume_aligned<2> + (&page_zip->data[PAGE_HEADER + PAGE_N_DIRECTION]); + if (UNIV_UNLIKELY(!last_insert_rec)) + { +no_direction: + *dir= PAGE_NO_DIRECTION; + memset(n, 0, 2); + } + else if (*dir != PAGE_LEFT && page + last_insert_rec == cursor->rec) + { + *dir= PAGE_RIGHT; +inc_dir: + mach_write_to_2(n, mach_read_from_2(n) + 1); + } + else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) == + page + last_insert_rec) + { + *dir= PAGE_LEFT; + goto inc_dir; + } + else + goto no_direction; + } + + /* Write the header fields in one record. */ + mtr->memcpy(*cursor->block, + my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER + page), + my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER + + page_zip->data), + PAGE_N_RECS - PAGE_LAST_INSERT + 2); + + /* 7. It remains to update the owner record. */ + ulint n_owned; + + while (!(n_owned= rec_get_n_owned_new(next_rec))) + if (!(next_rec= page_rec_get_next_low(next_rec, true))) + return nullptr; + + rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1, + REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + + page_zip_dir_insert(cursor, free_rec, insert_rec, mtr); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + { + const ulint owner= page_dir_find_owner_slot(next_rec); + if (UNIV_UNLIKELY(owner == ULINT_UNDEFINED)) + { + page_cur_directory_corrupted(*cursor->block, *index); + return nullptr; + } + page_zip_dir_split_slot(cursor->block, owner, mtr); + } + + page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr); + return insert_rec; +} + +/** Prepend a record to the PAGE_FREE list, or shrink PAGE_HEAP_TOP. 
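This is used on uncompressed pages only. Illustrative call, mirroring how
page_cur_delete_rec() further below invokes it (the names are the ones used
there):

    const size_t data_size  = rec_offs_data_size(offsets);
    const size_t extra_size = rec_offs_extra_size(offsets);
    page_mem_free(*block, current_rec, data_size, extra_size);

If the record is the one most recently allocated from the heap, PAGE_HEAP_TOP
and PAGE_N_HEAP are simply wound back; otherwise the record is prepended to
the PAGE_FREE list and PAGE_GARBAGE grows by the record's total size.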
+@param[in,out] block index page +@param[in,out] rec record being deleted +@param[in] data_size record payload size, in bytes +@param[in] extra_size record header size, in bytes */ +static void page_mem_free(const buf_block_t &block, rec_t *rec, + size_t data_size, size_t extra_size) +{ + ut_ad(page_align(rec) == block.page.frame); + ut_ad(!block.page.zip.data); + const rec_t *free= page_header_get_ptr(block.page.frame, PAGE_FREE); + + const uint16_t n_heap= uint16_t(page_header_get_field(block.page.frame, + PAGE_N_HEAP) - 1); + ut_ad(page_get_n_recs(block.page.frame) < (n_heap & 0x7fff)); + const bool deleting_top= n_heap == ((n_heap & 0x8000) + ? (rec_get_heap_no_new(rec) | 0x8000) + : rec_get_heap_no_old(rec)); + + if (deleting_top) + { + byte *page_heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER + + block.page.frame); + const uint16_t heap_top= mach_read_from_2(page_heap_top); + const size_t extra_savings= heap_top - page_offset(rec + data_size); + ut_ad(extra_savings < heap_top); + + /* When deleting the last record, do not add it to the PAGE_FREE list. + Instead, decrement PAGE_HEAP_TOP and PAGE_N_HEAP. */ + mach_write_to_2(page_heap_top, page_offset(rec - extra_size)); + mach_write_to_2(my_assume_aligned<2>(page_heap_top + 2), n_heap); + static_assert(PAGE_N_HEAP == PAGE_HEAP_TOP + 2, "compatibility"); + if (extra_savings) + { + byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER + + block.page.frame); + uint16_t garbage= mach_read_from_2(page_garbage); + ut_ad(garbage >= extra_savings); + mach_write_to_2(page_garbage, garbage - extra_savings); + } + } + else + { + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + block.page.frame); + byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER + + block.page.frame); + mach_write_to_2(page_free, page_offset(rec)); + mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) + + extra_size + data_size); + } + + memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.page.frame, 0, 2); + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + + block.page.frame); + mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) - 1); + + const byte* const end= rec + data_size; + + if (!deleting_top) + { + uint16_t next= free + ? ((n_heap & 0x8000) + ? static_cast<uint16_t>(free - rec) + : static_cast<uint16_t>(free - block.page.frame)) + : uint16_t{0}; + mach_write_to_2(rec - REC_NEXT, next); + } + else + rec-= extra_size; + + memset(rec, 0, end - rec); +} + +/***********************************************************//** +Deletes a record at the page cursor. The cursor is moved to the next +record after the deleted one. */ +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const rec_offs* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_dir_slot_t* cur_dir_slot; + rec_t* current_rec; + rec_t* prev_rec = NULL; + rec_t* next_rec; + ulint cur_n_owned; + rec_t* rec; + + /* page_zip_validate() will fail here when + btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark(). + Then, both "page_zip" and "block->page.frame" would have the + min-rec-mark set on the smallest user record, but + "block->page.frame" would additionally have it set on the + smallest-but-one record. Because sloppy + page_zip_validate_low() only ignores min-rec-flag differences + in the smallest user record, it cannot be used here either. 
*/ + + current_rec = cursor->rec; + const dict_index_t* const index = cursor->index; + buf_block_t* const block = cursor->block; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(block->page.frame) + == index->table->not_redundant()); + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame) + == index->id + || mtr->is_inside_ibuf()); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The record must not be the supremum or infimum record. */ + ut_ad(page_rec_is_user_rec(current_rec)); + + if (page_get_n_recs(block->page.frame) == 1 + && !rec_is_alter_metadata(current_rec, *index)) { + /* Empty the page. */ + ut_ad(page_is_leaf(block->page.frame)); + /* Usually, this should be the root page, + and the whole index tree should become empty. + However, this could also be a call in + btr_cur_pessimistic_update() to delete the only + record in the page and to insert another one. */ + ut_ad(page_rec_is_supremum(page_rec_get_next(cursor->rec))); + page_cur_set_after_last(block, cursor); + page_create_empty(page_cur_get_block(cursor), + const_cast<dict_index_t*>(index), mtr); + return; + } + + /* Save to local variables some data associated with current_rec */ + ulint cur_slot_no = page_dir_find_owner_slot(current_rec); + + if (UNIV_UNLIKELY(!cur_slot_no || cur_slot_no == ULINT_UNDEFINED)) { + /* Avoid crashing due to a corrupted page. */ + page_cur_directory_corrupted(*block, *index); + return; + } + + cur_dir_slot = page_dir_get_nth_slot(block->page.frame, cur_slot_no); + cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); + + /* The page gets invalid for btr_pcur_restore_pos(). + We avoid invoking buf_block_modify_clock_inc(block) because its + consistency checks would fail for the dummy block that is being + used during IMPORT TABLESPACE. */ + block->modify_clock++; + + /* Find the next and the previous record. Note that the cursor is + left at the next record. */ + + rec = const_cast<rec_t*> + (page_dir_slot_get_rec(cur_dir_slot + PAGE_DIR_SLOT_SIZE)); + + /* rec now points to the record of the previous directory slot. Look + for the immediate predecessor of current_rec in a loop. */ + + while (current_rec != rec) { + prev_rec = rec; + if (!(rec = page_rec_get_next(rec))) { + /* Avoid crashing due to a corrupted page. */ + return; + } + } + + if (!(next_rec = page_cur_move_to_next(cursor))) { + /* Avoid crashing due to a corrupted page. */ + return; + } + + /* Remove the record from the linked list of records */ + /* If the deleted record is pointed to by a dir slot, update the + record pointer in slot. In the following if-clause we assume that + prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED + >= 2. 
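   A sketch of the typical case (the ownership count is hypothetical):

       before:  slot k -> current_rec, n_owned = 5
                list: ... prev_rec -> current_rec -> next_rec ...
       after:   slot k -> prev_rec,    n_owned = 4
                list: ... prev_rec -> next_rec ...

   When current_rec is the record the slot points to, the slot is re-pointed
   at prev_rec, which then carries the decremented ownership count; the
   assertion cur_n_owned > 1 below guarantees that prev_rec is owned by the
   same slot in that case.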
*/ + /* Update the number of owned records of the slot */ + + compile_time_assert(PAGE_DIR_SLOT_MIN_N_OWNED >= 2); + ut_ad(cur_n_owned > 1); + + rec_t* slot_rec = const_cast<rec_t*> + (page_dir_slot_get_rec(cur_dir_slot)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(page_is_comp(block->page.frame)); + if (current_rec == slot_rec) { + page_zip_rec_set_owned(block, prev_rec, 1, mtr); + page_zip_rec_set_owned(block, slot_rec, 0, mtr); + slot_rec = prev_rec; + mach_write_to_2(cur_dir_slot, page_offset(slot_rec)); + } else if (cur_n_owned == 1 + && !page_rec_is_supremum(slot_rec)) { + page_zip_rec_set_owned(block, slot_rec, 0, mtr); + } + + mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t> + (next_rec - prev_rec)); + slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>( + (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK) + | (cur_n_owned - 1) << REC_N_OWNED_SHIFT); + + page_header_reset_last_insert(block, mtr); + page_zip_dir_delete(block, rec, index, offsets, + page_header_get_ptr(block->page.frame, + PAGE_FREE), + mtr); + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + page_zip_dir_balance_slot(block, cur_slot_no, mtr); + } + return; + } + + if (current_rec == slot_rec) { + slot_rec = prev_rec; + mach_write_to_2(cur_dir_slot, page_offset(slot_rec)); + } + + const size_t data_size = rec_offs_data_size(offsets); + const size_t extra_size = rec_offs_extra_size(offsets); + + if (page_is_comp(block->page.frame)) { + mtr->page_delete(*block, page_offset(prev_rec) + - PAGE_NEW_INFIMUM, + extra_size - REC_N_NEW_EXTRA_BYTES, + data_size); + mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t> + (next_rec - prev_rec)); + slot_rec[-REC_NEW_N_OWNED] = static_cast<byte>( + (slot_rec[-REC_NEW_N_OWNED] & ~REC_N_OWNED_MASK) + | (cur_n_owned - 1) << REC_N_OWNED_SHIFT); + } else { + mtr->page_delete(*block, page_offset(prev_rec) + - PAGE_OLD_INFIMUM); + memcpy(prev_rec - REC_NEXT, current_rec - REC_NEXT, 2); + slot_rec[-REC_OLD_N_OWNED] = static_cast<byte>( + (slot_rec[-REC_OLD_N_OWNED] & ~REC_N_OWNED_MASK) + | (cur_n_owned - 1) << REC_N_OWNED_SHIFT); + } + + page_mem_free(*block, current_rec, data_size, extra_size); + + /* Now we have decremented the number of owned records of the slot. + If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the + slots. */ + + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + page_dir_balance_slot(*block, cur_slot_no); + } + + ut_ad(page_is_comp(block->page.frame) + ? page_simple_validate_new(block->page.frame) + : page_simple_validate_old(block->page.frame)); +} + +/** Apply a INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was +written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page. 
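This replays a logged insert (for example during crash recovery) rather than
re-running the original page_cur_insert_rec_low() call. The record image is
rebuilt partly from literal bytes in data and partly by copying bytes shared
with the predecessor record; an editor's sketch of the layout, derived from
the copy sequence in the function body:

    header bytes, listed from lowest address up to the record origin:
        [ literal header bytes from data ][ hdr_c bytes copied from prev_rec ]
    data bytes, from the record origin upwards:
        [ data_c bytes copied from prev_rec ][ remaining literal bytes ]

so the literal bytes in data are exactly those header and data bytes that are
not shared with prev_rec.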
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC +@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE +@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM +@param enc_hdr encoded fixed-size header bits +@param hdr_c number of common record header bytes with prev +@param data_c number of common data bytes with prev +@param data literal header and data bytes +@param data_len length of the literal data, in bytes +@return whether the operation failed (inconcistency was noticed) */ +bool page_apply_insert_redundant(const buf_block_t &block, bool reuse, + ulint prev, ulint enc_hdr, + size_t hdr_c, size_t data_c, + const void *data, size_t data_len) +{ + page_t * const page= block.page.frame; + const uint16_t n_slots= page_dir_get_n_slots(page); + byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + page); + const uint16_t h= mach_read_from_2(page_n_heap); + const page_id_t id(block.page.id()); + if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW || + h >= srv_page_size / REC_N_OLD_EXTRA_BYTES || + !fil_page_index_page_check(page) || + page_get_page_no(page) != id.page_no() || + mach_read_from_2(my_assume_aligned<2> + (PAGE_OLD_SUPREMUM - REC_NEXT + page)))) + { +corrupted: + ib::error() << (reuse + ? "Not applying INSERT_REUSE_REDUNDANT" + " due to corruption on " + : "Not applying INSERT_HEAP_REDUNDANT" + " due to corruption on ") + << id; + return true; + } + + byte * const last_slot= page_dir_get_nth_slot(page, n_slots - 1); + byte * const page_heap_top= my_assume_aligned<2> + (PAGE_HEAP_TOP + PAGE_HEADER + page); + const byte *const heap_bot= &page[PAGE_OLD_SUPREMUM_END]; + byte *heap_top= page + mach_read_from_2(page_heap_top); + if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot)) + goto corrupted; + if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_OLD_SUPREMUM)) + goto corrupted; + if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(page, 0)) != + PAGE_OLD_INFIMUM)) + goto corrupted; + rec_t * const prev_rec= page + PAGE_OLD_INFIMUM + prev; + if (!prev); + else if (UNIV_UNLIKELY(heap_bot + (REC_N_OLD_EXTRA_BYTES + 1) > prev_rec || + prev_rec > heap_top)) + goto corrupted; + const ulint pn_fields= rec_get_bit_field_2(prev_rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + if (UNIV_UNLIKELY(pn_fields == 0 || pn_fields > REC_MAX_N_FIELDS)) + goto corrupted; + const ulint pextra_size= REC_N_OLD_EXTRA_BYTES + + (rec_get_1byte_offs_flag(prev_rec) ? pn_fields : pn_fields * 2); + if (prev_rec == &page[PAGE_OLD_INFIMUM]); + else if (UNIV_UNLIKELY(prev_rec - pextra_size < heap_bot)) + goto corrupted; + if (UNIV_UNLIKELY(hdr_c && prev_rec - hdr_c < heap_bot)) + goto corrupted; + const ulint pdata_size= rec_get_data_size_old(prev_rec); + if (UNIV_UNLIKELY(prev_rec + pdata_size > heap_top)) + goto corrupted; + rec_t * const next_rec= page + mach_read_from_2(prev_rec - REC_NEXT); + if (next_rec == page + PAGE_OLD_SUPREMUM); + else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > next_rec || + next_rec > heap_top)) + goto corrupted; + const bool is_short= (enc_hdr >> 2) & 1; + const ulint n_fields= (enc_hdr >> 3) + 1; + if (UNIV_UNLIKELY(n_fields > REC_MAX_N_FIELDS)) + goto corrupted; + const ulint extra_size= REC_N_OLD_EXTRA_BYTES + + (is_short ? 
n_fields : n_fields * 2); + hdr_c+= REC_N_OLD_EXTRA_BYTES; + if (UNIV_UNLIKELY(hdr_c > extra_size)) + goto corrupted; + if (UNIV_UNLIKELY(extra_size - hdr_c > data_len)) + goto corrupted; + /* We buffer all changes to the record header locally, so that + we will avoid modifying the page before all consistency checks + have been fulfilled. */ + alignas(2) byte insert_buf[REC_N_OLD_EXTRA_BYTES + REC_MAX_N_FIELDS * 2]; + + ulint n_owned; + rec_t *owner_rec= next_rec; + for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED; + !(n_owned= rec_get_n_owned_old(owner_rec)); ) + { + owner_rec= page + mach_read_from_2(owner_rec - REC_NEXT); + if (owner_rec == &page[PAGE_OLD_SUPREMUM]); + else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > owner_rec || + owner_rec > heap_top)) + goto corrupted; + if (!ns--) + goto corrupted; /* Corrupted (cyclic?) next-record list */ + } + + page_dir_slot_t *owner_slot= last_slot; + + if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED) + goto corrupted; + else + { + mach_write_to_2(insert_buf, owner_rec - page); + static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility"); + const page_dir_slot_t * const first_slot= + page_dir_get_nth_slot(page, 0); + + while (memcmp_aligned<2>(owner_slot, insert_buf, 2)) + if ((owner_slot+= 2) == first_slot) + goto corrupted; + } + + memcpy(insert_buf, data, extra_size - hdr_c); + byte *insert_rec= &insert_buf[extra_size]; + memcpy(insert_rec - hdr_c, prev_rec - hdr_c, hdr_c); + rec_set_bit_field_1(insert_rec, (enc_hdr & 3) << 4, + REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, + REC_INFO_BITS_SHIFT); + rec_set_1byte_offs_flag(insert_rec, is_short); + rec_set_n_fields_old(insert_rec, n_fields); + rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + + const ulint data_size= rec_get_data_size_old(insert_rec); + if (UNIV_UNLIKELY(data_c > data_size)) + goto corrupted; + if (UNIV_UNLIKELY(extra_size - hdr_c + data_size - data_c != data_len)) + goto corrupted; + + /* Perform final consistency checks and then apply the change to the page. */ + byte *buf; + if (reuse) + { + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + page); + rec_t *free_rec= page + mach_read_from_2(page_free); + if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > free_rec || + free_rec > heap_top)) + goto corrupted; + const ulint fn_fields= rec_get_n_fields_old(free_rec); + const ulint fextra_size= REC_N_OLD_EXTRA_BYTES + + (rec_get_1byte_offs_flag(free_rec) ? 
fn_fields : fn_fields * 2); + if (UNIV_UNLIKELY(free_rec - fextra_size < heap_bot)) + goto corrupted; + const ulint fdata_size= rec_get_data_size_old(free_rec); + if (UNIV_UNLIKELY(free_rec + fdata_size > heap_top)) + goto corrupted; + if (UNIV_UNLIKELY(extra_size + data_size > fextra_size + fdata_size)) + goto corrupted; + byte *page_garbage= my_assume_aligned<2>(page_free + 2); + if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) < + fextra_size + fdata_size)) + goto corrupted; + buf= free_rec - fextra_size; + const rec_t *const next_free= page + + mach_read_from_2(free_rec - REC_NEXT); + if (next_free == page); + else if (UNIV_UNLIKELY(next_free < &heap_bot[REC_N_OLD_EXTRA_BYTES + 1] || + heap_top < next_free)) + goto corrupted; + mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - + extra_size - data_size); + rec_set_bit_field_2(insert_rec, rec_get_heap_no_old(free_rec), + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + memcpy(page_free, free_rec - REC_NEXT, 2); + } + else + { + if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot)) + goto corrupted; + rec_set_bit_field_2(insert_rec, h, + REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + mach_write_to_2(page_n_heap, h + 1); + mach_write_to_2(page_heap_top, + mach_read_from_2(page_heap_top) + extra_size + data_size); + buf= heap_top; + } + + ut_ad(data_size - data_c == data_len - (extra_size - hdr_c)); + byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + + page); + const uint16_t last_insert= mach_read_from_2(page_last_insert); + memcpy(buf, insert_buf, extra_size); + buf+= extra_size; + mach_write_to_2(page_last_insert, buf - page); + memcpy(prev_rec - REC_NEXT, page_last_insert, 2); + memcpy(buf, prev_rec, data_c); + memcpy(buf + data_c, static_cast<const byte*>(data) + (extra_size - hdr_c), + data_len - (extra_size - hdr_c)); + rec_set_bit_field_1(owner_rec, n_owned + 1, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + + /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */ + if (page[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE)) + { + byte *dir= &page[PAGE_DIRECTION_B + PAGE_HEADER]; + byte *n_dir= my_assume_aligned<2> + (&page[PAGE_N_DIRECTION + PAGE_HEADER]); + if (UNIV_UNLIKELY(!last_insert)) + { +no_direction: + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION); + memset(n_dir, 0, 2); + } + else if (page + last_insert == prev_rec && + (*dir & ((1U << 3) - 1)) != PAGE_LEFT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT); +inc_dir: + mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1); + } + else if (next_rec == page + last_insert && + (*dir & ((1U << 3) - 1)) != PAGE_RIGHT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT); + goto inc_dir; + } + else + goto no_direction; + } + + /* Update PAGE_N_RECS. */ + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page); + + mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1); + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + return page_dir_split_slot(block, owner_slot); + ut_ad(page_simple_validate_old(page)); + return false; +} + +/** Apply a INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was +written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page. 
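The COMPACT/DYNAMIC counterpart of the function above. Two encodings used by
the parameters are worth spelling out; this is an editor's paraphrase of how
the function body decodes them, not an additional API:

    size_t literal_hdr_len = enc_hdr_l >> 3;   /* literal header bytes in data */
    byte   info_bits = static_cast<byte>((enc_hdr_l & 3) << 4);
    /* the low bit of shift encodes the sign of the displacement of the
    reused area relative to free_rec - extra_size: */
    ptrdiff_t displacement = (shift & 1) ? -ptrdiff_t(shift >> 1)
                                         : ptrdiff_t(shift >> 1);

The REC_STATUS_INSTANT bit of enc_hdr_l additionally feeds into the status
byte of the rebuilt record, as checked at the start of the function.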
+@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC +@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE +@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM +@param shift unless !reuse: number of bytes the PAGE_FREE is moving +@param enc_hdr_l number of copied record header bytes, plus record type bits +@param hdr_c number of common record header bytes with prev +@param data_c number of common data bytes with prev +@param data literal header and data bytes +@param data_len length of the literal data, in bytes +@return whether the operation failed (inconcistency was noticed) */ +bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse, + ulint prev, ulint shift, ulint enc_hdr_l, + size_t hdr_c, size_t data_c, + const void *data, size_t data_len) +{ + page_t * const page= block.page.frame; + const uint16_t n_slots= page_dir_get_n_slots(page); + byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + page); + ulint h= mach_read_from_2(page_n_heap); + const page_id_t id(block.page.id()); + if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) || + (h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES || + (h & 0x7fff) < n_slots || + !fil_page_index_page_check(page) || + page_get_page_no(page) != id.page_no() || + mach_read_from_2(my_assume_aligned<2> + (PAGE_NEW_SUPREMUM - REC_NEXT + page)) || + ((enc_hdr_l & REC_STATUS_INSTANT) && + !page_is_leaf(page)) || + (enc_hdr_l >> 3) > data_len)) + { +corrupted: + ib::error() << (reuse + ? "Not applying INSERT_REUSE_DYNAMIC" + " due to corruption on " + : "Not applying INSERT_HEAP_DYNAMIC" + " due to corruption on ") + << id; + return true; + } + + byte * const last_slot= page_dir_get_nth_slot(page, n_slots - 1); + byte * const page_heap_top= my_assume_aligned<2> + (PAGE_HEAP_TOP + PAGE_HEADER + page); + const byte *const heap_bot= &page[PAGE_NEW_SUPREMUM_END]; + byte *heap_top= page + mach_read_from_2(page_heap_top); + if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot)) + goto corrupted; + if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_NEW_SUPREMUM)) + goto corrupted; + if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(page, 0)) != + PAGE_NEW_INFIMUM)) + goto corrupted; + + uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev); + rec_t *prev_rec= page + n; + n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT)); + if (!prev); + else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > prev_rec || + prev_rec > heap_top)) + goto corrupted; + + rec_t * const next_rec= page + n; + if (next_rec == page + PAGE_NEW_SUPREMUM); + else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > next_rec || + next_rec > heap_top)) + goto corrupted; + + ulint n_owned; + rec_t *owner_rec= next_rec; + n= static_cast<uint16_t>(next_rec - page); + + for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED; + !(n_owned= rec_get_n_owned_new(owner_rec)); ) + { + n= static_cast<uint16_t>(n + mach_read_from_2(owner_rec - REC_NEXT)); + owner_rec= page + n; + if (n == PAGE_NEW_SUPREMUM); + else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > owner_rec || + owner_rec > heap_top)) + goto corrupted; + if (!ns--) + goto corrupted; /* Corrupted (cyclic?) 
next-record list */ + } + + page_dir_slot_t* owner_slot= last_slot; + + if (n_owned > PAGE_DIR_SLOT_MAX_N_OWNED) + goto corrupted; + else + { + static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility"); + alignas(2) byte slot_buf[2]; + mach_write_to_2(slot_buf, owner_rec - page); + const page_dir_slot_t * const first_slot= + page_dir_get_nth_slot(page, 0); + + while (memcmp_aligned<2>(owner_slot, slot_buf, 2)) + if ((owner_slot+= 2) == first_slot) + goto corrupted; + } + + const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_c + (enc_hdr_l >> 3); + const ulint data_size= data_c + data_len - (enc_hdr_l >> 3); + + /* Perform final consistency checks and then apply the change to the page. */ + byte *buf; + if (reuse) + { + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + page); + rec_t *free_rec= page + mach_read_from_2(page_free); + if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > free_rec || + free_rec > heap_top)) + goto corrupted; + buf= free_rec - extra_size; + if (shift & 1) + buf-= shift >> 1; + else + buf+= shift >> 1; + + if (UNIV_UNLIKELY(heap_bot > buf || + &buf[extra_size + data_size] > heap_top)) + goto corrupted; + byte *page_garbage= my_assume_aligned<2>(page_free + 2); + if (UNIV_UNLIKELY(mach_read_from_2(page_garbage) < extra_size + data_size)) + goto corrupted; + if ((n= mach_read_from_2(free_rec - REC_NEXT)) != 0) + { + n= static_cast<uint16_t>(n + free_rec - page); + if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES || + heap_top < page + n)) + goto corrupted; + } + mach_write_to_2(page_free, n); + mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) - + (extra_size + data_size)); + h= rec_get_heap_no_new(free_rec); + } + else + { + if (UNIV_UNLIKELY(heap_top + extra_size + data_size > last_slot)) + goto corrupted; + mach_write_to_2(page_n_heap, h + 1); + h&= 0x7fff; + mach_write_to_2(page_heap_top, + mach_read_from_2(page_heap_top) + extra_size + data_size); + buf= heap_top; + } + + memcpy(buf, data, (enc_hdr_l >> 3)); + buf+= enc_hdr_l >> 3; + data_len-= enc_hdr_l >> 3; + data= &static_cast<const byte*>(data)[enc_hdr_l >> 3]; + + memcpy(buf, prev_rec - REC_N_NEW_EXTRA_BYTES - hdr_c, hdr_c); + buf+= hdr_c; + *buf++= static_cast<byte>((enc_hdr_l & 3) << 4); /* info_bits; n_owned=0 */ + *buf++= static_cast<byte>(h >> 5); /* MSB of heap number */ + h= (h & ((1U << 5) - 1)) << 3; + static_assert(REC_STATUS_ORDINARY == 0, "compatibility"); + static_assert(REC_STATUS_INSTANT == 4, "compatibility"); + if (page_is_leaf(page)) + h|= enc_hdr_l & REC_STATUS_INSTANT; + else + { + ut_ad(!(enc_hdr_l & REC_STATUS_INSTANT)); /* Checked at the start */ + h|= REC_STATUS_NODE_PTR; + } + *buf++= static_cast<byte>(h); /* LSB of heap number, and status */ + static_assert(REC_NEXT == 2, "compatibility"); + buf+= REC_NEXT; + mach_write_to_2(buf - REC_NEXT, static_cast<uint16_t>(next_rec - buf)); + byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + + page); + const uint16_t last_insert= mach_read_from_2(page_last_insert); + mach_write_to_2(page_last_insert, buf - page); + mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(buf - prev_rec)); + memcpy(buf, prev_rec, data_c); + buf+= data_c; + memcpy(buf, data, data_len); + + rec_set_bit_field_1(owner_rec, n_owned + 1, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + + /* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */ + if (page[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE)) + { + byte *dir= &page[PAGE_DIRECTION_B + PAGE_HEADER]; + byte *n_dir= 
my_assume_aligned<2>(&page[PAGE_N_DIRECTION + PAGE_HEADER]); + if (UNIV_UNLIKELY(!last_insert)) + { +no_direction: + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION); + memset(n_dir, 0, 2); + } + else if (page + last_insert == prev_rec && + (*dir & ((1U << 3) - 1)) != PAGE_LEFT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT); +inc_dir: + mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1); + } + else if (next_rec == page + last_insert && + (*dir & ((1U << 3) - 1)) != PAGE_RIGHT) + { + *dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT); + goto inc_dir; + } + else + goto no_direction; + } + + /* Update PAGE_N_RECS. */ + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page); + + mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1); + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) + return page_dir_split_slot(block, owner_slot); + ut_ad(page_simple_validate_new(page)); + return false; +} + +/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by +page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page. +@param block B-tree or R-tree page in ROW_FORMAT=REDUNDANT +@param prev byte offset of the predecessor, relative to PAGE_OLD_INFIMUM +@return whether the operation failed (inconcistency was noticed) */ +bool page_apply_delete_redundant(const buf_block_t &block, ulint prev) +{ + page_t * const page= block.page.frame; + const uint16_t n_slots= page_dir_get_n_slots(page); + ulint n_recs= page_get_n_recs(page); + const page_id_t id(block.page.id()); + + if (UNIV_UNLIKELY(!n_recs || n_slots < 2 || + !fil_page_index_page_check(page) || + page_get_page_no(page) != id.page_no() || + mach_read_from_2(my_assume_aligned<2> + (PAGE_OLD_SUPREMUM - REC_NEXT + page)) || + page_is_comp(page))) + { +corrupted: + ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT" + " due to corruption on " << id; + return true; + } + + byte *slot= page_dir_get_nth_slot(page, n_slots - 1); + rec_t *prev_rec= page + PAGE_OLD_INFIMUM + prev; + if (UNIV_UNLIKELY(prev_rec > slot)) + goto corrupted; + uint16_t n= mach_read_from_2(prev_rec - REC_NEXT); + rec_t *rec= page + n; + if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES || + slot < rec)) + goto corrupted; + const ulint extra_size= REC_N_OLD_EXTRA_BYTES + rec_get_n_fields_old(rec) * + (rec_get_1byte_offs_flag(rec) ? 1 : 2); + const ulint data_size= rec_get_data_size_old(rec); + if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + extra_size || + slot < rec + data_size)) + goto corrupted; + + n= mach_read_from_2(rec - REC_NEXT); + rec_t *next= page + n; + if (n == PAGE_OLD_SUPREMUM); + else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES || + slot < next)) + goto corrupted; + + rec_t *s= rec; + ulint slot_owned; + for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_old(s)); ) + { + n= mach_read_from_2(s - REC_NEXT); + s= page + n; + if (n == PAGE_OLD_SUPREMUM); + else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES || + slot < s)) + goto corrupted; + if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */ + goto corrupted; + } + slot_owned--; + + /* The first slot is always pointing to the infimum record. + Find the directory slot pointing to s. 
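   (first_slot computed below is the same address as
   page_dir_get_nth_slot(page, 0): slot 0 lives closest to the end-of-page
   trailer and higher-numbered slots grow towards lower addresses. The scan
   therefore starts at the slot owning the supremum and moves by +2 bytes per
   step until it finds a slot whose stored offset equals s - page, or reports
   corruption on reaching the infimum slot.)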
*/ + const byte * const first_slot= page + srv_page_size - (PAGE_DIR + 2); + alignas(2) byte slot_offs[2]; + mach_write_to_2(slot_offs, s - page); + static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility"); + + while (memcmp_aligned<2>(slot, slot_offs, 2)) + if ((slot+= 2) == first_slot) + goto corrupted; + + if (rec == s) + { + s= prev_rec; + mach_write_to_2(slot, s - page); + } + + memcpy(prev_rec - REC_NEXT, rec - REC_NEXT, 2); + s-= REC_OLD_N_OWNED; + *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) | + slot_owned << REC_N_OWNED_SHIFT); + page_mem_free(block, rec, data_size, extra_size); + + if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED) + page_dir_balance_slot(block, (first_slot - slot) / 2); + + ut_ad(page_simple_validate_old(page)); + return false; +} + +/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by +page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page. +@param block B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC +@param prev byte offset of the predecessor, relative to PAGE_NEW_INFIMUM +@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES +@param data_size data payload size, in bytes +@return whether the operation failed (inconcistency was noticed) */ +bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev, + size_t hdr_size, size_t data_size) +{ + page_t * const page= block.page.frame; + const uint16_t n_slots= page_dir_get_n_slots(page); + ulint n_recs= page_get_n_recs(page); + const page_id_t id(block.page.id()); + + if (UNIV_UNLIKELY(!n_recs || n_slots < 2 || + !fil_page_index_page_check(page) || + page_get_page_no(page) != id.page_no() || + mach_read_from_2(my_assume_aligned<2> + (PAGE_NEW_SUPREMUM - REC_NEXT + page)) || + !page_is_comp(page))) + { +corrupted: + ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC" + " due to corruption on " << id; + return true; + } + + byte *slot= page_dir_get_nth_slot(page, n_slots - 1); + uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev); + rec_t *prev_rec= page + n; + if (UNIV_UNLIKELY(prev_rec > slot)) + goto corrupted; + n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT)); + rec_t *rec= page + n; + if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES || + slot < rec)) + goto corrupted; + const ulint extra_size= REC_N_NEW_EXTRA_BYTES + hdr_size; + if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + extra_size || + slot < rec + data_size)) + goto corrupted; + n= static_cast<uint16_t>(n + mach_read_from_2(rec - REC_NEXT)); + rec_t *next= page + n; + if (n == PAGE_NEW_SUPREMUM); + else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES || + slot < next)) + goto corrupted; + + rec_t *s= rec; + n= static_cast<uint16_t>(rec - page); + ulint slot_owned; + for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_new(s)); ) + { + const uint16_t next= mach_read_from_2(s - REC_NEXT); + if (UNIV_UNLIKELY(next < REC_N_NEW_EXTRA_BYTES || + next > static_cast<uint16_t>(-REC_N_NEW_EXTRA_BYTES))) + goto corrupted; + n= static_cast<uint16_t>(n + next); + s= page + n; + if (n == PAGE_NEW_SUPREMUM); + else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES || + slot < s)) + goto corrupted; + if (UNIV_UNLIKELY(!i--)) /* Corrupted (cyclic?) next-record list */ + goto corrupted; + } + slot_owned--; + + /* The first slot is always pointing to the infimum record. + Find the directory slot pointing to s. 
*/ + const byte * const first_slot= page + srv_page_size - (PAGE_DIR + 2); + alignas(2) byte slot_offs[2]; + mach_write_to_2(slot_offs, s - page); + static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility"); + + while (memcmp_aligned<2>(slot, slot_offs, 2)) + if ((slot+= 2) == first_slot) + goto corrupted; + + if (rec == s) + { + s= prev_rec; + mach_write_to_2(slot, s - page); + } + + mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(next - prev_rec)); + s-= REC_NEW_N_OWNED; + *s= static_cast<byte>((*s & ~REC_N_OWNED_MASK) | + slot_owned << REC_N_OWNED_SHIFT); + page_mem_free(block, rec, data_size, extra_size); + + if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED) + page_dir_balance_slot(block, (first_slot - slot) / 2); + + ut_ad(page_simple_validate_new(page)); + return false; +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +/*******************************************************************//** +Print the first n numbers, generated by ut_rnd_gen() to make sure +(visually) that it works properly. */ +void +test_ut_rnd_gen( + int n) /*!< in: print first n numbers */ +{ + int i; + unsigned long long rnd; + + for (i = 0; i < n; i++) { + rnd = ut_rnd_gen(); + printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n", + rnd, + rnd % 2, + rnd % 3, + rnd % 5, + rnd % 7, + rnd % 11); + } +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc new file mode 100644 index 00000000..258d47a5 --- /dev/null +++ b/storage/innobase/page/page0page.cc @@ -0,0 +1,2523 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0page.cc +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#include "page0page.h" +#include "page0cur.h" +#include "page0zip.h" +#include "buf0buf.h" +#include "buf0checksum.h" +#include "btr0btr.h" +#include "srv0srv.h" +#include "lock0lock.h" +#include "fut0lst.h" +#include "btr0sea.h" +#include "trx0sys.h" +#include <algorithm> + +/* THE INDEX PAGE + ============== + +The index page consists of a page header which contains the page's +id and other information. On top of it are the index records +in a heap linked into a one way linear list according to alphabetic order. + +Just below page end is an array of pointers which we call page directory, +to about every sixth record in the list. The pointers are placed in +the directory in the alphabetical order of the records pointed to, +enabling us to make binary search using the array. 
Each slot n:o I +in the directory points to a record, where a 4-bit field contains a count +of those records which are in the linear list between pointer I and +the pointer I - 1 in the directory, including the record +pointed to by pointer I and not including the record pointed to by I - 1. +We say that the record pointed to by slot I, or that slot I, owns +these records. The count is always kept in the range 4 to 8, with +the exception that it is 1 for the first slot, and 1--8 for the second slot. + +An essentially binary search can be performed in the list of index +records, like we could do if we had pointer to every record in the +page directory. The data structure is, however, more efficient when +we are doing inserts, because most inserts are just pushed on a heap. +Only every 8th insert requires block move in the directory pointer +table, which itself is quite small. A record is deleted from the page +by just taking it off the linear list and updating the number of owned +records-field of the record which owns it, and updating the page directory, +if necessary. A special case is the one when the record owns itself. +Because the overhead of inserts is so small, we may also increase the +page size from the projected default of 8 kB to 64 kB without too +much loss of efficiency in inserts. Bigger page becomes actual +when the disk transfer rate compared to seek and latency time rises. +On the present system, the page size is set so that the page transfer +time (3 ms) is 20 % of the disk random access time (15 ms). + +When the page is split, merged, or becomes full but contains deleted +records, we have to reorganize the page. + +Assuming a page size of 8 kB, a typical index page of a secondary +index contains 300 index entries, and the size of the page directory +is 50 x 4 bytes = 200 bytes. */ + +/***************************************************************//** +Looks for the directory slot which owns the given record. +@return the directory slot number +@retval ULINT_UNDEFINED on corruption */ +ulint +page_dir_find_owner_slot( +/*=====================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + ut_ad(page_rec_check(rec)); + + const page_t* page = page_align(rec); + const page_dir_slot_t* first_slot = page_dir_get_nth_slot(page, 0); + const page_dir_slot_t* slot = page_dir_get_nth_slot( + page, ulint(page_dir_get_n_slots(page)) - 1); + const rec_t* r = rec; + + if (page_is_comp(page)) { + while (rec_get_n_owned_new(r) == 0) { + r = page_rec_get_next_low(r, true); + if (UNIV_UNLIKELY(r < page + PAGE_NEW_SUPREMUM + || r >= slot)) { + return ULINT_UNDEFINED; + } + } + } else { + while (rec_get_n_owned_old(r) == 0) { + r = page_rec_get_next_low(r, false); + if (UNIV_UNLIKELY(r < page + PAGE_OLD_SUPREMUM + || r >= slot)) { + return ULINT_UNDEFINED; + } + } + } + + while (UNIV_LIKELY(*(uint16*) slot + != mach_encode_2(ulint(r - page)))) { + if (UNIV_UNLIKELY(slot == first_slot)) { + return ULINT_UNDEFINED; + } + + slot += PAGE_DIR_SLOT_SIZE; + } + + return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE); +} + +/**************************************************************//** +Used to check the consistency of a directory slot. 
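The checks below encode the ownership rules described above: the slot that
points to the infimum owns exactly one record, the slot that points to the
supremum owns between 1 and PAGE_DIR_SLOT_MAX_N_OWNED records, and every
other slot owns between PAGE_DIR_SLOT_MIN_N_OWNED and
PAGE_DIR_SLOT_MAX_N_OWNED records.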
+@return TRUE if the check succeeds */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+	const page_dir_slot_t*	slot)	/*!< in: slot */
+{
+	const page_t*	page;
+	ulint		n_slots;
+	ulint		n_owned;
+
+	ut_a(slot);
+
+	page = page_align(slot);
+
+	n_slots = page_dir_get_n_slots(page);
+
+	ut_a(slot <= page_dir_get_nth_slot(page, 0));
+	ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+	ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+	if (page_is_comp(page)) {
+		n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+	} else {
+		n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+	}
+
+	if (slot == page_dir_get_nth_slot(page, 0)) {
+		ut_a(n_owned == 1);
+	} else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+		ut_a(n_owned >= 1);
+		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+	} else {
+		ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction, or NULL */
+{
+	ut_ad(!mtr || mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!page_zip || page_zip == &block->page.zip);
+	static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+	byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
+		PAGE_HEADER + block->page.frame);
+
+	mtr->write<8>(*block, max_trx_id, trx_id);
+	if (UNIV_LIKELY_NULL(page_zip))
+		memcpy_aligned<8>(&page_zip->data[PAGE_MAX_TRX_ID + PAGE_HEADER],
+				  max_trx_id, 8);
+}
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
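The value is only ever moved forward unless the caller explicitly asks to
reset it. A sketch of the two call patterns (illustrative only; root_block and
next_value are placeholder names, not from the source):

    /* ordinary insert path: never lowers the persisted counter */
    page_set_autoinc(root_block, next_value, mtr, false);
    /* explicit reset: the counter is allowed to shrink */
    page_set_autoinc(root_block, next_value, mtr, true);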
+@param[in,out] block clustered index root page +@param[in] index clustered index +@param[in] autoinc next available AUTO_INCREMENT value +@param[in,out] mtr mini-transaction +@param[in] reset whether to reset the AUTO_INCREMENT + to a possibly smaller value than currently + exists in the page */ +void +page_set_autoinc( + buf_block_t* block, + ib_uint64_t autoinc, + mtr_t* mtr, + bool reset) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + block->page.frame); + ib_uint64_t old= mach_read_from_8(field); + if (old == autoinc || (old > autoinc && !reset)) + return; /* nothing to update */ + + mtr->write<8>(*block, field, autoinc); + if (UNIV_LIKELY_NULL(block->page.zip.data)) + memcpy_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + block->page.zip.data, + field, 8); +} + +/** The page infimum and supremum of an empty page in ROW_FORMAT=REDUNDANT */ +static const byte infimum_supremum_redundant[] = { + /* the infimum record */ + 0x08/*end offset*/, + 0x01/*n_owned*/, + 0x00, 0x00/*heap_no=0*/, + 0x03/*n_fields=1, 1-byte offsets*/, + 0x00, 0x74/* pointer to supremum */, + 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0, + /* the supremum record */ + 0x09/*end offset*/, + 0x01/*n_owned*/, + 0x00, 0x08/*heap_no=1*/, + 0x03/*n_fields=1, 1-byte offsets*/, + 0x00, 0x00/* end of record list */, + 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm', 0 +}; + +/** The page infimum and supremum of an empty page in ROW_FORMAT=COMPACT */ +static const byte infimum_supremum_compact[] = { + /* the infimum record */ + 0x01/*n_owned=1*/, + 0x00, 0x02/* heap_no=0, REC_STATUS_INFIMUM */, + 0x00, 0x0d/* pointer to supremum */, + 'i', 'n', 'f', 'i', 'm', 'u', 'm', 0, + /* the supremum record */ + 0x01/*n_owned=1*/, + 0x00, 0x0b/* heap_no=1, REC_STATUS_SUPREMUM */, + 0x00, 0x00/* end of record list */, + 's', 'u', 'p', 'r', 'e', 'm', 'u', 'm' +}; + +/** Create an index page. 
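Besides resetting the page header, this lays down the predefined infimum and
supremum records from the byte arrays above and points the two initial
directory slots at them, so a freshly created page looks roughly like this
(editor's sketch, exact offsets omitted):

    [FIL header][page header][infimum][supremum][ free space ...
      ... ][slot 1 -> supremum][slot 0 -> infimum][FIL trailer]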
+@param[in,out] block buffer block +@param[in] comp nonzero=compact page format */ +void page_create_low(const buf_block_t* block, bool comp) +{ + page_t* page; + + compile_time_assert(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE + <= PAGE_DATA); + compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE + <= PAGE_DATA); + + page = block->page.frame; + + fil_page_set_type(page, FIL_PAGE_INDEX); + + memset(page + PAGE_HEADER, 0, PAGE_HEADER_PRIV_END); + page[PAGE_HEADER + PAGE_N_DIR_SLOTS + 1] = 2; + page[PAGE_HEADER + PAGE_INSTANT] = 0; + page[PAGE_HEADER + PAGE_DIRECTION_B] = PAGE_NO_DIRECTION; + + if (comp) { + page[PAGE_HEADER + PAGE_N_HEAP] = 0x80;/*page_is_comp()*/ + page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW; + page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_NEW_SUPREMUM_END; + memcpy(page + PAGE_DATA, infimum_supremum_compact, + sizeof infimum_supremum_compact); + memset(page + + PAGE_NEW_SUPREMUM_END, 0, + srv_page_size - PAGE_DIR - PAGE_NEW_SUPREMUM_END); + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] + = PAGE_NEW_SUPREMUM; + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] + = PAGE_NEW_INFIMUM; + } else { + page[PAGE_HEADER + PAGE_N_HEAP + 1] = PAGE_HEAP_NO_USER_LOW; + page[PAGE_HEADER + PAGE_HEAP_TOP + 1] = PAGE_OLD_SUPREMUM_END; + memcpy(page + PAGE_DATA, infimum_supremum_redundant, + sizeof infimum_supremum_redundant); + memset(page + + PAGE_OLD_SUPREMUM_END, 0, + srv_page_size - PAGE_DIR - PAGE_OLD_SUPREMUM_END); + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE * 2 + 1] + = PAGE_OLD_SUPREMUM; + page[srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE + 1] + = PAGE_OLD_INFIMUM; + } +} + +/** Create an uncompressed index page. +@param[in,out] block buffer block +@param[in,out] mtr mini-transaction +@param[in] comp set unless ROW_FORMAT=REDUNDANT */ +void page_create(buf_block_t *block, mtr_t *mtr, bool comp) +{ + mtr->page_create(*block, comp); + buf_block_modify_clock_inc(block); + page_create_low(block, comp); +} + +/**********************************************************//** +Create a compressed B-tree index page. */ +void +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame + where the page is created */ + dict_index_t* index, /*!< in: the index of the + page */ + ulint level, /*!< in: the B-tree level + of the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction + handle */ +{ + ut_ad(block); + ut_ad(buf_block_get_page_zip(block)); + ut_ad(dict_table_is_comp(index->table)); + + /* PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC are always 0 for + temporary tables. */ + ut_ad(max_trx_id == 0 || !index->table->is_temporary()); + /* In secondary indexes and the change buffer, PAGE_MAX_TRX_ID + must be zero on non-leaf pages. max_trx_id can be 0 when the + index consists of an empty root (leaf) page. */ + ut_ad(max_trx_id == 0 + || level == 0 + || !dict_index_is_sec_or_ibuf(index) + || index->table->is_temporary()); + /* In the clustered index, PAGE_ROOT_AUTOINC or + PAGE_MAX_TRX_ID must be 0 on other pages than the root. 
*/ + ut_ad(level == 0 || max_trx_id == 0 + || !dict_index_is_sec_or_ibuf(index) + || index->table->is_temporary()); + + buf_block_modify_clock_inc(block); + page_create_low(block, true); + + if (index->is_spatial()) { + mach_write_to_2(FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_RTREE); + memset(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8); + memset(block->page.zip.data + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8); + } + + mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->page.frame, level); + mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->page.frame, + max_trx_id); + + if (!page_zip_compress(block, index, page_zip_level, mtr)) { + /* The compression of a newly created + page should always succeed. */ + ut_error; + } +} + +/**********************************************************//** +Empty a previously created B-tree index page. */ +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_id_t max_trx_id; + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary() + && page_is_leaf(block->page.frame)) { + max_trx_id = page_get_max_trx_id(block->page.frame); + ut_ad(max_trx_id); + } else if (block->page.id().page_no() == index->page) { + /* Preserve PAGE_ROOT_AUTO_INC. */ + max_trx_id = page_get_max_trx_id(block->page.frame); + } else { + max_trx_id = 0; + } + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + page_create_zip(block, index, + page_header_get_field(block->page.frame, + PAGE_LEVEL), + max_trx_id, mtr); + } else { + page_create(block, mtr, index->table->not_redundant()); + if (index->is_spatial()) { + static_assert(((FIL_PAGE_INDEX & 0xff00) + | byte(FIL_PAGE_RTREE)) + == FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, + FIL_PAGE_TYPE + 1 + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + + FIL_RTREE_SPLIT_SEQ_NUM)) { + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, + 8, 0); + } + } + + if (max_trx_id) { + mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID + + block->page.frame, max_trx_id); + } + } +} + +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
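The copy itself is a plain cursor walk (see the loop in the function body):
one cursor is positioned on rec in the source block, another before the first
record of the destination, and each source record is re-inserted with
page_cur_insert_rec_low(), advancing both cursors until the source cursor
stands after the last user record.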
+ +@return error code */ +dberr_t +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t cur1; + page_cur_t cur2; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + cur1.index = cur2.index = index; + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1) && !page_cur_move_to_next(&cur1)) { + return DB_CORRUPTION; + } + + if (UNIV_UNLIKELY(page_is_comp(new_page) != page_rec_is_comp(rec) + || mach_read_from_2(new_page + srv_page_size - 10) + != ulint(page_is_comp(new_page) + ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM))) { + return DB_CORRUPTION; + } + + const ulint n_core = page_is_leaf(block->page.frame) + ? index->n_core_fields : 0; + + dberr_t err = DB_SUCCESS; + page_cur_set_before_first(new_block, &cur2); + + /* Copy records from the original page to the new page */ + + while (!page_cur_is_after_last(&cur1)) { + rec_t* ins_rec; + offsets = rec_get_offsets(cur1.rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_insert_rec_low(&cur2, cur1.rec, offsets, + mtr); + if (UNIV_UNLIKELY(!ins_rec || !page_cur_move_to_next(&cur1))) { + err = DB_CORRUPTION; + break; + } + ut_ad(!(rec_get_info_bits(cur1.rec, page_is_comp(new_page)) + & REC_INFO_MIN_REC_FLAG)); + cur2.rec = ins_rec; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/*************************************************************//** +Copies records from page to new_page, from a given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_t::commit(). + +@return pointer to the original successor of the infimum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ +{ + page_t* new_page = new_block->page.frame; + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_t* page = block->page.frame; + rec_t* ret = page_rec_get_next( + page_get_infimum_rec(new_page)); + ulint num_moved = 0; + ut_ad(page_align(rec) == page); + + if (UNIV_UNLIKELY(!ret)) { + *err = DB_CORRUPTION; + return nullptr; + } + +#ifdef UNIV_ZIP_DEBUG + if (new_page_zip) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + ut_a(page_zip); + + /* Strict page_zip_validate() may fail here. + Furthermore, btr_compress() may set FIL_PAGE_PREV to + FIL_NULL on new_page while leaving it intact on + new_page_zip. So, we cannot validate new_page_zip. 
*/ + ut_a(page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(buf_block_get_frame(block) == page); + ut_ad(page_is_leaf(page) == page_is_leaf(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); + /* Here, "ret" may be pointing to a user record or the + predefined supremum record. */ + + const mtr_log_t log_mode = new_page_zip + ? mtr->set_log_mode(MTR_LOG_NONE) : MTR_LOG_NONE; + const bool was_empty = page_dir_get_n_heap(new_page) + == PAGE_HEAP_NO_USER_LOW; + alignas(2) byte h[PAGE_N_DIRECTION + 2 - PAGE_LAST_INSERT]; + memcpy_aligned<2>(h, PAGE_HEADER + PAGE_LAST_INSERT + new_page, + sizeof h); + mem_heap_t* heap = nullptr; + rtr_rec_move_t* rec_move = nullptr; + + if (index->is_spatial()) { + ulint max_to_move = page_get_n_recs( + buf_block_get_frame(block)); + heap = mem_heap_create(256); + rec_move= static_cast<rtr_rec_move_t*>( + mem_heap_alloc(heap, max_to_move * sizeof *rec_move)); + /* For spatial index, we need to insert recs one by one + to keep recs ordered. */ + *err = rtr_page_copy_rec_list_end_no_locks(new_block, + block, rec, index, + heap, rec_move, + max_to_move, + &num_moved, + mtr); + } else { + *err = page_copy_rec_list_end_no_locks(new_block, block, rec, + index, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return nullptr; + } + if (was_empty) { + mtr->memcpy<mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER + + PAGE_LAST_INSERT + + new_page, h, sizeof h); + } + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. + Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && page_is_leaf(page) + && !index->table->is_temporary()) { + ut_ad(!was_empty || page_dir_get_n_heap(new_page) + == PAGE_HEAP_NO_USER_LOW + + page_header_get_field(new_page, PAGE_N_RECS)); + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page), mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(new_block, index, + page_zip_level, mtr)) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the successor of + the predefined infimum record. It must still + have at least one predecessor (the predefined + infimum record, or a freshly copied record + that is smaller than "ret"). */ + if (UNIV_UNLIKELY(!ret_pos + || ret_pos == ULINT_UNDEFINED)) { + *err = DB_CORRUPTION; + goto err_exit; + } + + *err = page_zip_reorganize(new_block, index, + page_zip_level, mtr); + switch (*err) { + case DB_FAIL: + if (!page_zip_decompress(new_page_zip, + new_page, FALSE)) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + /* fall through */ + default: + goto err_exit; + case DB_SUCCESS: + /* The page was reorganized: + Seek to ret_pos. 
*/ + ret = page_rec_get_nth(new_page, ret_pos); + ut_ad(ret); + } + } + } + + /* Update the lock table and possible hash index */ + + if (!index->has_locking()) { + } else if (UNIV_LIKELY_NULL(rec_move)) { + lock_rtr_move_rec_list(new_block, block, rec_move, num_moved); + } else { + lock_move_rec_list_end(new_block, block, rec); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + btr_search_move_or_delete_hash_entries(new_block, block); + + return(ret); +} + +/*************************************************************//** +Copies records from page to new_page, up to the given record, +NOT including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ +{ + ut_ad(page_align(rec) == block->page.frame); + + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_cur_t cur1; + page_cur_t cur2; + mem_heap_t* heap = NULL; + ulint num_moved = 0; + rtr_rec_move_t* rec_move = NULL; + rec_t* ret + = page_rec_get_prev(page_get_supremum_rec(new_page)); + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + if (UNIV_UNLIKELY(!ret)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + + /* Here, "ret" may be pointing to a user record or the + predefined infimum record. */ + + if (page_rec_is_infimum(rec)) { + *err = DB_SUCCESS; + return(ret); + } + + page_cur_set_before_first(block, &cur1); + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + goto corrupted; + } + + mtr_log_t log_mode = MTR_LOG_NONE; + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + cur2.index = index; + page_cur_position(ret, new_block, &cur2); + + const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0; + + /* Copy records from the original page to the new page */ + if (index->is_spatial()) { + ut_ad(!index->is_instant()); + ulint max_to_move = page_get_n_recs( + buf_block_get_frame(block)); + heap = mem_heap_create(256); + + rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc( + heap, + sizeof (*rec_move) * max_to_move)); + + /* For spatial index, we need to insert recs one by one + to keep recs ordered. 
*/ + *err = rtr_page_copy_rec_list_start_no_locks(new_block, + block, rec, index, + heap, rec_move, + max_to_move, + &num_moved, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } else { + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(cur1.rec, index, offsets, + n_core, + ULINT_UNDEFINED, &heap); + cur2.rec = page_cur_insert_rec_low(&cur2, cur1.rec, + offsets, mtr); + if (UNIV_UNLIKELY(!cur2.rec + || !page_cur_move_to_next(&cur1))) { + *err = DB_CORRUPTION; + return nullptr; + } + + ut_ad(!(rec_get_info_bits(cur1.rec, + page_is_comp(new_page)) + & REC_INFO_MIN_REC_FLAG)); + } + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. + Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (n_core && !index->is_primary() && !index->table->is_temporary()) { + page_update_max_trx_id(new_block, nullptr, + page_get_max_trx_id(block->page.frame), + mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail", + goto zip_reorganize;); + + if (!page_zip_compress(new_block, index, + page_zip_level, mtr)) { +#ifndef DBUG_OFF +zip_reorganize: +#endif /* DBUG_OFF */ + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the predecessor + of the predefined supremum record. If it was + the predefined infimum record, then it would + still be the infimum, and we would have + ret_pos == 0. */ + if (UNIV_UNLIKELY(!ret_pos + || ret_pos == ULINT_UNDEFINED)) { + *err = DB_CORRUPTION; + return nullptr; + } + *err = page_zip_reorganize(new_block, index, + page_zip_level, mtr); + switch (*err) { + case DB_SUCCESS: + ret = page_rec_get_nth(new_page, ret_pos); + ut_ad(ret); + break; + case DB_FAIL: + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page, FALSE))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + /* fall through */ + default: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return nullptr; + } + } + } + + /* Update the lock table and possible hash index */ + + if (!index->has_locking()) { + } else if (dict_index_is_spatial(index)) { + lock_rtr_move_rec_list(new_block, block, rec_move, num_moved); + } else { + lock_move_rec_list_start(new_block, block, rec, ret); + } + + if (heap) { + mem_heap_free(heap); + } + + btr_search_move_or_delete_hash_entries(new_block, block); + + *err = DB_SUCCESS; + return(ret); +} + +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. 
*/ +dberr_t +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t * const page= block->page.frame; + + ut_ad(size == ULINT_UNDEFINED || size < srv_page_size); + ut_ad(page_align(rec) == page); + ut_ad(index->table->not_redundant() == !!page_is_comp(page)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!block->page.zip.data || + page_zip_validate(&block->page.zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_supremum(rec)) + { + ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED); + /* Nothing to do, there are no records bigger than the page supremum. */ + return DB_SUCCESS; + } + + if (page_rec_is_infimum(rec) || + n_recs == page_get_n_recs(page) || + rec == (page_is_comp(page) + ? page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) + : page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0))) + { + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return DB_SUCCESS; + } + +#if 0 // FIXME: consider deleting the last record as a special case + if (page_rec_is_last(rec)) + { + page_cur_t cursor= { index, rec, offsets, block }; + page_cur_delete_rec(&cursor, index, offsets, mtr); + return DB_SUCCESS; + } +#endif + + /* The page becomes invalid for optimistic searches */ + buf_block_modify_clock_inc(block); + + const ulint n_core= page_is_leaf(page) ? index->n_core_fields : 0; + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + +#if 1 // FIXME: remove this, and write minimal amount of log! 
*/ + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + ut_ad(page_is_comp(page)); + do + { + page_cur_t cur; + page_cur_position(rec, block, &cur); + cur.index= index; + offsets= rec_get_offsets(rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + rec= const_cast<rec_t*>(page_rec_get_next_low(rec, true)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(&block->page.zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&cur, offsets, mtr); + } + while (page_offset(rec) != PAGE_NEW_SUPREMUM); + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return DB_SUCCESS; + } +#endif + + byte *prev_rec= page_rec_get_prev(rec); + if (UNIV_UNLIKELY(!prev_rec)) + return DB_CORRUPTION; + byte *last_rec= page_rec_get_prev(page_get_supremum_rec(page)); + if (UNIV_UNLIKELY(!last_rec)) + return DB_CORRUPTION; + + // FIXME: consider a special case of shrinking PAGE_HEAP_TOP + + const bool scrub= srv_immediate_scrub_data_uncompressed; + if (scrub || size == ULINT_UNDEFINED || n_recs == ULINT_UNDEFINED) + { + rec_t *rec2= rec; + /* Calculate the sum of sizes and the number of records */ + size= 0; + n_recs= 0; + + do + { + offsets = rec_get_offsets(rec2, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + ulint s= rec_offs_size(offsets); + ut_ad(ulint(rec2 - page) + s - rec_offs_extra_size(offsets) < + srv_page_size); + ut_ad(size + s < srv_page_size); + size+= s; + n_recs++; + + if (scrub) + mtr->memset(block, page_offset(rec2), rec_offs_data_size(offsets), 0); + + rec2= page_rec_get_next(rec2); + } + while (rec2 && !page_rec_is_supremum(rec2)); + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + if (UNIV_UNLIKELY(!rec)) + return DB_CORRUPTION; + } + + ut_ad(size < srv_page_size); + + ulint slot_index, n_owned; + { + const rec_t *owner_rec= rec; + ulint count= 0; + + if (page_is_comp(page)) + while (!(n_owned= rec_get_n_owned_new(owner_rec))) + { + count++; + if (!(owner_rec= page_rec_get_next_low(owner_rec, true))) + return DB_CORRUPTION; + } + else + while (!(n_owned= rec_get_n_owned_old(owner_rec))) + { + count++; + if (!(owner_rec= page_rec_get_next_low(owner_rec, false))) + return DB_CORRUPTION; + } + + ut_ad(n_owned > count); + n_owned-= count; + slot_index= page_dir_find_owner_slot(owner_rec); + } + + if (UNIV_UNLIKELY(!slot_index || slot_index == ULINT_UNDEFINED)) + return DB_CORRUPTION; + + mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2> + (PAGE_N_DIR_SLOTS + PAGE_HEADER + page), + slot_index + 1); + mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2> + (PAGE_LAST_INSERT + PAGE_HEADER + page), 0U); + /* Catenate the deleted chain segment to the page free list */ + alignas(4) byte page_header[4]; + byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE + page); + const uint16_t free= page_header_get_field(page, PAGE_FREE); + static_assert(PAGE_FREE + 2 == PAGE_GARBAGE, "compatibility"); + + mach_write_to_2(page_header, page_offset(rec)); + mach_write_to_2(my_assume_aligned<2>(page_header + 2), + mach_read_from_2(my_assume_aligned<2>(page_free + 2)) + + size); + mtr->memcpy(*block, page_free, page_header, 4); + + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page); + mtr->write<2>(*block, page_n_recs, + ulint{mach_read_from_2(page_n_recs)} - n_recs); + + /* Update the page directory; there is no need to balance the number + of the records owned by the supremum record, as it is allowed to be + less than PAGE_DIR_SLOT_MIN_N_OWNED */ + page_dir_slot_t *slot= page_dir_get_nth_slot(page, slot_index); + + if 
(page_is_comp(page)) + { + mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_NEW_SUPREMUM); + byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + page; + byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) | + n_owned << REC_N_OWNED_SHIFT); +#if 0 // FIXME: implement minimal logging for ROW_FORMAT=COMPRESSED + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + *owned= new_owned; + memcpy_aligned<2>(PAGE_N_DIR_SLOTS + PAGE_HEADER + block->page.zip.data, + PAGE_N_DIR_SLOTS + PAGE_HEADER + page, + PAGE_N_RECS + 2 - PAGE_N_DIR_SLOTS); + // TODO: the equivalent of page_zip_dir_delete() for all records + mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(prev_rec))); + mach_write_to_2(last_rec - REC_NEXT, free + ? static_cast<uint16_t>(free - page_offset(last_rec)) + : 0U); + return DB_SUCCESS; + } +#endif + mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned); + mtr->write<2>(*block, prev_rec - REC_NEXT, static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(prev_rec))); + mtr->write<2>(*block, last_rec - REC_NEXT, free + ? static_cast<uint16_t>(free - page_offset(last_rec)) + : 0U); + } + else + { + mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_OLD_SUPREMUM); + byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + page; + byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) | + n_owned << REC_N_OWNED_SHIFT); + mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned); + mtr->write<2>(*block, prev_rec - REC_NEXT, PAGE_OLD_SUPREMUM); + mtr->write<2>(*block, last_rec - REC_NEXT, free); + } + + return DB_SUCCESS; +} + +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. */ +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t cur1; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(offsets_); + + ut_ad(page_align(rec) == block->page.frame); + ut_ad((ibool) !!page_rec_is_comp(rec) + == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + + /* page_zip_validate() would detect a min_rec_mark mismatch + in btr_page_split_and_insert() + between btr_attach_half_pages() and insert_page = ... + when btr_page_get_split_rec_to_left() holds + (direction == FSP_DOWN). */ + ut_a(!page_zip + || page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + return; + } + + if (page_rec_is_supremum(rec)) { + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return; + } + + cur1.index = index; + page_cur_set_before_first(block, &cur1); + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + ut_ad("corrupted page" == 0); + return; + } + + const ulint n_core = page_rec_is_leaf(rec) + ? 
index->n_core_fields : 0; + + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, n_core, + ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record +@retval nullptr on corrupted page */ +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + const page_dir_slot_t* slot; + ulint i; + ulint n_owned; + const rec_t* rec; + + if (nth == 0) { + return(page_get_infimum_rec(page)); + } + + ut_ad(nth < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1)); + + for (i = 0;; i++) { + slot = page_dir_get_nth_slot(page, i); + n_owned = page_dir_slot_get_n_owned(slot); + + if (n_owned > nth) { + break; + } else { + nth -= n_owned; + } + } + + if (UNIV_UNLIKELY(!i)) { + return nullptr; + } + rec = page_dir_slot_get_rec(slot + 2); + + if (page_is_comp(page)) { + do { + rec = page_rec_get_next_low(rec, TRUE); + } while (rec && nth--); + } else { + do { + rec = page_rec_get_next_low(rec, FALSE); + } while (rec && nth--); + } + + return(rec); +} + + +/************************************************************//** +Gets the pointer to the previous record. +@return pointer to previous record +@retval nullptr on error */ +const rec_t* +page_rec_get_prev_const( +/*====================*/ + const rec_t* rec) /*!< in: pointer to record, must not be page + infimum */ +{ + const rec_t* rec2; + const rec_t* prev_rec = NULL; + + ut_ad(page_rec_check(rec)); + + const page_t* const page = page_align(rec); + + ut_ad(!page_rec_is_infimum(rec)); + + ulint slot_no = page_dir_find_owner_slot(rec); + + if (UNIV_UNLIKELY(!slot_no || slot_no == ULINT_UNDEFINED)) { + return nullptr; + } + + const page_dir_slot_t* slot = page_dir_get_nth_slot(page, slot_no - 1); + + if (UNIV_UNLIKELY(!(rec2 = page_dir_slot_get_rec_validate(slot)))) { + return nullptr; + } + + if (page_is_comp(page)) { + while (rec2 && rec != rec2) { + prev_rec = rec2; + ulint offs = rec_get_next_offs(rec2, TRUE); + if (offs < PAGE_NEW_INFIMUM + || offs > page_header_get_field(page, + PAGE_HEAP_TOP)) { + return nullptr; + } + rec2 = page + offs; + } + switch (rec_get_status(prev_rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + if (!page_is_leaf(page)) { + return nullptr; + } + break; + case REC_STATUS_INFIMUM: + break; + case REC_STATUS_NODE_PTR: + if (!page_is_leaf(page)) { + break; + } + /* fall through */ + default: + return nullptr; + } + } else { + while (rec2 && rec != rec2) { + prev_rec = rec2; + ulint offs = rec_get_next_offs(rec2, FALSE); + if (offs < PAGE_OLD_INFIMUM + || offs > page_header_get_field(page, + PAGE_HEAP_TOP)) { + return nullptr; + } + rec2 = page + offs; + } + } + + return(prev_rec); +} + +/** Return the number of preceding records in an index page. 
+@param rec index record +@return number of preceding records, including the infimum pseudo-record +@retval ULINT_UNDEFINED on corrupted page */ +ulint page_rec_get_n_recs_before(const rec_t *rec) +{ + const page_t *const page= page_align(rec); + const page_dir_slot_t *slot = page_dir_get_nth_slot(page, 0); + const page_dir_slot_t *const end_slot= slot - 2 * page_dir_get_n_slots(page); + + lint n= 0; + + ut_ad(page_rec_check(rec)); + + if (page_is_comp(page)) + { + for (; rec_get_n_owned_new(rec) == 0; n--) + if (UNIV_UNLIKELY(!(rec= page_rec_get_next_low(rec, true)))) + return ULINT_UNDEFINED; + + do + { + const rec_t *slot_rec= page_dir_slot_get_rec_validate(slot); + if (UNIV_UNLIKELY(!slot_rec)) + break; + n+= lint(rec_get_n_owned_new(slot_rec)); + + if (rec == slot_rec) + goto found; + } + while ((slot-= 2) > end_slot); + } + else + { + for (; rec_get_n_owned_old(rec) == 0; n--) + if (UNIV_UNLIKELY(!(rec= page_rec_get_next_low(rec, false)))) + return ULINT_UNDEFINED; + + do + { + const rec_t *slot_rec= page_dir_slot_get_rec_validate(slot); + if (UNIV_UNLIKELY(!slot_rec)) + break; + n+= lint(rec_get_n_owned_old(slot_rec)); + + if (rec == slot_rec) + goto found; + } + while ((slot-= 2) > end_slot); + } + + return ULINT_UNDEFINED; +found: + return --n < 0 ? ULINT_UNDEFINED : ulint(n); +} + +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: record descriptor */ +{ + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); + if (page_rec_is_comp(rec)) { + ib::info() << "n_owned: " << rec_get_n_owned_new(rec) + << "; heap_no: " << rec_get_heap_no_new(rec) + << "; next rec: " << rec_get_next_offs(rec, TRUE); + } else { + ib::info() << "n_owned: " << rec_get_n_owned_old(rec) + << "; heap_no: " << rec_get_heap_no_old(rec) + << "; next rec: " << rec_get_next_offs(rec, FALSE); + } + + page_rec_check(rec); + rec_validate(rec, offsets); +} + +#ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n) /*!< in: print n first and n last entries */ +{ + ulint n; + ulint i; + page_dir_slot_t* slot; + + n = page_dir_get_n_slots(page); + + fprintf(stderr, "--------------------------------\n" + "PAGE DIRECTORY\n" + "Page address %p\n" + "Directory stack top at offs: %lu; number of slots: %lu\n", + page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)), + (ulong) n); + for (i = 0; i < n; i++) { + slot = page_dir_get_nth_slot(page, i); + if ((i == pr_n) && (i < n - pr_n)) { + fputs(" ... \n", stderr); + } + if ((i < pr_n) || (i >= n - pr_n)) { + fprintf(stderr, + "Contents of slot: %lu: n_owned: %lu," + " rec offs: %lu\n", + (ulong) i, + (ulong) page_dir_slot_get_n_owned(slot), + (ulong) + page_offset(page_dir_slot_get_rec(slot))); + } + } + fprintf(stderr, "Total of %lu records\n" + "--------------------------------\n", + (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page))); +} + +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. 
 */
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ page_t* page = block->page.frame;
+ page_cur_t cur;
+ ulint count;
+ ulint n_recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE RECORD LIST\n"
+ "Page address %p\n", page);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cur);
+ count = 0;
+ for (;;) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ page_rec_is_leaf(cur.rec),
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+
+ if (count == pr_n) {
+ break;
+ }
+ if (page_cur_is_after_last(&cur)) {
+ break;
+ }
+ page_cur_move_to_next(&cur);
+ count++;
+ }
+
+ if (n_recs > 2 * pr_n) {
+ fputs(" ... \n", stderr);
+ }
+
+ while (!page_cur_is_after_last(&cur)) {
+ page_cur_move_to_next(&cur);
+
+ if (count + pr_n >= n_recs) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ page_rec_is_leaf(cur.rec),
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+ }
+ count++;
+ }
+
+ fprintf(stderr,
+ "Total of %lu records \n"
+ "--------------------------------\n",
+ (ulong) (count + 1));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Prints the info in a page header. */
+void
+page_header_print(
+/*==============*/
+ const page_t* page)
+{
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE HEADER INFO\n"
+ "Page address %p, n records %u (%s)\n"
+ "n dir slots %u, heap top %u\n"
+ "Page n heap %u, free %u, garbage %u\n"
+ "Page last insert %u, direction %u, n direction %u\n",
+ page, page_header_get_field(page, PAGE_N_RECS),
+ page_is_comp(page) ? "compact format" : "original format",
+ page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ page_header_get_field(page, PAGE_HEAP_TOP),
+ page_dir_get_n_heap(page),
+ page_header_get_field(page, PAGE_FREE),
+ page_header_get_field(page, PAGE_GARBAGE),
+ page_header_get_field(page, PAGE_LAST_INSERT),
+ page_get_direction(page),
+ page_header_get_field(page, PAGE_N_DIRECTION));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn) /*!< in: print rn first and last records
+ in directory */
+{
+ page_t* page = block->page.frame;
+
+ page_header_print(page);
+ page_dir_print(page, dn);
+ page_print_list(block, index, rn);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */ +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_owned; + ulint heap_no; + const page_t* page; + + page = page_align(rec); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); + + page_rec_check(rec); + rec_validate(rec, offsets); + + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } + + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { + ib::warn() << "Dir slot of rec " << page_offset(rec) + << ", n owned too big " << n_owned; + return(FALSE); + } + + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { + ib::warn() << "Heap no of rec " << page_offset(rec) + << " too big " << heap_no << " " + << page_dir_get_n_heap(page); + return(FALSE); + } + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +void +page_check_dir( +/*===========*/ + const page_t* page) /*!< in: index page */ +{ + ulint n_slots; + ulint infimum_offs; + ulint supremum_offs; + + n_slots = page_dir_get_n_slots(page); + infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0)); + supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page, + n_slots - 1)); + + if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) { + + ib::fatal() << "Page directory corruption: infimum not" + " pointed to"; + } + + if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) { + + ib::fatal() << "Page directory corruption: supremum not" + " pointed to"; + } +} +#endif /* UNIV_DEBUG */ + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(!page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) { + ib::error() << "Nonsensical number of page dir slots: " + << n_slots; + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + ib::error() + << "Record heap and dir overlap on a page, heap top " + << page_header_get_field(page, PAGE_HEAP_TOP) + << ", dir " + << page_offset(page_dir_get_nth_slot(page, + n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + ib::error() << "Record " << (rec - page) + << " is above rec heap top " + << (rec_heap_top - page); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) != 0)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { + + ib::error() << "Wrong owned count " + << rec_get_n_owned_old(rec) + << ", " << own_count << ", rec " + << (rec - page); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + ib::error() << "Dir slot does not point" + " to right rec " << (rec - page); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= srv_page_size)) { + + ib::error() << "Next record offset nonsensical " + << rec_get_next_offs(rec, FALSE) << " for rec " + << (rec - page); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > srv_page_size)) { + ib::error() << "Page record list appears" + " to be circular " << count; + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { + ib::error() << "n owned is zero in a supremum rec"; + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + ib::error() << "n slots wrong " + << slot_no << ", " << (n_slots - 1); + goto func_exit; + } + + if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS)) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + ib::error() << "n recs wrong " + << page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW << " " << (count + 1); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + srv_page_size)) { + ib::error() << "Free list record has" + " a nonsensical offset " << (rec - page); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + ib::error() << "Free list record " << (rec - page) + << " is above rec heap top " + << (rec_heap_top - page); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > srv_page_size)) { + ib::error() << "Page free list appears" + " to be circular " << count; + goto func_exit; + } + + ulint offs = rec_get_next_offs(rec, FALSE); + if (!offs) { + break; + } + if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM + || offs >= srv_page_size)) { + ib::error() << "Page free list is corrupted " << count; + goto func_exit; + } + + rec = page + offs; + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + ib::error() << "N heap is wrong " + << page_dir_get_n_heap(page) << ", " << (count + 1); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
+@return TRUE if ok */ +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots < 2 || n_slots > srv_page_size / 4)) { + ib::error() << "Nonsensical number of page dir slots: " + << n_slots; + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + ib::error() << "Record heap and dir overlap on a page," + " heap top " + << page_header_get_field(page, PAGE_HEAP_TOP) + << ", dir " << page_offset( + page_dir_get_nth_slot(page, n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page + PAGE_NEW_INFIMUM; + + for (;;) { + if (UNIV_UNLIKELY(rec < page + PAGE_NEW_INFIMUM + || rec > rec_heap_top)) { + ib::error() << "Record " << page_offset(rec) + << " is out of bounds: " + << page_offset(rec_heap_top); + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + ib::error() << "Wrong owned count " + << rec_get_n_owned_new(rec) << ", " + << own_count << ", rec " + << page_offset(rec); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + ib::error() << "Dir slot does not point" + " to right rec " << page_offset(rec); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= srv_page_size)) { + + ib::error() << "Next record offset nonsensical " + << rec_get_next_offs(rec, TRUE) + << " for rec " << page_offset(rec); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > srv_page_size)) { + ib::error() << "Page record list appears to be" + " circular " << count; + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + ib::error() << "n owned is zero in a supremum rec"; + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + ib::error() << "n slots wrong " << slot_no << ", " + << (n_slots - 1); + goto func_exit; + } + + if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS)) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + ib::error() << "n recs wrong " + << page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW << " " << (count + 1); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + srv_page_size)) { + + ib::error() << "Free list record has" + " a nonsensical offset " << page_offset(rec); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + ib::error() << "Free list 
record " << page_offset(rec) + << " is above rec heap top " + << page_offset(rec_heap_top); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > srv_page_size)) { + ib::error() << "Page free list appears to be" + " circular " << count; + goto func_exit; + } + + const ulint offs = rec_get_next_offs(rec, TRUE); + if (!offs) { + break; + } + if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM + || offs >= srv_page_size)) { + ib::error() << "Page free list is corrupted " << count; + goto func_exit; + } + + rec = page + offs; + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + ib::error() << "N heap is wrong " + << page_dir_get_n_heap(page) << ", " << (count + 1); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/** Check the consistency of an index page. +@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) +{ + const page_dir_slot_t* slot; + const rec_t* rec; + const rec_t* old_rec = NULL; + const rec_t* first_rec = NULL; + ulint offs = 0; + ulint n_slots; + ibool ret = TRUE; + ulint i; + rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_1; + rec_offs* old_offsets = offsets_2; + + rec_offs_init(offsets_1); + rec_offs_init(offsets_2); + +#ifdef UNIV_GIS_DEBUG + if (dict_index_is_spatial(index)) { + fprintf(stderr, "Page no: %lu\n", page_get_page_no(page)); + } +#endif /* UNIV_DEBUG */ + + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != dict_table_is_comp(index->table))) { + ib::error() << "'compact format' flag mismatch"; +func_exit2: + ib::error() << "Apparent corruption in space " + << page_get_space_id(page) << " page " + << page_get_page_no(page) + << " of index " << index->name + << " of table " << index->table->name; + return FALSE; + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } + } + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (!page_is_leaf(page) || page_is_empty(page) + || !dict_index_is_sec_or_ibuf(index) + || index->table->is_temporary()) { + } else if (trx_id_t sys_max_trx_id = trx_sys.get_max_trx_id()) { + trx_id_t max_trx_id = page_get_max_trx_id(page); + + if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { + ib::error() << "PAGE_MAX_TRX_ID out of bounds: " + << max_trx_id << ", " << sys_max_trx_id; + ret = FALSE; + } + } else { + ut_ad(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); + } + + /* Check first that the record heap and the directory do not + overlap. 
 */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+ <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+ ib::warn() << "Record heap and directory overlap";
+ goto func_exit2;
+ }
+
+ switch (uint16_t type = fil_page_get_type(page)) {
+ case FIL_PAGE_RTREE:
+ if (!index->is_spatial()) {
+wrong_page_type:
+ ib::warn() << "Wrong page type " << type;
+ ret = FALSE;
+ }
+ break;
+ case FIL_PAGE_TYPE_INSTANT:
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ break;
+ }
+ goto wrong_page_type;
+ case FIL_PAGE_INDEX:
+ if (index->is_spatial()) {
+ goto wrong_page_type;
+ }
+ if (index->is_instant()
+ && page_get_page_no(page) == index->page) {
+ goto wrong_page_type;
+ }
+ break;
+ default:
+ goto wrong_page_type;
+ }
+
+ /* The following buffer is used to check that the
+ records in the page record heap do not overlap */
+ mem_heap_t* heap = mem_heap_create(srv_page_size + 200);
+ byte* buf = static_cast<byte*>(mem_heap_zalloc(heap, srv_page_size));
+
+ /* Validate the record list in a loop checking also that
+ it is consistent with the directory. */
+ ulint count = 0, data_size = 0, own_count = 1, slot_no = 0;
+ ulint info_bits;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
+
+ for (;;) {
+ offsets = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_is_comp(page) && page_rec_is_user_rec(rec)
+ && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+ == page_is_leaf(page))) {
+ ib::error() << "'node_ptr' flag mismatch";
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ ret = FALSE;
+ goto next_rec;
+ }
+
+ info_bits = rec_get_info_bits(rec, page_is_comp(page));
+ if (info_bits
+ & ~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) {
+ ib::error() << "info_bits has an incorrect value "
+ << info_bits;
+ ret = false;
+ }
+
+ if (rec == first_rec) {
+ if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ if (page_has_prev(page)) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set on non-left page";
+ ret = false;
+ } else if (!page_is_leaf(page)) {
+ /* leftmost node pointer page */
+ } else if (!index->is_instant()) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG "
+ "is set in a leaf-page record";
+ ret = false;
+ } else if (!(info_bits & REC_INFO_DELETED_FLAG)
+ != !index->table->instant) {
+ ib::error() << (index->table->instant
+ ? "Metadata record "
+ "is not delete-marked"
+ : "Metadata record "
+ "is delete-marked");
+ ret = false;
+ }
+ } else if (!page_has_prev(page)
+ && index->is_instant()) {
+ ib::error() << "Metadata record is missing";
+ ret = false;
+ }
+ } else if (info_bits & REC_INFO_MIN_REC_FLAG) {
+ ib::error() << "REC_INFO_MIN_REC_FLAG record is not "
+ "first in page";
+ ret = false;
+ }
+
+ if (page_is_comp(page)) {
+ const rec_comp_status_t status = rec_get_status(rec);
+ if (status != REC_STATUS_ORDINARY
+ && status != REC_STATUS_NODE_PTR
+ && status != REC_STATUS_INFIMUM
+ && status != REC_STATUS_SUPREMUM
+ && status != REC_STATUS_INSTANT) {
+ ib::error() << "impossible record status "
+ << status;
+ ret = false;
+ } else if (page_rec_is_infimum(rec)) {
+ if (status != REC_STATUS_INFIMUM) {
+ ib::error()
+ << "infimum record has status "
+ << status;
+ ret = false;
+ }
+ } else if (page_rec_is_supremum(rec)) {
+ if (status != REC_STATUS_SUPREMUM) {
+ ib::error() << "supremum record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!page_is_leaf(page)) {
+ if (status != REC_STATUS_NODE_PTR) {
+ ib::error() << "node ptr record has "
+ "status "
+ << status;
+ ret = false;
+ }
+ } else if (!index->is_instant()
+ && status == REC_STATUS_INSTANT) {
+ ib::error() << "instantly added record in a "
+ "non-instant index";
+ ret = false;
+ }
+ }
+
+ /* Check that the records are in the ascending order */
+ if (count >= PAGE_HEAP_NO_USER_LOW
+ && !page_rec_is_supremum(rec)) {
+
+ int ret = cmp_rec_rec(
+ rec, old_rec, offsets, old_offsets, index);
+
+ /* For spatial index, on nonleaf level, we
+ allow recs to be equal. */
+ if (ret <= 0 && !(ret == 0 && index->is_spatial()
+ && !page_is_leaf(page))) {
+
+ ib::error() << "Records in wrong order";
+
+ fputs("\nInnoDB: previous record ", stderr);
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr,
+ old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ putc('\n', stderr);
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ putc('\n', stderr);
+
+ } else {
+ rec_print_new(stderr, old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+
+ ret = FALSE;
+ }
+ }
+
+ if (page_rec_is_user_rec(rec)) {
+
+ data_size += rec_offs_size(offsets);
+
+#if defined(UNIV_GIS_DEBUG)
+ /* For spatial index, print the mbr info.*/
+ if (index->type & DICT_SPATIAL) {
+ rec_print_mbr_rec(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_GIS_DEBUG */
+ }
+
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= srv_page_size)) {
+ ib::error() << "Record offset out of bounds: "
+ << offs << '+' << i;
+ ret = FALSE;
+ goto next_rec;
+ }
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ ib::error() << "Record overlaps another: "
+ << offs << '+' << i;
+ ret = FALSE;
+ break;
+ }
+ buf[offs + i] = 1;
+ }
+
+ if (ulint rec_own_count = page_is_comp(page)
+ ?
rec_get_n_owned_new(rec) + : rec_get_n_owned_old(rec)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_own_count != own_count)) { + ib::error() << "Wrong owned count at " << offs + << ": " << rec_own_count + << ", " << own_count; + ret = FALSE; + } + + if (page_dir_slot_get_rec(slot) != rec) { + ib::error() << "Dir slot does not" + " point to right rec at " << offs; + ret = FALSE; + } + + if (ret) { + page_dir_slot_check(slot); + } + + own_count = 0; + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + +next_rec: + old_rec = rec; + rec = page_rec_get_next_const(rec); + + if (UNIV_UNLIKELY(!rec != page_rec_is_supremum(old_rec))) { + ib::error() << "supremum is not last record: " << offs; + ret = FALSE; + } + + if (!rec) { + rec = old_rec; /* supremum */ + break; + } + + count++; + own_count++; + + if (page_rec_is_infimum(old_rec) + && page_rec_is_user_rec(rec)) { + first_rec = rec; + } + + /* set old_offsets to offsets; recycle offsets */ + std::swap(old_offsets, offsets); + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: + ib::error() << "n owned is zero at " << offs; + ret = FALSE; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + ib::error() << "n slots wrong " << slot_no << " " + << (n_slots - 1); + ret = FALSE; + } + + if (UNIV_UNLIKELY(ulint(page_header_get_field(page, PAGE_N_RECS)) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + ib::error() << "n recs wrong " + << page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW << " " << (count + 1); + ret = FALSE; + } + + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { + ib::error() << "Summed data size " << data_size + << ", returned by func " << page_get_data_size(page); + ret = FALSE; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + offsets = rec_get_offsets(rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + ret = FALSE; +next_free: + const ulint offs = rec_get_next_offs( + rec, page_is_comp(page)); + if (!offs) { + break; + } + if (UNIV_UNLIKELY(offs < PAGE_OLD_INFIMUM + || offs >= srv_page_size)) { + ib::error() << "Page free list is corrupted"; + ret = FALSE; + break; + } + + rec = page + offs; + continue; + } + + count++; + offs = page_offset(rec_get_start(rec, offsets)); + i = rec_offs_size(offsets); + if (UNIV_UNLIKELY(offs + i >= srv_page_size)) { + ib::error() << "Free record offset out of bounds: " + << offs << '+' << i; + ret = FALSE; + goto next_free; + } + while (i--) { + if (UNIV_UNLIKELY(buf[offs + i])) { + ib::error() << "Free record overlaps another: " + << offs << '+' << i; + ret = FALSE; + break; + } + buf[offs + i] = 1; + } + + goto next_free; + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + ib::error() << "N heap is wrong " + << page_dir_get_n_heap(page) << " " << count + 1; + ret = FALSE; + } + + mem_heap_free(heap); + + if (UNIV_UNLIKELY(!ret)) { + goto func_exit2; + } + + return(ret); +} + +/***************************************************************//** +Looks in the page record list for a record with the given heap number. 
+@return record, NULL if not found */ +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no)/*!< in: heap number */ +{ + const rec_t* rec; + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_new(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, TRUE); + } + } else { + rec = page + PAGE_OLD_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_old(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, FALSE); + } + } +} + +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ +const rec_t *page_find_rec_last_not_deleted(const page_t *page) +{ + ut_ad(page_is_leaf(page)); + + if (page_is_comp(page)) + { + const rec_t *rec= page + PAGE_NEW_INFIMUM; + const rec_t *prev_rec= rec; + do + { + if (!(rec[-REC_NEW_INFO_BITS] & + (REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))) + prev_rec= rec; + if (!(rec= page_rec_get_next_low(rec, true))) + return page + PAGE_NEW_INFIMUM; + } while (rec != page + PAGE_NEW_SUPREMUM); + return prev_rec; + } + else + { + const rec_t *rec= page + PAGE_OLD_INFIMUM; + const rec_t *prev_rec= rec; + do + { + if (!(rec[-REC_OLD_INFO_BITS] & + (REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG))) + prev_rec= rec; + if (!(rec= page_rec_get_next_low(rec, false))) + return page + PAGE_OLD_INFIMUM; + } while (rec != page + PAGE_OLD_SUPREMUM); + return prev_rec; + } +} diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc new file mode 100644 index 00000000..89e6d149 --- /dev/null +++ b/storage/innobase/page/page0zip.cc @@ -0,0 +1,4666 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0zip.cc +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#include "page0zip.h" +#include "fsp0types.h" +#include "page0page.h" +#include "buf0checksum.h" +#include "zlib.h" +#include "span.h" + +using st_::span; + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0log.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "log0recv.h" +#include "row0row.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "lock0lock.h" +#include "srv0srv.h" +#include "buf0lru.h" +#include "srv0mon.h" + +#include <map> +#include <algorithm> + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +page_zip_stat_per_index_t page_zip_stat_per_index; + +/** Compression level to be used by zlib. Settable by user. */ +uint page_zip_level; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +/** Extra bytes of an infimum record */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? */ /* next=(first user rec, or supremum) */ +}; +/** Data bytes of an infimum record */ +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +/** Extra bytes and data bytes of a supremum record */ +static const byte supremum_extra_data alignas(4) [] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +@param b in: memory block +@param s in: size of the memory block, in bytes */ +#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s)) +/** Assert that a BLOB pointer is filled with zero bytes. +@param b in: BLOB pointer */ +#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE) + +/* Enable some extra debugging output. This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +MY_ATTRIBUTE((format (printf, 1, 2))) +/**********************************************************************//** +Report a failure to decompress or compress. +@return number of characters printed */ +static +int +page_zip_fail_func( +/*===============*/ + const char* fmt, /*!< in: printf(3) format string */ + ...) 
/*!< in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +/** Wrapper for page_zip_fail_func() +@param fmt_args in: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/** Dummy wrapper for page_zip_fail_func() +@param fmt_args ignored: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ +{ + ulint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(static_cast<uLong>(2 * (n_fields + 1))); + return(lint(size) > 0 ? size : 0); +} + +/** Check whether a tuple is too big for compressed table +@param[in] index dict index object +@param[in] entry entry for the index +@return true if it's too big, otherwise false */ +bool +page_zip_is_too_big( + const dict_index_t* index, + const dtuple_t* entry) +{ + const ulint zip_size = index->table->space->zip_size(); + + /* Estimate the free space of an empty compressed page. + Subtract one byte for the encoded heap_no in the + modification log. */ + ulint free_space_zip = page_zip_empty_size( + index->n_fields, zip_size); + ulint n_uniq = dict_index_get_n_unique_in_tree(index); + + ut_ad(dict_table_is_comp(index->table)); + ut_ad(zip_size); + + if (free_space_zip == 0) { + return(true); + } + + /* Subtract one byte for the encoded heap_no in the + modification log. */ + free_space_zip--; + + /* There should be enough room for two node pointer + records on an empty non-leaf page. This prevents + infinite page splits. */ + + if (entry->n_fields >= n_uniq + && (REC_NODE_PTR_SIZE + + rec_get_converted_size_comp_prefix( + index, entry->fields, n_uniq, NULL) + /* On a compressed page, there is + a two-byte entry in the dense + page directory for every record. + But there is no record header. */ + - (REC_N_NEW_EXTRA_BYTES - 2) + > free_space_zip / 2)) { + return(true); + } + + return(false); +} + +/*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). +@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return ulint(page_dir_get_n_heap(page_zip->data)) + - PAGE_HEAP_NO_USER_LOW; +} + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). 
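+Each such record, whether in use or on the free list, occupies one
+PAGE_ZIP_DIR_SLOT_SIZE (two-byte) slot at the very end of the compressed
+page image.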
+@return length of dense page directory, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); +} + +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); +} + +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). +@return length of dense page directory comprising existing records, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * ulint(page_get_n_recs(page_zip->data)); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/*************************************************************//** +Find the slot of the given record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + byte* slot, /*!< in: start of records */ + byte* end, /*!< in: end of records */ + ulint offset) /*!< in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/*************************************************************//** +Find the slot of the given non-free record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/*************************************************************//** +Find the slot of the given free record in the dense page directory. 
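+Only the slots belonging to the free list are searched, i.e. the part of
+the dense directory between its start and the slots of the existing user
+records.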
+@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip), + end - page_zip_dir_user_size(page_zip), + offset)); +} + +/*************************************************************//** +Read a given slot in the dense page directory. +@return record offset on the uncompressed page, possibly ORed with +PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */ +UNIV_INLINE +ulint +page_zip_dir_get( +/*=============*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint slot) /*!< in: slot + (0=first user record) */ +{ + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); +} + +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. +@param[in] b ROW_FORMAT=COMPRESSED index page +@param[in] offset byte offset from b.zip.data +@param[in] len length of the data to write */ +inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len) +{ + ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX || + fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE); + ut_ad(page_zip_simple_validate(&b.page.zip)); + ut_ad(offset + len <= page_zip_get_size(&b.page.zip)); + + memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len); + m_last_offset= static_cast<uint16_t>(offset + len); +} + +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. +@param[in] b ROW_FORMAT=COMPRESSED index page +@param[in] dest destination within b.zip.data +@param[in] str the data to write +@param[in] len length of the data to write +@tparam w write request type */ +template<mtr_t::write_type w> +inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str, + ulint len) +{ + byte *d= static_cast<byte*>(dest); + const byte *s= static_cast<const byte*>(str); + ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET); + if (w != FORCED) + { + ut_ad(len); + const byte *const end= d + len; + while (*d++ == *s++) + { + if (d == end) + { + ut_ad(w == MAYBE_NOP); + return; + } + } + s--; + d--; + len= static_cast<ulint>(end - d); + } + ::memcpy(d, s, len); + zmemcpy(b, d - b.page.zip.data, len); +} + +/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page. +@param[in,out] block ROW_FORMAT=COMPRESSED index page +@param[in] index the index that the block belongs to +@param[in,out] mtr mini-transaction */ +static void page_zip_compress_write_log(buf_block_t *block, + dict_index_t *index, mtr_t *mtr) +{ + ut_ad(!index->is_ibuf()); + + if (!mtr->is_logged()) + return; + + const page_t *page= block->page.frame; + const page_zip_des_t *page_zip= &block->page.zip; + /* Read the number of user records. */ + ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) - + PAGE_HEAP_NO_USER_LOW; + /* Multiply by uncompressed of size stored per record */ + if (!page_is_leaf(page)) + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + else if (index->is_clust()) + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + else + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE; + /* Add the space occupied by BLOB pointers. 
*/ + trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_a(page_zip->m_end > PAGE_DATA); + compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA); + ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); + + mtr->init(block); + mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV); + + if (trailer_size) + mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size, + trailer_size); +} + +/******************************************************//** +Determine how many externally stored columns are contained +in existing records with smaller heap_no than rec. */ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + const rec_t* rec, /*!< in: compact physical record + on a B-tree leaf page */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/**********************************************************************//** +Encode the length of a fixed-length column. +@return buf + length of encoded val */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + byte* buf, /*!< in: pointer to buffer where to write */ + ulint val) /*!< in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/**********************************************************************//** +Write the index information for the compressed page. 
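+Each column is encoded in one or two bytes: a variable-length column becomes
+0 or 1 (nullable or NOT NULL, maximum length <= 255 bytes) or 0x7e/0x7f
+(longer maximum); a nullable fixed-length column becomes its length shifted
+left by one bit; consecutive NOT NULL fixed-length columns are summed and
+written as (length << 1) | 1, using a second byte (high bit set in the first)
+when the value does not fit in seven bits.  For example, a run consisting of
+a single NOT NULL 10-byte fixed-length column is written as the byte 0x15
+(10 << 1 | 1).  Finally, the position of the DB_TRX_ID column (clustered
+leaf pages) or the number of nullable fields (other pages) is appended in a
+similar one- or two-byte form.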
+@return used size of buf */ +ulint +page_zip_fields_encode( +/*===================*/ + ulint n, /*!< in: number of fields + to compress */ + const dict_index_t* index, /*!< in: index comprising + at least n fields */ + ulint trx_id_pos, + /*!< in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + const dict_col_t* column + = dict_field_get_col(field); + + if (DATA_BIG_COL(column)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = (byte) val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + + if (fixed_sum && UNIV_UNLIKELY + (fixed_sum + field->fixed_len + > DICT_MAX_FIXED_COL_LEN)) { + /* Write out the length of the + preceding non-nullable fields, + to avoid exceeding the maximum + length of a fixed-length column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + col++; + } + + trx_id_col = col; + fixed_sum = field->fixed_len; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode( + buf, ulint(field->fixed_len) << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + i = trx_id_col; + } else { + /* Write out the number of nullable fields */ + i = index->n_nullable; + } + + if (i < 128) { + *buf++ = (byte) i; + } else { + *buf++ = (byte) (0x80 | i >> 8); + *buf++ = (byte) i; + } + + ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2); + return((ulint) (buf - buf_start)); +} + +/**********************************************************************//** +Populate the dense page directory from the sparse directory. 
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /*!< in: compact page */ + byte* buf, /*!< in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /*!< in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY(!page_has_prev(page))) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < srv_page_size - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); + compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK + & (PAGE_ZIP_DIR_SLOT_MASK + 1))); + compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK + >= UNIV_ZIP_SIZE_MAX - 1); + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (info_bits & REC_INFO_DELETED_FLAG) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. */ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(ulint(rec_get_status(rec)) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(ulint(rec_get_status(rec)) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +extern "C" { + +/**********************************************************************//** +Allocate memory for zlib. */ +static +void* +page_zip_zalloc( +/*============*/ + void* opaque, /*!< in/out: memory heap */ + uInt items, /*!< in: number of items to allocate */ + uInt size) /*!< in: size of an item in bytes */ +{ + return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size)); +} + +/**********************************************************************//** +Deallocate memory for zlib. 
*/ +static +void +page_zip_free( +/*==========*/ + void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */ + void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */ +{ +} + +} /* extern "C" */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + z_stream* strm = static_cast<z_stream*>(stream); + + strm->zalloc = page_zip_zalloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/** Symbol for enabling compression and decompression diagnostics */ +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/** Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +static bool page_zip_compress_dbg; +/** Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +static unsigned page_zip_compress_log; + +/**********************************************************************//** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +static +int +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/*!< in: log file, or NULL */ + z_streamp strm, /*!< in/out: compressed stream for deflate() */ + int flush) /*!< in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + if (fwrite(strm->next_in, 1, strm->avail_in, logfile) + != strm->avail_in) { + perror("fwrite"); + } + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +/** Debug wrapper for the zlib compression routine deflate(). +Log the operation if page_zip_compress_dbg is set. +@param strm in/out: compressed stream +@param flush in: flushing method +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +/** Declaration of the logfile parameter */ +# define FILE_LOGFILE FILE* logfile, +/** The logfile parameter */ +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +/** Empty declaration of the logfile parameter */ +# define FILE_LOGFILE +/** Missing logfile parameter */ +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/**********************************************************************//** +Compress the records of a node pointer page. 
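+The node pointer (child page number) of each record is not compressed; it
+is copied to the storage area at the end of the page in heap_no order, so
+that it can later be updated in place without recompressing the page.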
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_node_ptrs( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + rec_offs* offsets = NULL; + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Only leaf nodes may contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + ut_ad(!c_stream->avail_in); + + /* Compress the data bytes, except node_ptr. */ + c_stream->next_in = (byte*) rec; + c_stream->avail_in = static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, REC_NODE_PTR_SIZE); + c_stream->next_in += REC_NODE_PTR_SIZE; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress the records of a leaf node of a secondary index. +@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_sec( +/*==================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense) /*!< in: size of recs[] */ +{ + int err = Z_OK; + + ut_ad(n_dense > 0); + + do { + const rec_t* rec = *recs++; + + /* Compress everything up to this record. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (UNIV_LIKELY(c_stream->avail_in != 0)) { + MEM_CHECK_DEFINED(c_stream->next_in, + c_stream->avail_in); + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + c_stream->next_in = (byte*) rec; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress a record of a leaf node of a clustered index that contains +externally stored columns. 
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust_ext( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col, /*!< in: position of of DB_TRX_ID */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + byte** externs, /*!< in/out: pointer to the next + available BLOB pointer */ + ulint* n_blobs) /*!< in/out: number of + externally stored columns */ +{ + int err; + ulint i; + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, i)); + /* Store trx_id and roll_ptr + in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + i++; + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + if (UNIV_LIKELY(c_stream->avail_in != 0)) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the page + modification log. */ + + if (UNIV_UNLIKELY + (c_stream->avail_out + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + return(Z_BUF_ERROR); + } + + ut_ad(*externs == c_stream->next_out + + c_stream->avail_out + + 1/* end of modif. log */); + + c_stream->next_in + += BTR_EXTERN_FIELD_REF_SIZE; + + /* Skip deleted records. */ + if (UNIV_LIKELY_NULL + (page_zip_dir_find_low( + storage, deleted, + page_offset(rec)))) { + continue; + } + + (*n_blobs)++; + c_stream->avail_out + -= BTR_EXTERN_FIELD_REF_SIZE; + *externs -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Copy the BLOB pointer */ + memcpy(*externs, c_stream->next_in + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + return(Z_OK); +} + +/**********************************************************************//** +Compress the records of a leaf node of a clustered index. 
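+The DB_TRX_ID and DB_ROLL_PTR columns of every record are stored
+uncompressed in the storage area in heap_no order, and each
+BTR_EXTERN_FIELD_REF (BLOB pointer) of a record that is not on the free
+list is appended to the externs area that grows downwards just below it.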
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust( +/*====================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* n_blobs, /*!< in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /*!< index of the trx_id column */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + rec_offs* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. */ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + if (rec_offs_any_extern(offsets)) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. */ + c_stream->avail_in = static_cast<uInt>( + rec + rec_offs_data_size(offsets) - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err);} + +/** Attempt to compress a ROW_FORMAT=COMPRESSED page. 
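+On success, page_zip->data holds the page header, the deflate stream, a
+zero-filled modification log, and an uncompressed trailer consisting of the
+dense page directory together with any node pointers, DB_TRX_ID,DB_ROLL_PTR
+values and BLOB pointers.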
+@retval true on success +@retval false on failure; block->page.zip will be left intact. */ +bool +page_zip_compress( + buf_block_t* block, /*!< in/out: buffer block */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: commpression level */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + z_stream c_stream; + int err; + byte* fields; /*!< index field information */ + byte* buf; /*!< compressed payload of the + page */ + byte* buf_end; /* end of buf */ + ulint n_dense; + ulint slot_size; /* amount of uncompressed bytes + per record */ + const rec_t** recs; /*!< dense page directory, + sorted by address */ + mem_heap_t* heap; + ulint trx_id_col = ULINT_UNDEFINED; + ulint n_blobs = 0; + byte* storage; /* storage of uncompressed + columns */ + const ulonglong ns = my_interval_timer(); +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. */ + my_bool cmp_per_index_enabled; + cmp_per_index_enabled = srv_cmp_per_index_enabled; + + page_t* page = block->page.frame; + page_zip_des_t* page_zip = &block->page.zip; + + ut_a(page_is_comp(page)); + ut_a(fil_page_index_page_check(page)); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(!dict_index_is_ibuf(index)); + + MEM_CHECK_DEFINED(page, srv_page_size); + + /* Check the data that will be omitted. */ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (page_is_empty(page)) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + const ulint n_fields = page_is_leaf(page) + ? dict_index_get_n_fields(index) + : dict_index_get_n_unique_in_tree_nonleaf(index); + index_id_t ind_id = index->id; + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ib::info() << "compress " + << static_cast<void*>(page_zip) << " " + << static_cast<const void*>(page) << " " + << page_is_leaf(page) << " " + << n_fields << " " << n_dense; + } + + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. */ + if (fwrite(page, 1, srv_page_size, logfile) + != srv_page_size) { + perror("fwrite"); + } + /* Record the compressed size as zero. + This will be overwritten at successful exit. 
*/ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed++; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + MONITOR_INC(MONITOR_PAGE_COMPRESS); + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + srv_page_size * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = static_cast<const rec_t**>( + mem_heap_zalloc(heap, n_dense * sizeof *recs)); + + fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2)); + + buf = static_cast<byte*>( + mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA)); + + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, static_cast<int>(level), + Z_DEFLATED, static_cast<int>(srv_page_size_shift), + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + + /* Subtract the space reserved for uncompressed data. */ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1); + + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = index->db_trx_id(); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= uInt(n_dense * slot_size); + c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index, + trx_id_col, fields)); + c_stream.next_in = fields; + + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. 
*/ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in = static_cast<uInt>( + page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page)); + ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR); + + MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].compressed_usec + += time_diff; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed_usec + += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + return false; + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ + MEM_MAKE_DEFINED(buf, c_stream.total_out); +#endif /* HAVE_valgrind && !memory_sanitizer */ + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. */ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out); + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1); + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_zip_compress_write_log(block, index, mtr); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. 
*/ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, srv_page_size, SEEK_SET); + if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) { + perror("fwrite"); + } + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed_ok++; + page_zip_stat_per_index[ind_id].compressed_usec += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); + } + + return true; +} + +/**********************************************************************//** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /*!< in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + index->zip_pad.mutex.~mutex(); + mem_heap_free(index->heap); + + dict_mem_table_free(table); + } +} + +/**********************************************************************//** +Read the index information for the compressed page. +@return own: dummy index describing the page, or NULL on error */ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + const byte* buf, /*!< in: index information */ + const byte* end, /*!< in: end of buf */ + ulint* trx_id_col,/*!< in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ + bool is_spatial)/*< in: is spatial index or not */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_table_t::create({C_STRING_WITH_LEN("ZIP_DUMMY")}, + nullptr, n, 0, DICT_TF_COMPACT, 0); + index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n); + index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. */ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. 
*/ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { +fail: + page_zip_fields_free(index); + return NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. */ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + goto fail; + } else { + index->n_nullable = static_cast<unsigned>(val) + & dict_index_t::MAX_N_FIELDS; + } + } + + /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */ + index->n_core_fields = index->n_fields; + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + + ut_ad(b == end); + + if (is_spatial) { + index->type |= DICT_SPATIAL; + } + + return(index); +} + +/**********************************************************************//** +Populate the sparse page directory from the dense directory. +@return TRUE on success, FALSE on failure */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ibool +page_zip_dir_decode( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + page_t* page, /*!< in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /*!< out: dense page directory sorted by + ascending address (and heap_no) */ + ulint n_dense)/*!< in: number of user records, and + size of recs[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. */ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1U); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + std::sort(recs, recs + n_dense); + return(TRUE); +} + +/**********************************************************************//** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. 
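+This reconstructs from the dense page directory the parts of the record
+headers that were not compressed: the next-record pointers, the info bits
+(delete-mark and minimum-record flags) and the n_owned counts, both for the
+existing records and for the free list.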
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + page_t* page, /*!< in/out: uncompressed page */ + ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_DEL) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. */ + for (;;) { + if (UNIV_UNLIKELY(!offs) + || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + + page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n", + (ulong) offs)); + return(FALSE); + } + + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + if (++i == n) { + break; + } + + offs = page_zip_dir_get(page_zip, i); + rec_set_next_offs_new(rec, offs); + } + + /* Terminate the free list. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + rec_set_next_offs_new(rec, 0); + + return(TRUE); +} + +/**********************************************************************//** +Apply the modification log to a record containing externally stored +columns. Do not copy the fields that are stored separately. +@return pointer to modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log_ext( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col, /*!< in: position of of DB_TRX_ID */ + const byte* data, /*!< in: modification log */ + const byte* end) /*!< in: end of modification log */ +{ + ulint i; + ulint len; + byte* next_out = rec; + + /* Check if there are any externally stored columns. + For each externally stored column, skip the + BTR_EXTERN_FIELD_REF. 
*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, + i, &len); + if (UNIV_UNLIKELY(dst - next_out >= end - data) + || UNIV_UNLIKELY + (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) + || rec_offs_nth_extern(offsets, i)) { + page_zip_fail(("page_zip_apply_log_ext:" + " trx_id len %lu," + " %p - %p >= %p - %p\n", + (ulong) len, + (const void*) dst, + (const void*) next_out, + (const void*) end, + (const void*) data)); + return(NULL); + } + + memcpy(next_out, data, ulint(dst - next_out)); + data += ulint(dst - next_out); + next_out = dst + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + + len += ulint(dst - next_out) + - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext:" + " ext %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(next_out, data, len); + data += len; + next_out += len + + BTR_EXTERN_FIELD_REF_SIZE; + } + } + + /* Copy the last bytes of the record. */ + len = ulint(rec_get_end(rec, offsets) - next_out); + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext:" + " last %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(next_out, data, len); + data += len; + + return(data); +} + +/**********************************************************************//** +Apply the modification log to an uncompressed page. +Do not copy the fields that are stored separately. +@return pointer to end of modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log( +/*===============*/ + const byte* data, /*!< in: modification log */ + ulint size, /*!< in: maximum length of the log, in bytes */ + rec_t** recs, /*!< in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/*!< in: size of recs[] */ + ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */ + ulint trx_id_col,/*!< in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /*!< in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /*!< in: index of the page */ + rec_offs* offsets)/*!< in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. 
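+		The entry header that was just read encodes (heap_no - 1) << 1,
+		in one byte or, with the high bit of the first byte set, in two
+		bytes; the low bit is set when the entry merely clears the data
+		bytes of an existing record.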
*/ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. */ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + rec_offs* offs; + offs = rec_get_offsets(rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + + compile_time_assert(REC_STATUS_NODE_PTR == TRUE); + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + /* Silence a debug assertion in rec_offs_make_valid(). + This will be overwritten in page_zip_set_extra_bytes(), + called by page_zip_decompress_low(). */ + ut_d(rec[-REC_NEW_INFO_BITS] = 0); + rec_offs_make_valid(rec, index, n_core != 0, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. */ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log:" + " %lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log:" + " trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. 
*/ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = ulint(rec_get_end(rec, offsets) - b); + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/**********************************************************************//** +Set the heap_no in a record, and skip the fixed-size record header +that is not included in the d_stream. +@return TRUE on success, FALSE if d_stream does not end at rec */ +static +ibool +page_zip_decompress_heap_no( +/*========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + ulint& heap_status) /*!< in/out: heap_no and status bits */ +{ + if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) { + /* n_dense has grown since the page was last compressed. */ + return(FALSE); + } + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a node pointer page. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + rec_offs* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>( + n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE)); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < srv_page_size + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. 
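+		The 4-byte child page number (REC_NODE_PTR_SIZE) is not part
+		of the compressed stream; it resides in the uncompressed
+		"storage" area at the end of the compressed page and is
+		copied back into the record further below, after the
+		modification log has been applied.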
*/ + d_stream->avail_out =static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in); +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, 0, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index)) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index)), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip_dir_start_low(page_zip, n_dense); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a leaf node of a secondary index. 
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_sec( +/*====================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + rec_offs* offsets) /*!< in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uint>( + n_dense * PAGE_ZIP_DIR_SLOT_SIZE); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = static_cast<uint>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_sec:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + + ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in)); + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + index->n_fields, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, FALSE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + /* There are no uncompressed columns on leaf pages of + secondary indexes. 
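+	Only the 2-byte dense directory slots were subtracted from avail_in
+	above; unlike node pointer pages and clustered index leaf pages,
+	there is no trailer of node pointers or DB_TRX_ID,DB_ROLL_PTR and
+	BLOB references to copy back into the records.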
*/
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t*		rec,		/*!< in/out: record */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col)	/*!< in: position of DB_TRX_ID */
+{
+	ulint	i;
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		ulint	len;
+		byte*	dst;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			/* Skip trx_id and roll_ptr */
+			dst = rec_get_nth_field(rec, offsets, i, &len);
+			if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+					  + DATA_ROLL_PTR_LEN)) {
+
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " len[%lu] = %lu\n",
+					       (ulong) i, (ulong) len));
+				return(FALSE);
+			}
+
+			if (rec_offs_nth_extern(offsets, i)) {
+
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " DB_TRX_ID at %lu is ext\n",
+					       (ulong) i));
+				return(FALSE);
+			}
+
+			d_stream->avail_out = static_cast<uInt>(
+				dst - d_stream->next_out);
+
+			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+			case Z_STREAM_END:
+			case Z_OK:
+			case Z_BUF_ERROR:
+				if (!d_stream->avail_out) {
+					break;
+				}
+				/* fall through */
+			default:
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+					       d_stream->msg));
+				return(FALSE);
+			}
+
+			ut_ad(d_stream->next_out == dst);
+
+			/* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+			avoid uninitialized bytes in case the record
+			is affected by page_zip_apply_log(). */
+			memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+			d_stream->next_out += DATA_TRX_ID_LEN
+				+ DATA_ROLL_PTR_LEN;
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			dst = rec_get_nth_field(rec, offsets, i, &len);
+			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+			d_stream->avail_out = static_cast<uInt>(
+				dst - d_stream->next_out);
+			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+			case Z_STREAM_END:
+			case Z_OK:
+			case Z_BUF_ERROR:
+				if (!d_stream->avail_out) {
+					break;
+				}
+				/* fall through */
+			default:
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+					       d_stream->msg));
+				return(FALSE);
+			}
+
+			ut_ad(d_stream->next_out == dst);
+
+			/* Clear the BLOB pointer in case
+			the record will be deleted and the
+			space will not be reused. Note that
+			the final initialization of the BLOB
+			pointers (copying from "externs"
+			or clearing) will have to take place
+			only after the page modification log
+			has been applied. Otherwise, we
+			could end up with an uninitialized
+			BLOB pointer when a record is deleted,
+			reallocated and deleted. */
+			memset(d_stream->next_out, 0,
+			       BTR_EXTERN_FIELD_REF_SIZE);
+			d_stream->next_out
+				+= BTR_EXTERN_FIELD_REF_SIZE;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
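+Besides inflating the compressed stream, this restores DB_TRX_ID,DB_ROLL_PTR
+and any BLOB references from the uncompressed area at the end of the
+compressed page, after the modification log has been applied.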
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_clust( +/*======================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint trx_id_col, /*!< index of the trx_id column */ + rec_offs* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err; + ulint slot; + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + const byte* storage; + const byte* externs; + + ut_a(dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>(n_dense) + * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out =static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < srv_page_size + - PAGE_ZIP_START - PAGE_DIR); + err = inflate(d_stream, Z_SYNC_FLUSH); + switch (err) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (UNIV_LIKELY(!d_stream->avail_out)) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + /* This is a leaf page in a clustered index. */ + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately. */ + + if (rec_offs_any_extern(offsets)) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. 
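+		That is, everything between the uncompressed columns handled
+		above and rec_get_end(rec, offsets).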
*/ + d_stream->avail_out = static_cast<uInt>( + rec_get_end(rec, offsets) - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + + ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in)); + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + index->n_fields, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip_dir_start_low(page_zip, n_dense); + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + bool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. 
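+		The 20-byte references of records that are still in use are
+		copied back from the "externs" array in heap_no order;
+		the references of records on the PAGE_FREE list are cleared.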
*/ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust:" + " %p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record: + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_low( +/*====================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /*!< dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + rec_offs* offsets; + + ut_ad(page_zip_simple_validate(page_zip)); + MEM_CHECK_ADDRESSABLE(page, srv_page_size); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size); + + recs = static_cast<rec_t**>( + mem_heap_alloc(heap, n_dense * sizeof *recs)); + + if (all) { + /* Copy the page header. */ + memcpy_aligned<2>(page, page_zip->data, PAGE_DATA); + } else { + /* Check that the bytes that we skip are identical. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(FIL_PAGE_TYPE + page, + FIL_PAGE_TYPE + page_zip->data, + PAGE_HEADER - FIL_PAGE_TYPE)); + ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page, + PAGE_HEADER + PAGE_LEVEL + page_zip->data, + PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL))); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + /* Copy the mutable parts of the page header. */ + memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE); + memcpy_aligned<2>(PAGE_HEADER + page, + PAGE_HEADER + page_zip->data, + PAGE_LEVEL - PAGE_N_DIR_SLOTS); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the page headers match after copying. 
*/ + ut_a(!memcmp(page, page_zip->data, PAGE_DATA)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + +#ifdef UNIV_ZIP_DEBUG + /* Clear the uncompressed page, except the header. */ + memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA); +#endif /* UNIV_ZIP_DEBUG */ + MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA); + + /* Copy the page directory. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. */ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (page_is_empty(page)) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1 + + page, supremum_extra_data, + sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. */ + d_stream.avail_in = static_cast<uInt>( + page_zip_get_size(page_zip) - (PAGE_DATA + 1)); + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START); + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift)) + != Z_OK)) { + ut_error; + } + + /* Decode the zlib header and the index information. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL, + fil_page_get_type(page) == FIL_PAGE_RTREE); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, n * sizeof(ulint))); + + rec_offs_set_n_alloc(offsets, n); + } + + /* Decompress the records in heap_no order. */ + if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. 
*/ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + MEM_CHECK_DEFINED(page, srv_page_size); + + page_zip_fields_free(index); + mem_heap_free(heap); + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + const ulonglong ns = my_interval_timer(); + + if (!page_zip_decompress_low(page_zip, page, all)) { + return(FALSE); + } + + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + /* Update the stat counter for LRU policy. */ + buf_LRU_stat_inc_unzip(); + + MONITOR_INC(MONITOR_PAGE_DECOMPRESS); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Dump a block of memory on the standard error stream. */ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /*!< in: name of the data structure */ + const void* buf, /*!< in: data */ + ulint size) /*!< in: length of the data, in bytes */ +{ + const byte* s = static_cast<const byte*>(buf); + ulint addr; + const ulint width = 32; /* bytes per line */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +/** Dump a block of memory on the standard error stream. +@param buf in: data +@param size in: length of the data, in bytes */ +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/** Flag: make page_zip_validate() compare page headers only */ +bool page_zip_validate_header_only; + +/**********************************************************************//** +Check that the compressed and decompressed pages match. 
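+This is done by decompressing page_zip into a temporary buffer and
+comparing the result with the uncompressed page.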
+@return TRUE if valid, FALSE if not */ +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_ROOT_AUTO_INC) + /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch + on the clustered index root page (page number 3 in .ibd files). + That allows concurrent readers (holding buf_block_t::lock S-latch). + Because we do not know what type of a latch our caller is holding, + we will ignore the field on clustered index root pages in order + to avoid false positives. */ + || (page_get_page_no(page) != 3/* clustered index root page */ + && memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], + &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8)) + || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END], + &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END], + PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, srv_page_size); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + srv_page_size aligned. */ + page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + temp_page_zip = *page_zip; + valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. Let us tolerate that difference when we + are performing a sloppy validation. 
*/ + + rec_offs* offsets; + mem_heap_t* heap; + const rec_t* rec; + const rec_t* trec; + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5/*REC_NEW_INFO_BITS*/; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + srv_page_size - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. */ + page_zip_fail(("page_zip_validate:" + " min_rec_flag" + " (%s" UINT32PF "," UINT32PF + ",0x%02x)\n", + sloppy ? "ignored, " : "", + page_get_space_id(page), + page_get_page_no(page), + page[offset])); + /* We don't check for spatial index, since + the "minimum record" could be deleted when + doing rtr_update_mbr_field. + GIS_FIXME: need to validate why + rtr_update_mbr_field.() could affect this */ + if (index && dict_index_is_spatial(index)) { + valid = true; + } else { + valid = sloppy; + } + goto func_exit; + } + } + + /* Compare the pointers in the PAGE_FREE list. */ + rec = page_header_get_ptr(page, PAGE_FREE); + trec = page_header_get_ptr(temp_page, PAGE_FREE); + + while (rec || trec) { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate:" + " PAGE_FREE list: %u!=%u\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + goto func_exit; + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } + + /* Compare the records. */ + heap = NULL; + offsets = NULL; + rec = page_rec_get_next_low( + page + PAGE_NEW_INFIMUM, TRUE); + trec = page_rec_get_next_low( + temp_page + PAGE_NEW_INFIMUM, TRUE); + const ulint n_core = (index && page_is_leaf(page)) + ? index->n_fields : 0; + + do { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate:" + " record list: 0x%02x!=0x%02x\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + break; + } + + if (index) { + /* Compare the data. */ + offsets = rec_get_offsets( + rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + + if (memcmp(rec - rec_offs_extra_size(offsets), + trec - rec_offs_extra_size(offsets), + rec_offs_size(offsets))) { + page_zip_fail( + ("page_zip_validate:" + " record content: 0x%02x", + (unsigned) page_offset(rec))); + valid = FALSE; + break; + } + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } while (rec || trec); + + if (heap) { + mem_heap_free(heap); + } + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, srv_page_size); + page_zip_hexdump(temp_page, srv_page_size); + } + aligned_free(temp_page); + return(valid); +} + +/**********************************************************************//** +Check that the compressed and decompressed pages match. 
+@return TRUE if valid, FALSE if not */ +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ +{ + return(page_zip_validate_low(page_zip, page, index, + recv_recovery_is_on())); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Assert that the compressed and decompressed page headers match. +@return TRUE */ +static +ibool +page_zip_header_cmp( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const byte* page) /*!< in: uncompressed page */ +{ + ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. +@return end of modification log */ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + buf_block_t* block, /*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + const dict_index_t*index, /*!< in: record descriptor */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + ulint create, /*!< in: nonzero=insert, zero=update */ + ulint trx_id_col, /*!< in: position of DB_TRX_ID */ + ulint heap_no, /*!< in: heap number of rec */ + byte* storage, /*!< in: end of dense page directory */ + byte* data, /*!< in: end of modification log */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + + ut_ad(rec_offs_validate(rec, index, offsets)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. 
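+	When create==TRUE, page_zip->n_blobs does not yet include the
+	externally stored columns of rec; they are accounted for, and the
+	existing BLOB references are shifted, in the n_ext block below.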
*/ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * FIELD_REF_SIZE + < externs - FIELD_REF_SIZE * page_zip->n_blobs); + + if (n_ext) { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * FIELD_REF_SIZE; + + if (create) { + page_zip->n_blobs = (page_zip->n_blobs + n_ext) + & ((1U << 12) - 1); + ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE); + if (ulint len = ulint(externs - ext_end)) { + byte* ext_start = ext_end + - n_ext * FIELD_REF_SIZE; + memmove(ext_start, ext_end, len); + mtr->memmove(*block, + ext_start - page_zip->data, + ext_end - page_zip->data, len); + } + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - start); + memcpy(data, start, ulint(src - start)); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + constexpr ulint sys_len = DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + byte* sys = storage - sys_len * (heap_no - 1); + memcpy(sys, src, sys_len); + i++; /* skip also roll_ptr */ + mtr->zmemcpy(*block, sys - page_zip->data, sys_len); + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len >= FIELD_REF_SIZE); + src += len - FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, ulint(src - start)); + data += src - start; + start = src + FIELD_REF_SIZE; + + /* Store the BLOB pointer. */ + externs -= FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, FIELD_REF_SIZE); + mtr->zmemcpy(*block, externs - page_zip->data, + FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - ulint(start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/** Write an entire record to the ROW_FORMAT=COMPRESSED page. +The data must already have been written to the uncompressed page. 
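+The record data is appended to the modification log, while DB_TRX_ID and
+DB_ROLL_PTR, node pointers and BLOB references are stored in the
+uncompressed area at the end of the compressed page instead.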
+@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in] rec record in the uncompressed page +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] create nonzero=insert, zero=update +@param[in,out] mtr mini-transaction */ +void page_zip_write_rec(buf_block_t *block, const byte *rec, + const dict_index_t *index, const rec_offs *offsets, + ulint create, mtr_t *mtr) +{ + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + byte s = *slot; + /* Copy the delete mark. */ + if (rec_get_deleted_flag(rec, TRUE)) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. + On non-leaf pages, the delete-mark flag is garbage. */ + ut_ad(!index->is_primary() || !page_is_leaf(page) + || row_get_rec_trx_id(rec, index, offsets)); + s |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8)); + } + + if (s != *slot) { + *slot = s; + mtr->zmemcpy(*block, slot - page_zip->data, 1); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. */ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip_dir_start(page_zip); + + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + block, + rec, index, offsets, create, + index->db_trx_id(), heap_no, + storage, data, mtr); + } else { + /* Locate trx_id and roll_ptr. 
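+				DB_TRX_ID (6 bytes) is immediately followed
+				by DB_ROLL_PTR (7 bytes); these 13 bytes are
+				copied to the per-record slot in the
+				uncompressed "storage" area rather than to
+				the modification log.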
*/ + ulint len; + const byte* src + = rec_get_nth_field(rec, offsets, + index->db_trx_id(), + &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + index->db_roll_ptr(), &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, ulint(src - rec)); + data += src - rec; + + /* Store trx_id and roll_ptr. */ + constexpr ulint sys_len + = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + byte* sys = storage - sys_len * (heap_no - 1); + memcpy(sys, src, sys_len); + + src += sys_len; + mtr->zmemcpy(*block, sys - page_zip->data, + sys_len); + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - ulint(src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + ulint len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr, + rec + len, REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + mtr->zmemcpy(*block, page_zip->m_end, + data - page_zip->data - page_zip->m_end); + page_zip->m_end = uint16_t(data - page_zip->data); + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
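+Only the 20-byte BTR_EXTERN_FIELD_REF of column n is copied into the
+"externs" array of the compressed page; nothing is appended to the
+modification log.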
*/ +void +page_zip_write_blob_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const byte* field; + byte* externs; + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + ulint blob_no; + ulint len; + + ut_ad(page_align(rec) == page); + ut_ad(index != NULL); + ut_ad(offsets != NULL); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field, + BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. */ +void +page_zip_write_node_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + byte* field; + byte* storage; + page_zip_des_t* const page_zip = &block->page.zip; + + ut_d(const page_t* const page = block->page.frame); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, size); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + compile_time_assert(REC_NODE_PTR_SIZE == 4); + mach_write_to_4(field, ptr); + mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE); +} + +/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. 
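+The values are written both to the record on the uncompressed page and to
+the per-record slot in the uncompressed trailer of the compressed page.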
+@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in,out] rec record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields) +@param[in] trx_id DB_TRX_ID value (transaction identifier) +@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer) +@param[in,out] mtr mini-transaction */ +void +page_zip_write_trx_id_and_roll_ptr( + buf_block_t* block, + byte* rec, + const rec_offs* offsets, + ulint trx_id_col, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + mtr_t* mtr) +{ + page_zip_des_t* const page_zip = &block->page.zip; + + ut_d(const page_t* const page = block->page.frame); + ut_ad(page_align(rec) == page); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + const ulint heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len; + + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + ulint len; + byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + compile_time_assert(DATA_TRX_ID_LEN == 6); + mach_write_to_6(field, trx_id); + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + len = 0; + if (heap_no > PAGE_HEAP_NO_USER_LOW) { + byte* prev = storage + sys_len; + for (; len < sys_len && prev[len] == field[len]; len++); + if (len > 4) { + /* We save space by replacing a single record + + WRITE,offset(storage),byte[13] + + with up to two records: + + MEMMOVE,offset(storage),len(1 byte),+13(1 byte), + WRITE|0x80,0,byte[13-len] + + The single WRITE record would be x+13 bytes long (x>2). + The MEMMOVE record would be x+1+1 = x+2 bytes, and + the second WRITE would be 1+1+13-len = 15-len bytes. + + The total size is: x+13 versus x+2+15-len = x+17-len. + To save space, we must have len>4. */ + memcpy(storage, prev, len); + mtr->memmove(*block, ulint(storage - page_zip->data), + ulint(storage - page_zip->data) + sys_len, + len); + storage += len; + field += len; + if (UNIV_LIKELY(len < sys_len)) { + goto write; + } + } else { + len = 0; + goto write; + } + } else { +write: + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field, + sys_len - len); + } +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage - len, field - len, sys_len)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. 
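+Only the columns that are stored outside the compressed stream are zeroed:
+the node pointer of a non-leaf record, or DB_TRX_ID,DB_ROLL_PTR and any
+BLOB references of a clustered index leaf record.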
*/ +static +void +page_zip_clear_rec( +/*===============*/ + buf_block_t* block, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint heap_no; + byte* storage; + byte* field; + ulint len; + + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t* const page_zip = &block->page.zip; + + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. */ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, block->page.frame)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if (!page_is_leaf(block->page.frame)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. */ + storage = page_zip_dir_start(page_zip); + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + storage -= (heap_no - 1) * REC_NODE_PTR_SIZE; + len = REC_NODE_PTR_SIZE; +clear_page_zip: + memset(storage, 0, len); + mtr->memset(*block, storage - page_zip->data, len, 0); + } else if (index->is_clust()) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. */ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + field = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + storage = page_zip_dir_start(page_zip) + - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + goto clear_page_zip; + } else { + ut_ad(!rec_offs_any_extern(offsets)); + } +} + +/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record. 
+@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in] flag the value of the delete-mark flag +@param[in,out] mtr mini-transaction */ +void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag, + mtr_t *mtr) +{ + ut_ad(page_align(rec) == block->page.frame); + byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec)); + byte b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + else + b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8)); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(&block->page.zip, block->page.frame, nullptr)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +void +page_zip_rec_set_owned( +/*===================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag, /*!< in: the owned flag (nonzero=TRUE) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t *const page_zip= &block->page.zip; + byte *slot= page_zip_dir_find(page_zip, page_offset(rec)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + byte b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + else + b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8)); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1); +} + +/**********************************************************************//** +Insert a record to the dense page directory. */ +void +page_zip_dir_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + uint16_t free_rec,/*!< in: record from which rec was + allocated, or 0 */ + byte* rec, /*!< in: record to insert */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(page_align(cursor->rec) == cursor->block->page.frame); + ut_ad(page_align(rec) == cursor->block->page.frame); + page_zip_des_t *const page_zip= &cursor->block->page.zip; + + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(cursor->rec != rec); + ut_ad(page_rec_get_next_const(cursor->rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(cursor->rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(cursor->rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1U); + + if (UNIV_UNLIKELY(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. 
*/ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(page_offset(rec) >= free_rec); + slot_free = page_zip_dir_find(page_zip, free_rec); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + if (const ulint slot_len = ulint(slot_rec - slot_free)) { + /* Shift the dense directory to allocate place for rec. */ + memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, slot_len); + mtr->memmove(*cursor->block, (slot_free - page_zip->data) + - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } + + /* Write the entry for the inserted record. + The "owned" flag must be zero. */ + uint16_t offs = page_offset(rec); + if (rec_get_deleted_flag(rec, true)) { + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs); + mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data + - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE); +} + +/** Shift the dense page directory and the array of BLOB pointers +when a record is deleted. +@param[in,out] block index page +@param[in,out] rec record being deleted +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] free previous start of the free list +@param[in,out] mtr mini-transaction */ +void page_zip_dir_delete(buf_block_t *block, byte *rec, + const dict_index_t *index, const rec_offs *offsets, + const byte *free, mtr_t *mtr) +{ + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t *const page_zip= &block->page.zip; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + mach_write_to_2(rec - REC_NEXT, + free ? static_cast<uint16_t>(free - rec) : 0); + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, page_free, page_offset(rec)); + byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, garbage, rec_offs_size(offsets) + + mach_read_from_2(garbage)); + compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2); + memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4); + byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot_rec); + uint16_t n_recs= page_get_n_recs(block->page.frame); + ut_ad(n_recs); + ut_ad(n_recs > 1 || page_get_page_no(block->page.frame) == index->page); + /* This could not be done before page_zip_dir_find(). */ + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, page_n_recs, n_recs - 1U); + memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs, + 2); + + byte *slot_free; + + if (UNIV_UNLIKELY(!free)) + /* Make the last slot the start of the free list. 
*/ + slot_free= page_zip->data + page_zip_get_size(page_zip) - + PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) - + PAGE_HEAP_NO_USER_LOW); + else + { + slot_free= page_zip_dir_find_free(page_zip, page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free+= PAGE_ZIP_DIR_SLOT_SIZE; + } + + const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0; + if (slot_len) + { + memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_len); + mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + mtr->zmemcpy(*block, slot_free - page_zip->data, 2); + + if (const ulint n_ext= rec_offs_n_extern(offsets)) + { + ut_ad(index->is_primary()); + ut_ad(page_is_leaf(block->page.frame)); + + /* Shift and zero fill the array of BLOB pointers. */ + ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + byte *externs= page_zip->data + page_zip_get_size(page_zip) - + (page_dir_get_n_heap(block->page.frame) - PAGE_HEAP_NO_USER_LOW) * + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; + byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE; + + /* Shift and zero fill the array. */ + if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) * + BTR_EXTERN_FIELD_REF_SIZE) + { + memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len); + mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE, + ext_end - page_zip->data, ext_len); + } + memset(ext_end, 0, n_ext * FIELD_REF_SIZE); + mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0); + page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1); + } + + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. */ + rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */ + + page_zip_clear_rec(block, rec, index, offsets, mtr); +} + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, redo log will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return error code +@retval DB_FAIL on overflow; the block_zip will be left intact */ +dberr_t +page_zip_reorganize( + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint z_level,/*!< in: compression level */ + mtr_t* mtr, /*!< in: mini-transaction */ + bool restore)/*!< whether to restore on failure */ +{ + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block->page.zip.data); + ut_ad(page_is_comp(page)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(!index->table->is_temporary()); + /* Note that page_zip_validate(page_zip, page, index) may fail here. 
*/ + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data, + page_zip_get_size(buf_block_get_page_zip(block))); + + /* Disable logging */ + mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + temp_block = buf_block_alloc(); + btr_search_drop_page_hash_index(block, false); + temp_page = temp_block->page.frame; + + /* Copy the old page to temporary space */ + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_page, block->page.frame, + srv_page_size); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, true); + if (index->is_spatial()) { + mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE); + memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE, + page + FIL_PAGE_TYPE, 2); + memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8); + memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8); + } + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + dberr_t err = page_copy_rec_list_end_no_locks( + block, temp_block, page_get_infimum_rec(temp_page), + index, mtr); + + /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ + memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8); + /* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */ + ut_ad(err != DB_SUCCESS + || index->is_clust() || !page_is_leaf(temp_page) + || page_get_max_trx_id(page) != 0); + /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than + clustered index root pages. */ + ut_ad(err != DB_SUCCESS + || page_get_max_trx_id(page) == 0 + || (index->is_clust() + ? !page_has_siblings(temp_page) + : page_is_leaf(temp_page))); + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(block, index, z_level, mtr)) { + if (restore) { + /* Restore the old page and exit. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the bytes that we skip are identical. */ + ut_a(!memcmp(page, temp_page, PAGE_HEADER)); + ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page, + PAGE_HEADER + PAGE_N_RECS + temp_page, + PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS))); + ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page, + srv_page_size - FIL_PAGE_DATA_END + + temp_page, + FIL_PAGE_DATA_END)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page, + PAGE_N_RECS - PAGE_N_DIR_SLOTS); + memcpy(PAGE_DATA + page, PAGE_DATA + temp_page, + srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(page, temp_page, srv_page_size)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + + err = DB_FAIL; + } else { + lock_move_reorganize_page(block, temp_block); + } + + buf_block_free(temp_block); + return err; +} + +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. 
*/ +void +page_zip_copy_recs( + buf_block_t* block, /*!< in/out: buffer block */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + page_t* page = block->page.frame; + page_zip_des_t* page_zip = &block->page.zip; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(!index->table->is_temporary()); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, index, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + MEM_CHECK_ADDRESSABLE(page, srv_page_size); + MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(src, srv_page_size); + MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. */ + compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src, + srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END)); + memcpy_aligned<2>(PAGE_HEADER + page_zip->data, + PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_DATA + page_zip->data, + PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + if (dict_index_is_clust(index)) { + /* Reset the PAGE_ROOT_AUTO_INC field when copying + from a root page. */ + memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + page, 0, 8); + memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + page_zip->data, 0, 8); + } else { + /* The PAGE_MAX_TRX_ID must be nonzero on leaf pages + of secondary indexes, and 0 on others. */ + ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src)); + } + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + new (page_zip) page_zip_des_t(*src_zip, false); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index)) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(!page_has_prev(src)) + && UNIV_LIKELY(page_has_prev(page))) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] + &= byte(~REC_INFO_MIN_REC_FLAG); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_zip_compress_write_log(block, index, mtr); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the compressed page checksum. 
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of an XOR of 3 CRC-32C values
+@return page checksum */
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler)
+{
+ uLong adler;
+ const Bytef* s = static_cast<const byte*>(data);
+
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (!use_adler) {
+ return my_crc32c(0, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ my_crc32c(0, s + FIL_PAGE_TYPE, 2)
+ ^ my_crc32c(0, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ } else {
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(
+ adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ static_cast<uInt>(size)
+ - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return(uint32_t(adler));
+ }
+}
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum is valid according to innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size)
+{
+ if (buf_is_zeroes(span<const byte>(data, size))) {
+ return true;
+ }
+
+ const uint32_t stored = mach_read_from_4(
+ data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ uint32_t calc = page_zip_calc_checksum(data, size, false);
+
+#ifdef UNIV_INNOCHECKSUM
+ extern FILE* log_file;
+ extern uint32_t cur_page_num;
+
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " checksum: calculated = " UINT32PF ";"
+ " recorded = " UINT32PF "\n", cur_page_num,
+ calc, stored);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (stored == calc) {
+ return true;
+ }
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ break;
+ default:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return true;
+ }
+
+ return stored == page_zip_calc_checksum(data, size, true);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+
+ return false;
+}
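For reference, the CRC-32C branch of page_zip_calc_checksum() above can be approximated outside the server. The sketch below is illustrative only and is not part of this patch: the hard-coded byte offsets (4, 16, 24, 34) stand in for the FIL_PAGE_OFFSET, FIL_PAGE_LSN, FIL_PAGE_TYPE and FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID constants from fil0fil.h, and the bitwise crc32c() helper stands in for my_crc32c(), so that the example stays self-contained.

// Illustrative sketch only: which byte ranges the compressed-page checksum
// covers and how the three CRC-32C values are combined (assumed offsets).
#include <cstdint>
#include <cstdio>
#include <cstddef>
#include <vector>

// Minimal bitwise CRC-32C (Castagnoli); same chained interface as my_crc32c().
static uint32_t crc32c(uint32_t crc, const uint8_t* buf, size_t len)
{
  crc = ~crc;
  while (len--) {
    crc ^= *buf++;
    for (int i = 0; i < 8; i++)
      crc = (crc >> 1) ^ (0x82F63B78U & (0U - (crc & 1U)));
  }
  return ~crc;
}

// Mirrors the !use_adler branch above: skip the stored checksum (bytes 0..3),
// FIL_PAGE_LSN (bytes 16..23) and FIL_PAGE_FILE_FLUSH_LSN (bytes 26..33),
// and XOR the CRC-32C values of the three remaining ranges.
static uint32_t zip_page_checksum(const uint8_t* page, size_t size)
{
  return crc32c(0, page + 4, 16 - 4)       // FIL_PAGE_OFFSET .. FIL_PAGE_LSN
       ^ crc32c(0, page + 24, 2)           // FIL_PAGE_TYPE
       ^ crc32c(0, page + 34, size - 34);  // space id .. end of the zip page
}

int main()
{
  std::vector<uint8_t> page(8192, 0);  // e.g. a KEY_BLOCK_SIZE=8 page image
  // A real page image would be read from an .ibd file here.
  const uint32_t calc = zip_page_checksum(page.data(), page.size());
  // The stored checksum occupies the first 4 bytes, big-endian
  // (FIL_PAGE_SPACE_OR_CHKSUM), which mach_read_from_4() reads above.
  const uint32_t stored = uint32_t(page[0]) << 24 | uint32_t(page[1]) << 16
                        | uint32_t(page[2]) << 8 | page[3];
  std::printf("calculated=%u stored=%u match=%d\n",
              unsigned(calc), unsigned(stored), int(calc == stored));
  return 0;
}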