summaryrefslogtreecommitdiffstats
path: root/storage/innobase/page/page0zip.cc
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
commit3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/page/page0zip.cc
parentInitial commit. (diff)
downloadmariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz
mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/page/page0zip.cc')
-rw-r--r--storage/innobase/page/page0zip.cc4666
1 files changed, 4666 insertions, 0 deletions
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
new file mode 100644
index 00000000..89e6d149
--- /dev/null
+++ b/storage/innobase/page/page0zip.cc
@@ -0,0 +1,4666 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.cc
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "page0page.h"
+#include "buf0checksum.h"
+#include "zlib.h"
+#include "span.h"
+
+using st_::span;
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "log0recv.h"
+#include "row0row.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "lock0lock.h"
+#include "srv0srv.h"
+#include "buf0lru.h"
+#include "srv0mon.h"
+
+#include <map>
+#include <algorithm>
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+page_zip_stat_per_index_t page_zip_stat_per_index;
+
+/** Compression level to be used by zlib. Settable by user. */
+uint page_zip_level;
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compress, we compare that the records are there, and on uncompress we
+restore the records. */
+/** Extra bytes of an infimum record */
+static const byte infimum_extra[] = {
+ 0x01, /* info_bits=0, n_owned=1 */
+ 0x00, 0x02 /* heap_no=0, status=2 */
+ /* ?, ? */ /* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record */
+static const byte infimum_data[] = {
+ 0x69, 0x6e, 0x66, 0x69,
+ 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record */
+static const byte supremum_extra_data alignas(4) [] = {
+ /* 0x0?, */ /* info_bits=0, n_owned=1..8 */
+ 0x00, 0x0b, /* heap_no=1, status=3 */
+ 0x00, 0x00, /* next=0 */
+ 0x73, 0x75, 0x70, 0x72,
+ 0x65, 0x6d, 0x75, 0x6d /* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+@param b in: memory block
+@param s in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
+/** Assert that a BLOB pointer is filled with zero bytes.
+@param b in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
+
+/* Enable some extra debugging output. This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+MY_ATTRIBUTE((format (printf, 1, 2)))
+/**********************************************************************//**
+Report a failure to decompress or compress.
+@return number of characters printed */
+static
+int
+page_zip_fail_func(
+/*===============*/
+ const char* fmt, /*!< in: printf(3) format string */
+ ...) /*!< in: arguments corresponding to fmt */
+{
+ int res;
+ va_list ap;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ va_start(ap, fmt);
+ res = vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ return(res);
+}
+/** Wrapper for page_zip_fail_func()
+@param fmt_args in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func()
+@param fmt_args ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+{
+ ulint size = zip_size
+ /* subtract the page header and the longest
+ uncompressed data needed for one record */
+ - (PAGE_DATA
+ + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE
+ + 1/* encoded heap_no==2 in page_zip_write_rec() */
+ + 1/* end of modification log */
+ - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+ /* subtract the space for page_zip_fields_encode() */
+ - compressBound(static_cast<uLong>(2 * (n_fields + 1)));
+ return(lint(size) > 0 ? size : 0);
+}
+
+/** Check whether a tuple is too big for compressed table
+@param[in] index dict index object
+@param[in] entry entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+ const dict_index_t* index,
+ const dtuple_t* entry)
+{
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* Estimate the free space of an empty compressed page.
+ Subtract one byte for the encoded heap_no in the
+ modification log. */
+ ulint free_space_zip = page_zip_empty_size(
+ index->n_fields, zip_size);
+ ulint n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(zip_size);
+
+ if (free_space_zip == 0) {
+ return(true);
+ }
+
+ /* Subtract one byte for the encoded heap_no in the
+ modification log. */
+ free_space_zip--;
+
+ /* There should be enough room for two node pointer
+ records on an empty non-leaf page. This prevents
+ infinite page splits. */
+
+ if (entry->n_fields >= n_uniq
+ && (REC_NODE_PTR_SIZE
+ + rec_get_converted_size_comp_prefix(
+ index, entry->fields, n_uniq, NULL)
+ /* On a compressed page, there is
+ a two-byte entry in the dense
+ page directory for every record.
+ But there is no record header. */
+ - (REC_N_NEW_EXTRA_BYTES - 2)
+ > free_space_zip / 2)) {
+ return(true);
+ }
+
+ return(false);
+}
+
+/*************************************************************//**
+Gets the number of elements in the dense page directory,
+including deleted records (the free list).
+@return number of elements in the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_elems(
+/*===============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ return ulint(page_dir_get_n_heap(page_zip->data))
+ - PAGE_HEAP_NO_USER_LOW;
+}
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
+}
+
+/*************************************************************//**
+Gets an offset to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return offset of the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_start_offs(
+/*====================*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint n_dense) /*!< in: directory size */
+{
+ ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
+
+ return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@param[in] n_dense number of entries in the directory
+@return pointer to the dense page directory */
+#define page_zip_dir_start_low(page_zip, n_dense) \
+ ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@return pointer to the dense page directory */
+#define page_zip_dir_start(page_zip) \
+ page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+only including user records (excluding the free list).
+@return length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * ulint(page_get_n_recs(page_zip->data));
+ ut_ad(size <= page_zip_dir_size(page_zip));
+ return(size);
+}
+
+/*************************************************************//**
+Find the slot of the given record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+ byte* slot, /*!< in: start of records */
+ byte* end, /*!< in: end of records */
+ ulint offset) /*!< in: offset of user record */
+{
+ ut_ad(slot <= end);
+
+ for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
+ if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
+ == offset) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
+ end,
+ offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+ end - page_zip_dir_user_size(page_zip),
+ offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint slot) /*!< in: slot
+ (0=first user record) */
+{
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+ return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] offset byte offset from b.zip.data
+@param[in] len length of the data to write */
+inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+ ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
+ fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
+ ut_ad(page_zip_simple_validate(&b.page.zip));
+ ut_ad(offset + len <= page_zip_get_size(&b.page.zip));
+
+ memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
+ m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+@param[in] b ROW_FORMAT=COMPRESSED index page
+@param[in] dest destination within b.zip.data
+@param[in] str the data to write
+@param[in] len length of the data to write
+@tparam w write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
+ ulint len)
+{
+ byte *d= static_cast<byte*>(dest);
+ const byte *s= static_cast<const byte*>(str);
+ ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
+ if (w != FORCED)
+ {
+ ut_ad(len);
+ const byte *const end= d + len;
+ while (*d++ == *s++)
+ {
+ if (d == end)
+ {
+ ut_ad(w == MAYBE_NOP);
+ return;
+ }
+ }
+ s--;
+ d--;
+ len= static_cast<ulint>(end - d);
+ }
+ ::memcpy(d, s, len);
+ zmemcpy(b, d - b.page.zip.data, len);
+}
+
+/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
+@param[in,out] block ROW_FORMAT=COMPRESSED index page
+@param[in] index the index that the block belongs to
+@param[in,out] mtr mini-transaction */
+static void page_zip_compress_write_log(buf_block_t *block,
+ dict_index_t *index, mtr_t *mtr)
+{
+ ut_ad(!index->is_ibuf());
+
+ if (!mtr->is_logged())
+ return;
+
+ const page_t *page= block->page.frame;
+ const page_zip_des_t *page_zip= &block->page.zip;
+ /* Read the number of user records. */
+ ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
+ PAGE_HEAP_NO_USER_LOW;
+ /* Multiply by uncompressed of size stored per record */
+ if (!page_is_leaf(page))
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ else if (index->is_clust())
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
+ DATA_ROLL_PTR_LEN;
+ else
+ trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
+ /* Add the space occupied by BLOB pointers. */
+ trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(page_zip->m_end > PAGE_DATA);
+ compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
+ ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+ mtr->init(block);
+ mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);
+
+ if (trailer_size)
+ mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
+ trailer_size);
+}
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec. */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ const rec_t* rec, /*!< in: compact physical record
+ on a B-tree leaf page */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ const page_t* page = page_align(rec);
+ ulint n_ext = 0;
+ ulint i;
+ ulint left;
+ ulint heap_no;
+ ulint n_recs = page_get_n_recs(page_zip->data);
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_is_comp(page));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ left = heap_no - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(!left)) {
+ return(0);
+ }
+
+ for (i = 0; i < n_recs; i++) {
+ const rec_t* r = page + (page_zip_dir_get(page_zip, i)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+
+ if (rec_get_heap_no_new(r) < heap_no) {
+ n_ext += rec_get_n_extern_new(r, index,
+ ULINT_UNDEFINED);
+ if (!--left) {
+ break;
+ }
+ }
+ }
+
+ return(n_ext);
+}
+
+/**********************************************************************//**
+Encode the length of a fixed-length column.
+@return buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+ byte* buf, /*!< in: pointer to buffer where to write */
+ ulint val) /*!< in: value to write */
+{
+ ut_ad(val >= 2);
+
+ if (UNIV_LIKELY(val < 126)) {
+ /*
+ 0 = nullable variable field of at most 255 bytes length;
+ 1 = not null variable field of at most 255 bytes length;
+ 126 = nullable variable field with maximum length >255;
+ 127 = not null variable field with maximum length >255
+ */
+ *buf++ = (byte) val;
+ } else {
+ *buf++ = (byte) (0x80 | val >> 8);
+ *buf++ = (byte) val;
+ }
+
+ return(buf);
+}
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields
+ to compress */
+ const dict_index_t* index, /*!< in: index comprising
+ at least n fields */
+ ulint trx_id_pos,
+ /*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */
+{
+ const byte* buf_start = buf;
+ ulint i;
+ ulint col;
+ ulint trx_id_col = 0;
+ /* sum of lengths of preceding non-nullable fixed fields, or 0 */
+ ulint fixed_sum = 0;
+
+ ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+ for (i = col = 0; i < n; i++) {
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+ ulint val;
+
+ if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+ val = 1; /* set the "not nullable" flag */
+ } else {
+ val = 0; /* nullable field */
+ }
+
+ if (!field->fixed_len) {
+ /* variable-length field */
+ const dict_col_t* column
+ = dict_field_get_col(field);
+
+ if (DATA_BIG_COL(column)) {
+ val |= 0x7e; /* max > 255 bytes */
+ }
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ *buf++ = (byte) val;
+ col++;
+ } else if (val) {
+ /* fixed-length non-nullable field */
+
+ if (fixed_sum && UNIV_UNLIKELY
+ (fixed_sum + field->fixed_len
+ > DICT_MAX_FIXED_COL_LEN)) {
+ /* Write out the length of the
+ preceding non-nullable fields,
+ to avoid exceeding the maximum
+ length of a fixed-length column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+ if (fixed_sum) {
+ /* Write out the length of any
+ preceding non-nullable fields,
+ and start a new trx_id column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ col++;
+ }
+
+ trx_id_col = col;
+ fixed_sum = field->fixed_len;
+ } else {
+ /* add to the sum */
+ fixed_sum += field->fixed_len;
+ }
+ } else {
+ /* fixed-length nullable field */
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ buf = page_zip_fixed_field_encode(
+ buf, ulint(field->fixed_len) << 1);
+ col++;
+ }
+ }
+
+ if (fixed_sum) {
+ /* Write out the lengths of last fixed-length columns. */
+ buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+ }
+
+ if (trx_id_pos != ULINT_UNDEFINED) {
+ /* Write out the position of the trx_id column */
+ i = trx_id_col;
+ } else {
+ /* Write out the number of nullable fields */
+ i = index->n_nullable;
+ }
+
+ if (i < 128) {
+ *buf++ = (byte) i;
+ } else {
+ *buf++ = (byte) (0x80 | i >> 8);
+ *buf++ = (byte) i;
+ }
+
+ ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+ return((ulint) (buf - buf_start));
+}
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+ const page_t* page, /*!< in: compact page */
+ byte* buf, /*!< in: pointer to dense page directory[-1];
+ out: dense directory on compressed page */
+ const rec_t** recs) /*!< in: pointer to an array of 0, or NULL;
+ out: dense page directory sorted by ascending
+ address (and heap_no) */
+{
+ const byte* rec;
+ ulint status;
+ ulint min_mark;
+ ulint heap_no;
+ ulint i;
+ ulint n_heap;
+ ulint offs;
+
+ min_mark = 0;
+
+ if (page_is_leaf(page)) {
+ status = REC_STATUS_ORDINARY;
+ } else {
+ status = REC_STATUS_NODE_PTR;
+ if (UNIV_UNLIKELY(!page_has_prev(page))) {
+ min_mark = REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+ n_heap = page_dir_get_n_heap(page);
+
+ /* Traverse the list of stored records in the collation order,
+ starting from the first user record. */
+
+ rec = page + PAGE_NEW_INFIMUM;
+
+ i = 0;
+
+ for (;;) {
+ ulint info_bits;
+ offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+ break;
+ }
+ rec = page + offs;
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+ ut_a(offs < srv_page_size - PAGE_DIR);
+ ut_a(offs >= PAGE_ZIP_START);
+ compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK
+ & (PAGE_ZIP_DIR_SLOT_MASK + 1)));
+ compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK
+ >= UNIV_ZIP_SIZE_MAX - 1);
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
+ offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+ }
+
+ info_bits = rec_get_info_bits(rec, TRUE);
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ info_bits &= ~REC_INFO_DELETED_FLAG;
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+ ut_a(info_bits == min_mark);
+ /* Only the smallest user record can have
+ REC_INFO_MIN_REC_FLAG set. */
+ min_mark = 0;
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ ut_a(ulint(rec_get_status(rec)) == status);
+ }
+
+ offs = page_header_get_field(page, PAGE_FREE);
+
+ /* Traverse the free list (of deleted records). */
+ while (offs) {
+ ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+ rec = page + offs;
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+
+ ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+ ut_a(ulint(rec_get_status(rec)) == status);
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ offs = rec_get_next_offs(rec, TRUE);
+ }
+
+ /* Ensure that each heap no occurs at least once. */
+ ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
+
+extern "C" {
+
+/**********************************************************************//**
+Allocate memory for zlib. */
+static
+void*
+page_zip_zalloc(
+/*============*/
+ void* opaque, /*!< in/out: memory heap */
+ uInt items, /*!< in: number of items to allocate */
+ uInt size) /*!< in: size of an item in bytes */
+{
+ return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib. */
+static
+void
+page_zip_free(
+/*==========*/
+ void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */
+ void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */
+{
+}
+
+} /* extern "C" */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ z_stream* strm = static_cast<z_stream*>(stream);
+
+ strm->zalloc = page_zip_zalloc;
+ strm->zfree = page_zip_free;
+ strm->opaque = heap;
+}
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress(). */
+static bool page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator. */
+static unsigned page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+ FILE* logfile,/*!< in: log file, or NULL */
+ z_streamp strm, /*!< in/out: compressed stream for deflate() */
+ int flush) /*!< in: deflate() flushing method */
+{
+ int status;
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ut_print_buf(stderr, strm->next_in, strm->avail_in);
+ }
+ if (UNIV_LIKELY_NULL(logfile)) {
+ if (fwrite(strm->next_in, 1, strm->avail_in, logfile)
+ != strm->avail_in) {
+ perror("fwrite");
+ }
+ }
+ status = deflate(strm, flush);
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, " -> %d\n", status);
+ }
+ return(status);
+}
+
+/* Redefine deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+@param strm in/out: compressed stream
+@param flush in: flushing method
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Only leaf nodes may contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+
+ /* Compress the data bytes, except node_ptr. */
+ c_stream->next_in = (byte*) rec;
+ c_stream->avail_in = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in, REC_NODE_PTR_SIZE);
+ c_stream->next_in += REC_NODE_PTR_SIZE;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense) /*!< in: size of recs[] */
+{
+ int err = Z_OK;
+
+ ut_ad(n_dense > 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ /* Compress everything up to this record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ MEM_CHECK_DEFINED(c_stream->next_in,
+ c_stream->avail_in);
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ c_stream->next_in = (byte*) rec;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col, /*!< in: position of of DB_TRX_ID */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ byte** externs, /*!< in/out: pointer to the next
+ available BLOB pointer */
+ ulint* n_blobs) /*!< in/out: number of
+ externally stored columns */
+{
+ int err;
+ ulint i;
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ /* Store trx_id and roll_ptr
+ in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ i++;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ /* Reserve space for the data at
+ the end of the space reserved for
+ the compressed data and the page
+ modification log. */
+
+ if (UNIV_UNLIKELY
+ (c_stream->avail_out
+ <= BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* out of space */
+ return(Z_BUF_ERROR);
+ }
+
+ ut_ad(*externs == c_stream->next_out
+ + c_stream->avail_out
+ + 1/* end of modif. log */);
+
+ c_stream->next_in
+ += BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Skip deleted records. */
+ if (UNIV_LIKELY_NULL
+ (page_zip_dir_find_low(
+ storage, deleted,
+ page_offset(rec)))) {
+ continue;
+ }
+
+ (*n_blobs)++;
+ c_stream->avail_out
+ -= BTR_EXTERN_FIELD_REF_SIZE;
+ *externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Copy the BLOB pointer */
+ memcpy(*externs, c_stream->next_in
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* n_blobs, /*!< in: 0; out: number of
+ externally stored columns */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ rec_offs* offsets = NULL;
+ /* BTR_EXTERN_FIELD_REF storage */
+ byte* externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(*n_blobs == 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_n_fields(offsets)
+ == dict_index_get_n_fields(index));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Compress the data bytes. */
+
+ c_stream->next_in = (byte*) rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, store the
+ BTR_EXTERN_FIELD_REF separately. */
+ if (rec_offs_any_extern(offsets)) {
+ ut_ad(dict_index_is_clust(index));
+
+ err = page_zip_compress_clust_ext(
+ LOGFILE
+ c_stream, rec, offsets, trx_id_col,
+ deleted, storage, &externs, n_blobs);
+
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ } else {
+ ulint len;
+ const byte* src;
+
+ /* Store trx_id and roll_ptr in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+ }
+
+ /* Compress the last bytes of the record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec + rec_offs_data_size(offsets) - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ } while (--n_dense);
+
+func_exit:
+ return(err);}
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+ buf_block_t* block, /*!< in/out: buffer block */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: commpression level */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ z_stream c_stream;
+ int err;
+ byte* fields; /*!< index field information */
+ byte* buf; /*!< compressed payload of the
+ page */
+ byte* buf_end; /* end of buf */
+ ulint n_dense;
+ ulint slot_size; /* amount of uncompressed bytes
+ per record */
+ const rec_t** recs; /*!< dense page directory,
+ sorted by address */
+ mem_heap_t* heap;
+ ulint trx_id_col = ULINT_UNDEFINED;
+ ulint n_blobs = 0;
+ byte* storage; /* storage of uncompressed
+ columns */
+ const ulonglong ns = my_interval_timer();
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ FILE* logfile = NULL;
+#endif
+ /* A local copy of srv_cmp_per_index_enabled to avoid reading that
+ variable multiple times in this function since it can be changed at
+ anytime. */
+ my_bool cmp_per_index_enabled;
+ cmp_per_index_enabled = srv_cmp_per_index_enabled;
+
+ page_t* page = block->page.frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_a(page_is_comp(page));
+ ut_a(fil_page_index_page_check(page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ /* Check the data that will be omitted. */
+ ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra));
+ ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+ infimum_data, sizeof infimum_data));
+ ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+ /* info_bits == 0, n_owned <= max */
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data));
+
+ if (page_is_empty(page)) {
+ ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+ == PAGE_NEW_SUPREMUM);
+ }
+
+ const ulint n_fields = page_is_leaf(page)
+ ? dict_index_get_n_fields(index)
+ : dict_index_get_n_unique_in_tree_nonleaf(index);
+ index_id_t ind_id = index->id;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ib::info() << "compress "
+ << static_cast<void*>(page_zip) << " "
+ << static_cast<const void*>(page) << " "
+ << page_is_leaf(page) << " "
+ << n_fields << " " << n_dense;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_compress_log)) {
+ /* Create a log file for every compression attempt. */
+ char logfilename[9];
+ snprintf(logfilename, sizeof logfilename,
+ "%08x", page_zip_compress_log++);
+ logfile = fopen(logfilename, "wb");
+
+ if (logfile) {
+ /* Write the uncompressed page to the log. */
+ if (fwrite(page, 1, srv_page_size, logfile)
+ != srv_page_size) {
+ perror("fwrite");
+ }
+ /* Record the compressed size as zero.
+ This will be overwritten at successful exit. */
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ }
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ page_zip_stat[page_zip->ssize - 1].compressed++;
+ if (cmp_per_index_enabled) {
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed++;
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+ }
+
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+
+ goto err_exit;
+ }
+
+ MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
+ heap = mem_heap_create(page_zip_get_size(page_zip)
+ + n_fields * (2 + sizeof(ulint))
+ + REC_OFFS_HEADER_SIZE
+ + n_dense * ((sizeof *recs)
+ - PAGE_ZIP_DIR_SLOT_SIZE)
+ + srv_page_size * 4
+ + (512 << MAX_MEM_LEVEL));
+
+ recs = static_cast<const rec_t**>(
+ mem_heap_zalloc(heap, n_dense * sizeof *recs));
+
+ fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
+
+ buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+ /* Compress the data payload. */
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, static_cast<int>(level),
+ Z_DEFLATED, static_cast<int>(srv_page_size_shift),
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+
+ c_stream.next_out = buf;
+
+ /* Subtract the space reserved for uncompressed data. */
+ /* Page header and the end marker of the modification log */
+ c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);
+
+ /* Dense page directory and uncompressed columns, if any */
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ trx_id_col = index->db_trx_id();
+
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ } else {
+ /* Signal the absence of trx_id
+ in page_zip_fields_encode() */
+ trx_id_col = 0;
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ } else {
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+ + 6/* sizeof(zlib header and footer) */)) {
+ goto zlib_error;
+ }
+
+ c_stream.avail_out -= uInt(n_dense * slot_size);
+ c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index,
+ trx_id_col, fields));
+ c_stream.next_in = fields;
+
+ if (UNIV_LIKELY(!trx_id_col)) {
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FULL_FLUSH);
+ if (err != Z_OK) {
+ goto zlib_error;
+ }
+
+ ut_ad(!c_stream.avail_in);
+
+ page_zip_dir_encode(page, buf_end, recs);
+
+ c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+ storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ /* Compress the records in heap_no order. */
+ if (UNIV_UNLIKELY(!n_dense)) {
+ } else if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ err = page_zip_compress_node_ptrs(LOGFILE
+ &c_stream, recs, n_dense,
+ index, storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ err = page_zip_compress_sec(LOGFILE
+ &c_stream, recs, n_dense);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ err = page_zip_compress_clust(LOGFILE
+ &c_stream, recs, n_dense,
+ index, &n_blobs, trx_id_col,
+ buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page),
+ storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ }
+
+ /* Finish the compression. */
+ ut_ad(!c_stream.avail_in);
+ /* Compress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list,
+ or the data of the last record from page_zip_compress_sec(). */
+ c_stream.avail_in = static_cast<uInt>(
+ page_header_get_field(page, PAGE_HEAP_TOP)
+ - (c_stream.next_in - page));
+ ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR);
+
+ MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FINISH);
+
+ if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error:
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+err_exit:
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ if (page_is_leaf(page)) {
+ dict_index_zip_failure(index);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec
+ += time_diff;
+ if (cmp_per_index_enabled) {
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_usec
+ += time_diff;
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+ }
+ return false;
+ }
+
+ err = deflateEnd(&c_stream);
+ ut_a(err == Z_OK);
+
+ ut_ad(buf + c_stream.total_out == c_stream.next_out);
+ ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+#if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
+ /* Valgrind believes that zlib does not initialize some bits
+ in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
+ MEM_MAKE_DEFINED(buf, c_stream.total_out);
+#endif /* HAVE_valgrind && !memory_sanitizer */
+
+ /* Zero out the area reserved for the modification log.
+ Space for the end marker of the modification log is not
+ included in avail_out. */
+ memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start =
+#endif /* UNIV_DEBUG */
+ page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
+ page_zip->m_nonempty = FALSE;
+ page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
+ /* Copy those header fields that will not be written
+ in buf_flush_init_for_writing() */
+ memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2);
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA);
+ /* Copy the rest of the compressed page */
+ memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+ mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_zip_compress_write_log(block, index, mtr);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ /* Record the compressed size of the block. */
+ byte sz[4];
+ mach_write_to_4(sz, c_stream.total_out);
+ fseek(logfile, srv_page_size, SEEK_SET);
+ if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) {
+ perror("fwrite");
+ }
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+ if (cmp_per_index_enabled) {
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[ind_id].compressed_ok++;
+ page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+ }
+
+ if (page_is_leaf(page)) {
+ dict_index_zip_success(index);
+ }
+
+ return true;
+}
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+ dict_index_t* index) /*!< in: dummy index to be freed */
+{
+ if (index) {
+ dict_table_t* table = index->table;
+ index->zip_pad.mutex.~mutex();
+ mem_heap_free(index->heap);
+
+ dict_mem_table_free(table);
+ }
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+ const byte* buf, /*!< in: index information */
+ const byte* end, /*!< in: end of buf */
+ ulint* trx_id_col,/*!< in: NULL for non-leaf pages;
+ for leaf pages, pointer to where to store
+ the position of the trx_id column */
+ bool is_spatial)/*< in: is spatial index or not */
+{
+ const byte* b;
+ ulint n;
+ ulint i;
+ ulint val;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ /* Determine the number of fields. */
+ for (b = buf, n = 0; b < end; n++) {
+ if (*b++ & 0x80) {
+ b++; /* skip the second byte */
+ }
+ }
+
+ n--; /* n_nullable or trx_id */
+
+ if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+ page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+ (ulong) n));
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(b > end)) {
+
+ page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+ (const void*) b, (const void*) end));
+ return(NULL);
+ }
+
+ table = dict_table_t::create({C_STRING_WITH_LEN("ZIP_DUMMY")},
+ nullptr, n, 0, DICT_TF_COMPACT, 0);
+ index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
+ index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ /* Initialize the fields. */
+ for (b = buf, i = 0; i < n; i++) {
+ ulint mtype;
+ ulint len;
+
+ val = *b++;
+
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ /* fixed length > 62 bytes */
+ val = (val & 0x7f) << 8 | *b++;
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ } else if (UNIV_UNLIKELY(val >= 126)) {
+ /* variable length with max > 255 bytes */
+ len = 0x7fff;
+ mtype = DATA_BINARY;
+ } else if (val <= 1) {
+ /* variable length with max <= 255 bytes */
+ len = 0;
+ mtype = DATA_BINARY;
+ } else {
+ /* fixed length < 62 bytes */
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ }
+
+ dict_mem_table_add_col(table, NULL, NULL, mtype,
+ val & 1 ? DATA_NOT_NULL : 0, len);
+ dict_index_add_col(index, table,
+ dict_table_get_nth_col(table, i), 0);
+ }
+
+ val = *b++;
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ val = (val & 0x7f) << 8 | *b++;
+ }
+
+ /* Decode the position of the trx_id column. */
+ if (trx_id_col) {
+ if (!val) {
+ val = ULINT_UNDEFINED;
+ } else if (UNIV_UNLIKELY(val >= n)) {
+fail:
+ page_zip_fields_free(index);
+ return NULL;
+ } else {
+ index->type = DICT_CLUSTERED;
+ }
+
+ *trx_id_col = val;
+ } else {
+ /* Decode the number of nullable fields. */
+ if (UNIV_UNLIKELY(index->n_nullable > val)) {
+ goto fail;
+ } else {
+ index->n_nullable = static_cast<unsigned>(val)
+ & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+
+ /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
+ index->n_core_fields = index->n_fields;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+ ut_ad(b == end);
+
+ if (is_spatial) {
+ index->type |= DICT_SPATIAL;
+ }
+
+ return(index);
+}
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return TRUE on success, FALSE on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ibool
+page_zip_dir_decode(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ page_t* page, /*!< in: compact page with valid header;
+ out: trailer and sparse page directory
+ filled in */
+ rec_t** recs, /*!< out: dense page directory sorted by
+ ascending address (and heap_no) */
+ ulint n_dense)/*!< in: number of user records, and
+ size of recs[] */
+{
+ ulint i;
+ ulint n_recs;
+ byte* slot;
+
+ n_recs = page_get_n_recs(page);
+
+ if (UNIV_UNLIKELY(n_recs > n_dense)) {
+ page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+ (ulong) n_recs, (ulong) n_dense));
+ return(FALSE);
+ }
+
+ /* Traverse the list of stored records in the sorting order,
+ starting from the first user record. */
+
+ slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+ UNIV_PREFETCH_RW(slot);
+
+ /* Zero out the page trailer. */
+ memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+ mach_write_to_2(slot, PAGE_NEW_INFIMUM);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+
+ /* Initialize the sparse directory and copy the dense directory. */
+ for (i = 0; i < n_recs; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+ mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+ }
+
+ if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+ < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_recs,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ {
+ const page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ page, page_dir_get_n_slots(page) - 1U);
+
+ if (UNIV_UNLIKELY(slot != last_slot)) {
+ page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+ (const void*) slot,
+ (const void*) last_slot));
+ return(FALSE);
+ }
+ }
+
+ /* Copy the rest of the dense directory. */
+ for (; i < n_dense; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+ page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_dense,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + offs;
+ }
+
+ std::sort(recs, recs + n_dense);
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ page_t* page, /*!< in/out: uncompressed page */
+ ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+ ulint n;
+ ulint i;
+ ulint n_owned = 1;
+ ulint offs;
+ rec_t* rec;
+
+ n = page_get_n_recs(page);
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (i = 0; i < n; i++) {
+ offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
+ info_bits |= REC_INFO_DELETED_FLAG;
+ }
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+ info_bits |= n_owned;
+ n_owned = 1;
+ } else {
+ n_owned++;
+ }
+ offs &= PAGE_ZIP_DIR_SLOT_MASK;
+ if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+ + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_set_extra_bytes 1:"
+ " %u %u %lx\n",
+ (unsigned) i, (unsigned) n,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec_set_next_offs_new(rec, offs);
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+ info_bits = 0;
+ }
+
+ /* Set the next pointer of the last user record. */
+ rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+ /* Set n_owned of the supremum record. */
+ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
+
+ if (i >= n) {
+ if (UNIV_LIKELY(i == n)) {
+ return(TRUE);
+ }
+
+ page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+ (unsigned) i, (unsigned) n));
+ return(FALSE);
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+
+ /* Set the extra bytes of deleted records on the free list. */
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col, /*!< in: position of of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, ulint(dst - next_out));
+ data += ulint(dst - next_out);
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += ulint(dst - next_out)
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = ulint(rec_get_end(rec, offsets) - next_out);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+@return pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+ const byte* data, /*!< in: modification log */
+ ulint size, /*!< in: maximum length of the log, in bytes */
+ rec_t** recs, /*!< in: dense page directory,
+ sorted by address (indexed by
+ heap_no - PAGE_HEAP_NO_USER_LOW) */
+ ulint n_dense,/*!< in: size of recs[] */
+ ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */
+ ulint trx_id_col,/*!< in: column number of trx_id in the index,
+ or ULINT_UNDEFINED if none */
+ ulint heap_status,
+ /*!< in: heap_no and status bits for
+ the next record to uncompress */
+ dict_index_t* index, /*!< in: index of the page */
+ rec_offs* offsets)/*!< in/out: work area for
+ rec_get_offsets_reverse() */
+{
+ const byte* const end = data + size;
+
+ for (;;) {
+ ulint val;
+ rec_t* rec;
+ ulint len;
+ ulint hs;
+
+ val = *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ return(data - 1);
+ }
+ if (val & 0x80) {
+ val = (val & 0x7f) << 8 | *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " invalid val %x%x\n",
+ data[-2], data[-1]));
+ return(NULL);
+ }
+ }
+ if (UNIV_UNLIKELY(data >= end)) {
+ page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+ (const void*) data,
+ (const void*) end));
+ return(NULL);
+ }
+ if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+ page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+ (ulong) val, (ulong) n_dense));
+ return(NULL);
+ }
+
+ /* Determine the heap number and status bits of the record. */
+ rec = recs[(val >> 1) - 1];
+
+ hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+ hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+ /* This may either be an old record that is being
+ overwritten (updated in place, or allocated from
+ the free list), or a new record, with the next
+ available_heap_no. */
+ if (UNIV_UNLIKELY(hs > heap_status)) {
+ page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+ (ulong) hs, (ulong) heap_status));
+ return(NULL);
+ } else if (hs == heap_status) {
+ /* A new record was allocated from the heap. */
+ if (UNIV_UNLIKELY(val & 1)) {
+ /* Only existing records may be cleared. */
+ page_zip_fail(("page_zip_apply_log:"
+ " attempting to create"
+ " deleted rec %lu\n",
+ (ulong) hs));
+ return(NULL);
+ }
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+ if (val & 1) {
+ /* Clear the data bytes of the record. */
+ mem_heap_t* heap = NULL;
+ rec_offs* offs;
+ offs = rec_get_offsets(rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+ memset(rec, 0, rec_offs_data_size(offs));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ continue;
+ }
+
+ compile_time_assert(REC_STATUS_NODE_PTR == TRUE);
+ rec_get_offsets_reverse(data, index,
+ hs & REC_STATUS_NODE_PTR,
+ offsets);
+ /* Silence a debug assertion in rec_offs_make_valid().
+ This will be overwritten in page_zip_set_extra_bytes(),
+ called by page_zip_decompress_low(). */
+ ut_d(rec[-REC_NEW_INFO_BITS] = 0);
+ rec_offs_make_valid(rec, index, n_core != 0, offsets);
+
+ /* Copy the extra bytes (backwards). */
+ {
+ byte* start = rec_get_start(rec, offsets);
+ byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+ while (b != start) {
+ *--b = *data++;
+ }
+ }
+
+ /* Copy the data bytes. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ /* Non-leaf nodes should not contain any
+ externally stored columns. */
+ if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " %lu&REC_STATUS_NODE_PTR\n",
+ (ulong) hs));
+ return(NULL);
+ }
+
+ data = page_zip_apply_log_ext(
+ rec, offsets, trx_id_col, data, end);
+
+ if (UNIV_UNLIKELY(!data)) {
+ return(NULL);
+ }
+ } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ len = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ /* Copy the data bytes, except node_ptr. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " node_ptr %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(rec, data, len);
+ data += len;
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ len = rec_offs_data_size(offsets);
+
+ /* Copy all data bytes of
+ a record in a secondary index. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " sec %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(rec, data, len);
+ data += len;
+ } else {
+ /* Skip DB_TRX_ID and DB_ROLL_PTR. */
+ ulint l = rec_get_nth_field_offs(offsets,
+ trx_id_col, &len);
+ byte* b;
+
+ if (UNIV_UNLIKELY(data + l >= end)
+ || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN))) {
+ page_zip_fail(("page_zip_apply_log:"
+ " trx_id %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) l,
+ (const void*) end));
+ return(NULL);
+ }
+
+ /* Copy any preceding data bytes. */
+ memcpy(rec, data, l);
+ data += l;
+
+ /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ len = ulint(rec_get_end(rec, offsets) - b);
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " clust %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(b, data, len);
+ data += len;
+ }
+ }
+}
+
+/**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream.
+@return TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ ulint& heap_status) /*!< in/out: heap_no and status bits */
+{
+ if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+ /* n_dense has grown since the page was last compressed. */
+ return(FALSE);
+ }
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ ulint heap_status = REC_STATUS_NODE_PTR
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+ const byte* storage;
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(
+ n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Decompress the data bytes, except node_ptr. */
+ d_stream->avail_out =static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ /* Clear the node pointer in case the record
+ will be deleted and the space will be reallocated
+ to a smaller record. */
+ memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+ d_stream->next_out += REC_NODE_PTR_SIZE;
+
+ ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in);
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense, 0,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index))
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " %lu + %lu >= %lu, %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, dict_index_is_clust(index)),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip),
+ (ulong) dict_index_is_clust(index)));
+ return(FALSE);
+ }
+
+ /* Restore the uncompressed columns in heap_no order. */
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ offsets = rec_get_offsets(rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ storage -= REC_NODE_PTR_SIZE;
+
+ memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+ storage, REC_NODE_PTR_SIZE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ rec_offs* offsets) /*!< in/out: temporary offsets */
+{
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+
+ ut_a(!dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uint>(
+ n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ /* Decompress everything up to this record. */
+ d_stream->avail_out = static_cast<uint>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ if (UNIV_LIKELY(d_stream->avail_out)) {
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+ }
+
+ /* Decompress the data of the last record and any trailing garbage,
+ in case the last record was allocated from an originally longer space
+ on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col) /*!< in: position of of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ rec_offs* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(n_dense)
+ * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out =static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_clust_ext(
+ d_stream, rec, offsets, trx_id_col))) {
+
+ goto zlib_error;
+ }
+ } else {
+ /* Skip trx_id and roll_ptr */
+ ulint len;
+ byte* dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " len = %lu\n", (ulong) len));
+ goto zlib_error;
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ }
+
+ /* Decompress the last bytes of the record. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec_get_end(rec, offsets) - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ ulint(page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page)
+ - 1U)
+ - d_stream->next_out));
+ }
+
+ ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ index->n_fields,
+ trx_id_col, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, TRUE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Restore the uncompressed columns in heap_no order. */
+
+ for (slot = 0; slot < n_dense; slot++) {
+ ulint i;
+ ulint len;
+ byte* dst;
+ rec_t* rec = recs[slot];
+ bool exists = !page_zip_dir_find_free(
+ page_zip, page_offset(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
+ ULINT_UNDEFINED, &heap);
+
+ dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ memcpy(dst, storage,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Check if there are any externally stored
+ columns in this record. For each externally
+ stored column, restore or clear the
+ BTR_EXTERN_FIELD_REF. */
+ if (!rec_offs_any_extern(offsets)) {
+ continue;
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " %lu < 20\n",
+ (ulong) len));
+ return(FALSE);
+ }
+
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_LIKELY(exists)) {
+ /* Existing record:
+ restore the BLOB pointer */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY
+ (externs < page_zip->data
+ + page_zip->m_end)) {
+ page_zip_fail(("page_zip_"
+ "decompress_clust:"
+ " %p < %p + %lu\n",
+ (const void*) externs,
+ (const void*)
+ page_zip->data,
+ (ulong)
+ page_zip->m_end));
+ return(FALSE);
+ }
+
+ memcpy(dst, externs,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ page_zip->n_blobs++;
+ } else {
+ /* Deleted record:
+ clear the BLOB pointer */
+ memset(dst, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_low(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ z_stream d_stream;
+ dict_index_t* index = NULL;
+ rec_t** recs; /*!< dense page directory, sorted by address */
+ ulint n_dense;/* number of user records on the page */
+ ulint trx_id_col = ULINT_UNDEFINED;
+ mem_heap_t* heap;
+ rec_offs* offsets;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+ (ulong) n_dense,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size);
+
+ recs = static_cast<rec_t**>(
+ mem_heap_alloc(heap, n_dense * sizeof *recs));
+
+ if (all) {
+ /* Copy the page header. */
+ memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
+ } else {
+ /* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(FIL_PAGE_TYPE + page,
+ FIL_PAGE_TYPE + page_zip->data,
+ PAGE_HEADER - FIL_PAGE_TYPE));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+ PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+ PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ /* Copy the mutable parts of the page header. */
+ memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
+ memcpy_aligned<2>(PAGE_HEADER + page,
+ PAGE_HEADER + page_zip->data,
+ PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the page headers match after copying. */
+ ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ /* Clear the uncompressed page, except the header. */
+ memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+ MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA);
+
+ /* Copy the page directory. */
+ if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+ n_dense))) {
+zlib_error:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ /* Copy the infimum and supremum records. */
+ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra);
+ if (page_is_empty(page)) {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ page_zip_dir_get(page_zip, 0)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+ memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+ memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
+ + page, supremum_extra_data,
+ sizeof supremum_extra_data);
+
+ page_zip_set_alloc(&d_stream, heap);
+
+ d_stream.next_in = page_zip->data + PAGE_DATA;
+ /* Subtract the space reserved for
+ the page header and the end marker of the modification log. */
+ d_stream.avail_in = static_cast<uInt>(
+ page_zip_get_size(page_zip) - (PAGE_DATA + 1));
+ d_stream.next_out = page + PAGE_ZIP_START;
+ d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);
+
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
+ != Z_OK)) {
+ ut_error;
+ }
+
+ /* Decode the zlib header and the index information. */
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ index = page_zip_fields_decode(
+ page + PAGE_ZIP_START, d_stream.next_out,
+ page_is_leaf(page) ? &trx_id_col : NULL,
+ fil_page_get_type(page) == FIL_PAGE_RTREE);
+
+ if (UNIV_UNLIKELY(!index)) {
+
+ goto zlib_error;
+ }
+
+ /* Decompress the user records. */
+ page_zip->n_blobs = 0;
+ d_stream.next_out = page + PAGE_ZIP_START;
+
+ {
+ /* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, n * sizeof(ulint)));
+
+ rec_offs_set_n_alloc(offsets, n);
+ }
+
+ /* Decompress the records in heap_no order. */
+ if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ ulint info_bits;
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+ recs, n_dense, index,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG;
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+ info_bits))) {
+ goto err_exit;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+ recs, n_dense,
+ index, offsets))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+err_exit:
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+ &d_stream, recs,
+ n_dense, index,
+ trx_id_col,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+ goto err_exit;
+ }
+ }
+
+ ut_a(page_is_comp(page));
+ MEM_CHECK_DEFINED(page, srv_page_size);
+
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ const ulonglong ns = my_interval_timer();
+
+ if (!page_zip_decompress_low(page_zip, page, all)) {
+ return(FALSE);
+ }
+
+ const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
+ page_zip_stat[page_zip->ssize - 1].decompressed++;
+ page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (srv_cmp_per_index_enabled) {
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index_id].decompressed++;
+ page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+ }
+
+ /* Update the stat counter for LRU policy. */
+ buf_LRU_stat_inc_unzip();
+
+ MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Dump a block of memory on the standard error stream. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+ const char* name, /*!< in: name of the data structure */
+ const void* buf, /*!< in: data */
+ ulint size) /*!< in: length of the data, in bytes */
+{
+ const byte* s = static_cast<const byte*>(buf);
+ ulint addr;
+ const ulint width = 32; /* bytes per line */
+
+ fprintf(stderr, "%s:\n", name);
+
+ for (addr = 0; addr < size; addr += width) {
+ ulint i;
+
+ fprintf(stderr, "%04lx ", (ulong) addr);
+
+ i = ut_min(width, size - addr);
+
+ while (i--) {
+ fprintf(stderr, "%02x", *s++);
+ }
+
+ putc('\n', stderr);
+ }
+}
+
+/** Dump a block of memory on the standard error stream.
+@param buf in: data
+@param size in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
+
+/** Flag: make page_zip_validate() compare page headers only */
+bool page_zip_validate_header_only;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+{
+ page_zip_des_t temp_page_zip;
+ ibool valid;
+
+ if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV)
+ || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+ || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_ROOT_AUTO_INC)
+ /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch
+ on the clustered index root page (page number 3 in .ibd files).
+ That allows concurrent readers (holding buf_block_t::lock S-latch).
+ Because we do not know what type of a latch our caller is holding,
+ we will ignore the field on clustered index root pages in order
+ to avoid false positives. */
+ || (page_get_page_no(page) != 3/* clustered index root page */
+ && memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC],
+ &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8))
+ || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
+ PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) {
+ page_zip_fail(("page_zip_validate: page header\n"));
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+
+ if (page_zip_validate_header_only) {
+ return(TRUE);
+ }
+
+ /* page_zip_decompress() expects the uncompressed page to be
+ srv_page_size aligned. */
+ page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
+ srv_page_size));
+
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ temp_page_zip = *page_zip;
+ valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
+ if (!valid) {
+ fputs("page_zip_validate(): failed to decompress\n", stderr);
+ goto func_exit;
+ }
+ if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+ page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+ page_zip->n_blobs, temp_page_zip.n_blobs));
+ valid = FALSE;
+ }
+#ifdef UNIV_DEBUG
+ if (page_zip->m_start != temp_page_zip.m_start) {
+ page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+ page_zip->m_start, temp_page_zip.m_start));
+ valid = FALSE;
+ }
+#endif /* UNIV_DEBUG */
+ if (page_zip->m_end != temp_page_zip.m_end) {
+ page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+ page_zip->m_end, temp_page_zip.m_end));
+ valid = FALSE;
+ }
+ if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+ page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+ page_zip->m_nonempty,
+ temp_page_zip.m_nonempty));
+ valid = FALSE;
+ }
+ if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+ /* In crash recovery, the "minimum record" flag may be
+ set incorrectly until the mini-transaction is
+ committed. Let us tolerate that difference when we
+ are performing a sloppy validation. */
+
+ rec_offs* offsets;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ const rec_t* trec;
+ byte info_bits_diff;
+ ulint offset
+ = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
+ ut_a(offset >= PAGE_NEW_SUPREMUM);
+ offset -= 5/*REC_NEW_INFO_BITS*/;
+
+ info_bits_diff = page[offset] ^ temp_page[offset];
+
+ if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+ temp_page[offset] = page[offset];
+
+ if (!memcmp(page + PAGE_HEADER,
+ temp_page + PAGE_HEADER,
+ srv_page_size - PAGE_HEADER
+ - FIL_PAGE_DATA_END)) {
+
+ /* Only the minimum record flag
+ differed. Let us ignore it. */
+ page_zip_fail(("page_zip_validate:"
+ " min_rec_flag"
+ " (%s" UINT32PF "," UINT32PF
+ ",0x%02x)\n",
+ sloppy ? "ignored, " : "",
+ page_get_space_id(page),
+ page_get_page_no(page),
+ page[offset]));
+ /* We don't check for spatial index, since
+ the "minimum record" could be deleted when
+ doing rtr_update_mbr_field.
+ GIS_FIXME: need to validate why
+ rtr_update_mbr_field.() could affect this */
+ if (index && dict_index_is_spatial(index)) {
+ valid = true;
+ } else {
+ valid = sloppy;
+ }
+ goto func_exit;
+ }
+ }
+
+ /* Compare the pointers in the PAGE_FREE list. */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+ trec = page_header_get_ptr(temp_page, PAGE_FREE);
+
+ while (rec || trec) {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " PAGE_FREE list: %u!=%u\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ }
+
+ /* Compare the records. */
+ heap = NULL;
+ offsets = NULL;
+ rec = page_rec_get_next_low(
+ page + PAGE_NEW_INFIMUM, TRUE);
+ trec = page_rec_get_next_low(
+ temp_page + PAGE_NEW_INFIMUM, TRUE);
+ const ulint n_core = (index && page_is_leaf(page))
+ ? index->n_fields : 0;
+
+ do {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate:"
+ " record list: 0x%02x!=0x%02x\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ break;
+ }
+
+ if (index) {
+ /* Compare the data. */
+ offsets = rec_get_offsets(
+ rec, index, offsets, n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (memcmp(rec - rec_offs_extra_size(offsets),
+ trec - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets))) {
+ page_zip_fail(
+ ("page_zip_validate:"
+ " record content: 0x%02x",
+ (unsigned) page_offset(rec)));
+ valid = FALSE;
+ break;
+ }
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ } while (rec || trec);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ }
+
+func_exit:
+ if (!valid) {
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, srv_page_size);
+ page_zip_hexdump(temp_page, srv_page_size);
+ }
+ aligned_free(temp_page);
+ return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+{
+ return(page_zip_validate_low(page_zip, page, index,
+ recv_recovery_is_on()));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+@return TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const byte* page) /*!< in: uncompressed page */
+{
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns. The data must already have been written to the uncompressed page.
+@return end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ const dict_index_t*index, /*!< in: record descriptor */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ulint create, /*!< in: nonzero=insert, zero=update */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ ulint heap_no, /*!< in: heap number of rec */
+ byte* storage, /*!< in: end of dense page directory */
+ byte* data, /*!< in: end of modification log */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* start = rec;
+ ulint i;
+ ulint len;
+ byte* externs = storage;
+ ulint n_ext = rec_offs_n_extern(offsets);
+ const page_t* const page = block->page.frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+ /* Note that this will not take into account
+ the BLOB columns of rec if create==TRUE. */
+ ut_ad(data + rec_offs_data_size(offsets)
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ - n_ext * FIELD_REF_SIZE
+ < externs - FIELD_REF_SIZE * page_zip->n_blobs);
+
+ if (n_ext) {
+ ulint blob_no = page_zip_get_n_prev_extern(
+ page_zip, rec, index);
+ byte* ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
+ ut_ad(blob_no <= page_zip->n_blobs);
+ externs -= blob_no * FIELD_REF_SIZE;
+
+ if (create) {
+ page_zip->n_blobs = (page_zip->n_blobs + n_ext)
+ & ((1U << 12) - 1);
+ ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
+ if (ulint len = ulint(externs - ext_end)) {
+ byte* ext_start = ext_end
+ - n_ext * FIELD_REF_SIZE;
+ memmove(ext_start, ext_end, len);
+ mtr->memmove(*block,
+ ext_start - page_zip->data,
+ ext_end - page_zip->data, len);
+ }
+ }
+
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i));
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i + 1));
+ /* Locate trx_id and roll_ptr. */
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len = DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+ i++; /* skip also roll_ptr */
+ mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(len >= FIELD_REF_SIZE);
+ src += len - FIELD_REF_SIZE;
+
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, ulint(src - start));
+ data += src - start;
+ start = src + FIELD_REF_SIZE;
+
+ /* Store the BLOB pointer. */
+ externs -= FIELD_REF_SIZE;
+ ut_ad(data < externs);
+ memcpy(externs, src, FIELD_REF_SIZE);
+ mtr->zmemcpy(*block, externs - page_zip->data,
+ FIELD_REF_SIZE);
+ }
+ }
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets) - ulint(start - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, start, len);
+ data += len;
+
+ return(data);
+}
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in] rec record in the uncompressed page
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] create nonzero=insert, zero=update
+@param[in,out] mtr mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ ulint create, mtr_t *mtr)
+{
+ const page_t* const page = block->page.frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ byte* data;
+ byte* storage;
+ ulint heap_no;
+ byte* slot;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+
+ ut_ad(page_zip_header_cmp(page_zip, page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ byte s = *slot;
+ /* Copy the delete mark. */
+ if (rec_get_deleted_flag(rec, TRUE)) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing undo log record.
+ On non-leaf pages, the delete-mark flag is garbage. */
+ ut_ad(!index->is_primary() || !page_is_leaf(page)
+ || row_get_rec_trx_id(rec, index, offsets));
+ s |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+ } else {
+ s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ }
+
+ if (s != *slot) {
+ *slot = s;
+ mtr->zmemcpy(*block, slot - page_zip->data, 1);
+ }
+
+ ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+ ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size
+ - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+ * page_dir_get_n_slots(page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+ ut_ad(heap_no < page_dir_get_n_heap(page));
+
+ /* Append to the modification log. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+
+ /* Identify the record by writing its heap number - 1.
+ 0 is reserved to indicate the end of the modification log. */
+
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1);
+ ut_ad(!*data);
+
+ {
+ const byte* start = rec - rec_offs_extra_size(offsets);
+ const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+
+ /* Write the extra bytes backwards, so that
+ rec_offs_extra_size() can be easily computed in
+ page_zip_apply_log() by invoking
+ rec_get_offsets_reverse(). */
+
+ while (b != start) {
+ *data++ = *--b;
+ ut_ad(!*data);
+ }
+ }
+
+ /* Write the data bytes. Store the uncompressed bytes separately. */
+ storage = page_zip_dir_start(page_zip);
+
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ /* Store separately trx_id, roll_ptr and
+ the BTR_EXTERN_FIELD_REF of each BLOB column. */
+ if (rec_offs_any_extern(offsets)) {
+ data = page_zip_write_rec_ext(
+ block,
+ rec, index, offsets, create,
+ index->db_trx_id(), heap_no,
+ storage, data, mtr);
+ } else {
+ /* Locate trx_id and roll_ptr. */
+ ulint len;
+ const byte* src
+ = rec_get_nth_field(rec, offsets,
+ index->db_trx_id(),
+ &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ index->db_roll_ptr(), &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - rec);
+ memcpy(data, rec, ulint(src - rec));
+ data += src - rec;
+
+ /* Store trx_id and roll_ptr. */
+ constexpr ulint sys_len
+ = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ byte* sys = storage - sys_len * (heap_no - 1);
+ memcpy(sys, src, sys_len);
+
+ src += sys_len;
+ mtr->zmemcpy(*block, sys - page_zip->data,
+ sys_len);
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets)
+ - ulint(src - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, src, len);
+ data += len;
+ }
+ } else {
+ /* Leaf page of a secondary index:
+ no externally stored columns */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Log the entire record. */
+ ulint len = rec_offs_data_size(offsets);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+ }
+ } else {
+ /* This is a node pointer page. */
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Copy the data bytes, except node_ptr. */
+ ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+ ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+
+ /* Copy the node pointer to the uncompressed area. */
+ byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
+ rec + len, REC_NODE_PTR_SIZE);
+ }
+
+ ut_a(!*data);
+ ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+ mtr->zmemcpy(*block, page_zip->m_end,
+ data - page_zip->data - page_zip->m_end);
+ page_zip->m_end = uint16_t(data - page_zip->data);
+ page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec), index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const byte* field;
+ byte* externs;
+ const page_t* const page = block->page.frame;
+ page_zip_des_t* const page_zip = &block->page.zip;
+ ulint blob_no;
+ ulint len;
+
+ ut_ad(page_align(rec) == page);
+ ut_ad(index != NULL);
+ ut_ad(offsets != NULL);
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(rec_offs_nth_extern(offsets, n));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+ + rec_get_n_extern_new(rec, index, n);
+ ut_a(blob_no < page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+
+ field = rec_get_nth_field(rec, offsets, n, &len);
+
+ externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+ field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ byte* field;
+ byte* storage;
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->page.frame);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(page_rec_is_comp(rec));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(!page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, size);
+
+ storage = page_zip_dir_start(page_zip)
+ - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+ field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ compile_time_assert(REC_NODE_PTR_SIZE == 4);
+ mach_write_to_4(field, ptr);
+ mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
+}
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out] block ROW_FORMAT=COMPRESSED page
+@param[in,out] rec record
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields)
+@param[in] trx_id DB_TRX_ID value (transaction identifier)
+@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer)
+@param[in,out] mtr mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+ buf_block_t* block,
+ byte* rec,
+ const rec_offs* offsets,
+ ulint trx_id_col,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ mtr_t* mtr)
+{
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ ut_d(const page_t* const page = block->page.frame);
+ ut_ad(page_align(rec) == page);
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ const ulint heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+ ulint len;
+ byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(field + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ compile_time_assert(DATA_TRX_ID_LEN == 6);
+ mach_write_to_6(field, trx_id);
+ compile_time_assert(DATA_ROLL_PTR_LEN == 7);
+ mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+ len = 0;
+ if (heap_no > PAGE_HEAP_NO_USER_LOW) {
+ byte* prev = storage + sys_len;
+ for (; len < sys_len && prev[len] == field[len]; len++);
+ if (len > 4) {
+ /* We save space by replacing a single record
+
+ WRITE,offset(storage),byte[13]
+
+ with up to two records:
+
+ MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
+ WRITE|0x80,0,byte[13-len]
+
+ The single WRITE record would be x+13 bytes long (x>2).
+ The MEMMOVE record would be x+1+1 = x+2 bytes, and
+ the second WRITE would be 1+1+13-len = 15-len bytes.
+
+ The total size is: x+13 versus x+2+15-len = x+17-len.
+ To save space, we must have len>4. */
+ memcpy(storage, prev, len);
+ mtr->memmove(*block, ulint(storage - page_zip->data),
+ ulint(storage - page_zip->data) + sys_len,
+ len);
+ storage += len;
+ field += len;
+ if (UNIV_LIKELY(len < sys_len)) {
+ goto write;
+ }
+ } else {
+ len = 0;
+ goto write;
+ }
+ } else {
+write:
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
+ sys_len - len);
+ }
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage - len, field - len, sys_len));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+}
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page.
+Do not clear the data payload, as that would grow the modification log. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+ buf_block_t* block, /*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ const dict_index_t* index, /*!< in: index of rec */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint heap_no;
+ byte* storage;
+ byte* field;
+ ulint len;
+
+ ut_ad(page_align(rec) == block->page.frame);
+ page_zip_des_t* const page_zip = &block->page.zip;
+
+ /* page_zip_validate() would fail here if a record
+ containing externally stored columns is being deleted. */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+ ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+ ut_ad(page_zip_header_cmp(page_zip, block->page.frame));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ if (!page_is_leaf(block->page.frame)) {
+ /* Clear node_ptr. On the compressed page,
+ there is an array of node_ptr immediately before the
+ dense page directory, at the very end of the page. */
+ storage = page_zip_dir_start(page_zip);
+ ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) ==
+ rec_offs_n_fields(offsets) - 1);
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1,
+ &len);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+ ut_ad(!rec_offs_any_extern(offsets));
+ memset(field, 0, REC_NODE_PTR_SIZE);
+ storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
+ len = REC_NODE_PTR_SIZE;
+clear_page_zip:
+ memset(storage, 0, len);
+ mtr->memset(*block, storage - page_zip->data, len, 0);
+ } else if (index->is_clust()) {
+ /* Clear trx_id and roll_ptr. On the compressed page,
+ there is an array of these fields immediately before the
+ dense page directory, at the very end of the page. */
+ const ulint trx_id_pos
+ = dict_col_get_clust_pos(
+ dict_table_get_sys_col(
+ index->table, DATA_TRX_ID), index);
+ field = rec_get_nth_field(rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ /* Clear all BLOB pointers in order to make
+ page_zip_validate() pass. */
+ if (rec_offs_nth_extern(offsets, i)) {
+ field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ ut_ad(len
+ == BTR_EXTERN_FIELD_REF_SIZE);
+ memset(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ storage = page_zip_dir_start(page_zip)
+ - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ goto clear_page_zip;
+ } else {
+ ut_ad(!rec_offs_any_extern(offsets));
+ }
+}
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out] block buffer block
+@param[in,out] rec record on a physical index page
+@param[in] flag the value of the delete-mark flag
+@param[in,out] mtr mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+ mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->page.frame);
+ byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&block->page.zip, block->page.frame, nullptr));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+ buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(rec) == block->page.frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+ byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ byte b= *slot;
+ if (flag)
+ b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ else
+ b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
+ mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
+}
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ uint16_t free_rec,/*!< in: record from which rec was
+ allocated, or 0 */
+ byte* rec, /*!< in: record to insert */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(page_align(cursor->rec) == cursor->block->page.frame);
+ ut_ad(page_align(rec) == cursor->block->page.frame);
+ page_zip_des_t *const page_zip= &cursor->block->page.zip;
+
+ ulint n_dense;
+ byte* slot_rec;
+ byte* slot_free;
+
+ ut_ad(cursor->rec != rec);
+ ut_ad(page_rec_get_next_const(cursor->rec) == rec);
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+ if (page_rec_is_infimum(cursor->rec)) {
+ /* Use the first slot. */
+ slot_rec = page_zip->data + page_zip_get_size(page_zip);
+ } else {
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+ byte* start = end - page_zip_dir_user_size(page_zip);
+
+ if (UNIV_LIKELY(!free_rec)) {
+ /* PAGE_N_RECS was already incremented
+ in page_cur_insert_rec_zip(), but the
+ dense directory slot at that position
+ contains garbage. Skip it. */
+ start += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ slot_rec = page_zip_dir_find_low(start, end,
+ page_offset(cursor->rec));
+ ut_a(slot_rec);
+ }
+
+ /* Read the old n_dense (n_heap may have been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1U);
+
+ if (UNIV_UNLIKELY(free_rec)) {
+ /* The record was allocated from the free list.
+ Shift the dense directory only up to that slot.
+ Note that in this case, n_dense is actually
+ off by one, because page_cur_insert_rec_zip()
+ did not increment n_heap. */
+ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ + PAGE_HEAP_NO_USER_LOW);
+ ut_ad(page_offset(rec) >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, free_rec);
+ ut_ad(slot_free);
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ } else {
+ /* The record was allocated from the heap.
+ Shift the entire dense directory. */
+ ut_ad(rec_get_heap_no_new(rec) == n_dense
+ + PAGE_HEAP_NO_USER_LOW);
+
+ /* Shift to the end of the dense page directory. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ }
+
+ if (const ulint slot_len = ulint(slot_rec - slot_free)) {
+ /* Shift the dense directory to allocate place for rec. */
+ memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free, slot_len);
+ mtr->memmove(*cursor->block, (slot_free - page_zip->data)
+ - PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the inserted record.
+ The "owned" flag must be zero. */
+ uint16_t offs = page_offset(rec);
+ if (rec_get_deleted_flag(rec, true)) {
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+
+ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs);
+ mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
+ - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out] block index page
+@param[in,out] rec record being deleted
+@param[in] index the index that the page belongs to
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] free previous start of the free list
+@param[in,out] mtr mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+ const dict_index_t *index, const rec_offs *offsets,
+ const byte *free, mtr_t *mtr)
+{
+ ut_ad(page_align(rec) == block->page.frame);
+ page_zip_des_t *const page_zip= &block->page.zip;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
+ MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ mach_write_to_2(rec - REC_NEXT,
+ free ? static_cast<uint16_t>(free - rec) : 0);
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
+ block->page.frame);
+ mtr->write<2>(*block, page_free, page_offset(rec));
+ byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
+ block->page.frame);
+ mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
+ mach_read_from_2(garbage));
+ compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
+ memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
+ byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot_rec);
+ uint16_t n_recs= page_get_n_recs(block->page.frame);
+ ut_ad(n_recs);
+ ut_ad(n_recs > 1 || page_get_page_no(block->page.frame) == index->page);
+ /* This could not be done before page_zip_dir_find(). */
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
+ block->page.frame);
+ mtr->write<2>(*block, page_n_recs, n_recs - 1U);
+ memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
+ 2);
+
+ byte *slot_free;
+
+ if (UNIV_UNLIKELY(!free))
+ /* Make the last slot the start of the free list. */
+ slot_free= page_zip->data + page_zip_get_size(page_zip) -
+ PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
+ PAGE_HEAP_NO_USER_LOW);
+ else
+ {
+ slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
+ ut_a(slot_free < slot_rec);
+ /* Grow the free list by one slot by moving the start. */
+ slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
+ if (slot_len)
+ {
+ memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+ slot_len);
+ mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free - page_zip->data, slot_len);
+ }
+
+ /* Write the entry for the deleted record.
+ The "owned" and "deleted" flags will be cleared. */
+ mach_write_to_2(slot_free, page_offset(rec));
+ mtr->zmemcpy(*block, slot_free - page_zip->data, 2);
+
+ if (const ulint n_ext= rec_offs_n_extern(offsets))
+ {
+ ut_ad(index->is_primary());
+ ut_ad(page_is_leaf(block->page.frame));
+
+ /* Shift and zero fill the array of BLOB pointers. */
+ ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+ byte *externs= page_zip->data + page_zip_get_size(page_zip) -
+ (page_dir_get_n_heap(block->page.frame) - PAGE_HEAP_NO_USER_LOW) *
+ PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
+ byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
+
+ /* Shift and zero fill the array. */
+ if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
+ BTR_EXTERN_FIELD_REF_SIZE)
+ {
+ memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
+ mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
+ ext_end - page_zip->data, ext_len);
+ }
+ memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
+ mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
+ page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
+ }
+
+ /* The compression algorithm expects info_bits and n_owned
+ to be 0 for deleted records. */
+ rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */
+
+ page_zip_clear_rec(block, rec, index, offsets, mtr);
+}
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
+page_zip_reorganize(
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint z_level,/*!< in: compression level */
+ mtr_t* mtr, /*!< in: mini-transaction */
+ bool restore)/*!< whether to restore on failure */
+{
+ page_t* page = buf_block_get_frame(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(block->page.zip.data);
+ ut_ad(page_is_comp(page));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+ /* Note that page_zip_validate(page_zip, page, index) may fail here. */
+ MEM_CHECK_DEFINED(page, srv_page_size);
+ MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
+ page_zip_get_size(buf_block_get_page_zip(block)));
+
+ /* Disable logging */
+ mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ temp_block = buf_block_alloc();
+ btr_search_drop_page_hash_index(block, false);
+ temp_page = temp_block->page.frame;
+
+ /* Copy the old page to temporary space */
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_page, block->page.frame,
+ srv_page_size);
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, true);
+ if (index->is_spatial()) {
+ mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
+ memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
+ page + FIL_PAGE_TYPE, 2);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
+ memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
+ }
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ dberr_t err = page_copy_rec_list_end_no_locks(
+ block, temp_block, page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+ memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
+ /* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
+ ut_ad(err != DB_SUCCESS
+ || index->is_clust() || !page_is_leaf(temp_page)
+ || page_get_max_trx_id(page) != 0);
+ /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+ clustered index root pages. */
+ ut_ad(err != DB_SUCCESS
+ || page_get_max_trx_id(page) == 0
+ || (index->is_clust()
+ ? !page_has_siblings(temp_page)
+ : page_is_leaf(temp_page)));
+
+ /* Restore logging. */
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(block, index, z_level, mtr)) {
+ if (restore) {
+ /* Restore the old page and exit. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the bytes that we skip are identical. */
+ ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+ PAGE_HEADER + PAGE_N_RECS + temp_page,
+ PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+ ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
+ srv_page_size - FIL_PAGE_DATA_END
+ + temp_page,
+ FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+ PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+ memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+ srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(page, temp_page, srv_page_size));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+ err = DB_FAIL;
+ } else {
+ lock_move_reorganize_page(block, temp_block);
+ }
+
+ buf_block_free(temp_block);
+ return err;
+}
+
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+void
+page_zip_copy_recs(
+ buf_block_t* block, /*!< in/out: buffer block */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ page_t* page = block->page.frame;
+ page_zip_des_t* page_zip = &block->page.zip;
+
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(!index->table->is_temporary());
+#ifdef UNIV_ZIP_DEBUG
+ /* The B-tree operations that call this function may set
+ FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+ mismatch. A strict page_zip_validate() will be executed later
+ during the B-tree operations. */
+ ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+ if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+ ut_a(page_is_leaf(src));
+ ut_a(dict_index_is_clust(index));
+ }
+
+ MEM_CHECK_ADDRESSABLE(page, srv_page_size);
+ MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip));
+ MEM_CHECK_DEFINED(src, srv_page_size);
+ MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip));
+
+ /* Copy those B-tree page header fields that are related to
+ the records stored in the page. Also copy the field
+ PAGE_MAX_TRX_ID. Skip the rest of the page header and
+ trailer. On the compressed page, there is no trailer. */
+ compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
+ srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
+ memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
+ PAGE_HEADER + src_zip->data,
+ PAGE_HEADER_PRIV_END);
+ memcpy_aligned<2>(PAGE_DATA + page_zip->data,
+ PAGE_DATA + src_zip->data,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+
+ if (dict_index_is_clust(index)) {
+ /* Reset the PAGE_ROOT_AUTO_INC field when copying
+ from a root page. */
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page, 0, 8);
+ memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
+ + page_zip->data, 0, 8);
+ } else {
+ /* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
+ of secondary indexes, and 0 on others. */
+ ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src));
+ }
+
+ /* Copy all fields of src_zip to page_zip, except the pointer
+ to the compressed data page. */
+ {
+ page_zip_t* data = page_zip->data;
+ new (page_zip) page_zip_des_t(*src_zip, false);
+ page_zip->data = data;
+ }
+ ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
+ + page_zip->m_end < page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(src)
+ && UNIV_UNLIKELY(!page_has_prev(src))
+ && UNIV_LIKELY(page_has_prev(page))) {
+ /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+ ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+ rec_t* rec = page + offs;
+ ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+ & REC_INFO_MIN_REC_FLAG);
+ rec[-REC_N_NEW_EXTRA_BYTES]
+ &= byte(~REC_INFO_MIN_REC_FLAG);
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip_compress_write_log(block, index, mtr);
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Calculate the compressed page checksum.
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C
+@return page checksum */
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler)
+{
+ uLong adler;
+ const Bytef* s = static_cast<const byte*>(data);
+
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (!use_adler) {
+ return my_crc32c(0, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ my_crc32c(0, s + FIL_PAGE_TYPE, 2)
+ ^ my_crc32c(0, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ } else {
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(
+ adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ static_cast<uInt>(size)
+ - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return(uint32_t(adler));
+ }
+}
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size)
+{
+ if (buf_is_zeroes(span<const byte>(data, size))) {
+ return true;
+ }
+
+ const uint32_t stored = mach_read_from_4(
+ data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ uint32_t calc = page_zip_calc_checksum(data, size, false);
+
+#ifdef UNIV_INNOCHECKSUM
+ extern FILE* log_file;
+ extern uint32_t cur_page_num;
+
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " checksum: calculated = " UINT32PF ";"
+ " recorded = " UINT32PF "\n", cur_page_num,
+ calc, stored);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (stored == calc) {
+ return(TRUE);
+ }
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ break;
+ default:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+
+ return stored == page_zip_calc_checksum(data, size, true);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+
+ return FALSE;
+}