author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/page/page0zip.cc
parent | Initial commit. (diff)
Adding upstream version 1:10.11.6. (refs: upstream/1%10.11.6, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/page/page0zip.cc')
-rw-r--r-- | storage/innobase/page/page0zip.cc | 4666
1 file changed, 4666 insertions, 0 deletions
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc new file mode 100644 index 00000000..89e6d149 --- /dev/null +++ b/storage/innobase/page/page0zip.cc @@ -0,0 +1,4666 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0zip.cc +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#include "page0zip.h" +#include "fsp0types.h" +#include "page0page.h" +#include "buf0checksum.h" +#include "zlib.h" +#include "span.h" + +using st_::span; + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0log.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "log0recv.h" +#include "row0row.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "lock0lock.h" +#include "srv0srv.h" +#include "buf0lru.h" +#include "srv0mon.h" + +#include <map> +#include <algorithm> + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +page_zip_stat_per_index_t page_zip_stat_per_index; + +/** Compression level to be used by zlib. Settable by user. */ +uint page_zip_level; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +/** Extra bytes of an infimum record */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? */ /* next=(first user rec, or supremum) */ +}; +/** Data bytes of an infimum record */ +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +/** Extra bytes and data bytes of a supremum record */ +static const byte supremum_extra_data alignas(4) [] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +@param b in: memory block +@param s in: size of the memory block, in bytes */ +#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s)) +/** Assert that a BLOB pointer is filled with zero bytes. +@param b in: BLOB pointer */ +#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE) + +/* Enable some extra debugging output. 
This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +MY_ATTRIBUTE((format (printf, 1, 2))) +/**********************************************************************//** +Report a failure to decompress or compress. +@return number of characters printed */ +static +int +page_zip_fail_func( +/*===============*/ + const char* fmt, /*!< in: printf(3) format string */ + ...) /*!< in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +/** Wrapper for page_zip_fail_func() +@param fmt_args in: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/** Dummy wrapper for page_zip_fail_func() +@param fmt_args ignored: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ +{ + ulint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(static_cast<uLong>(2 * (n_fields + 1))); + return(lint(size) > 0 ? size : 0); +} + +/** Check whether a tuple is too big for compressed table +@param[in] index dict index object +@param[in] entry entry for the index +@return true if it's too big, otherwise false */ +bool +page_zip_is_too_big( + const dict_index_t* index, + const dtuple_t* entry) +{ + const ulint zip_size = index->table->space->zip_size(); + + /* Estimate the free space of an empty compressed page. + Subtract one byte for the encoded heap_no in the + modification log. */ + ulint free_space_zip = page_zip_empty_size( + index->n_fields, zip_size); + ulint n_uniq = dict_index_get_n_unique_in_tree(index); + + ut_ad(dict_table_is_comp(index->table)); + ut_ad(zip_size); + + if (free_space_zip == 0) { + return(true); + } + + /* Subtract one byte for the encoded heap_no in the + modification log. */ + free_space_zip--; + + /* There should be enough room for two node pointer + records on an empty non-leaf page. This prevents + infinite page splits. */ + + if (entry->n_fields >= n_uniq + && (REC_NODE_PTR_SIZE + + rec_get_converted_size_comp_prefix( + index, entry->fields, n_uniq, NULL) + /* On a compressed page, there is + a two-byte entry in the dense + page directory for every record. + But there is no record header. */ + - (REC_N_NEW_EXTRA_BYTES - 2) + > free_space_zip / 2)) { + return(true); + } + + return(false); +} + +/*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). 
+@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return ulint(page_dir_get_n_heap(page_zip->data)) + - PAGE_HEAP_NO_USER_LOW; +} + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return length of dense page directory, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); +} + +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); +} + +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). +@return length of dense page directory comprising existing records, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * ulint(page_get_n_recs(page_zip->data)); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/*************************************************************//** +Find the slot of the given record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + byte* slot, /*!< in: start of records */ + byte* end, /*!< in: end of records */ + ulint offset) /*!< in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/*************************************************************//** +Find the slot of the given non-free record in the dense page directory. 
+@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/*************************************************************//** +Find the slot of the given free record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip), + end - page_zip_dir_user_size(page_zip), + offset)); +} + +/*************************************************************//** +Read a given slot in the dense page directory. +@return record offset on the uncompressed page, possibly ORed with +PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */ +UNIV_INLINE +ulint +page_zip_dir_get( +/*=============*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint slot) /*!< in: slot + (0=first user record) */ +{ + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); +} + +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. +@param[in] b ROW_FORMAT=COMPRESSED index page +@param[in] offset byte offset from b.zip.data +@param[in] len length of the data to write */ +inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len) +{ + ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX || + fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE); + ut_ad(page_zip_simple_validate(&b.page.zip)); + ut_ad(offset + len <= page_zip_get_size(&b.page.zip)); + + memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len); + m_last_offset= static_cast<uint16_t>(offset + len); +} + +/** Write a byte string to a ROW_FORMAT=COMPRESSED page. +@param[in] b ROW_FORMAT=COMPRESSED index page +@param[in] dest destination within b.zip.data +@param[in] str the data to write +@param[in] len length of the data to write +@tparam w write request type */ +template<mtr_t::write_type w> +inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str, + ulint len) +{ + byte *d= static_cast<byte*>(dest); + const byte *s= static_cast<const byte*>(str); + ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET); + if (w != FORCED) + { + ut_ad(len); + const byte *const end= d + len; + while (*d++ == *s++) + { + if (d == end) + { + ut_ad(w == MAYBE_NOP); + return; + } + } + s--; + d--; + len= static_cast<ulint>(end - d); + } + ::memcpy(d, s, len); + zmemcpy(b, d - b.page.zip.data, len); +} + +/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page. 
+@param[in,out] block ROW_FORMAT=COMPRESSED index page +@param[in] index the index that the block belongs to +@param[in,out] mtr mini-transaction */ +static void page_zip_compress_write_log(buf_block_t *block, + dict_index_t *index, mtr_t *mtr) +{ + ut_ad(!index->is_ibuf()); + + if (!mtr->is_logged()) + return; + + const page_t *page= block->page.frame; + const page_zip_des_t *page_zip= &block->page.zip; + /* Read the number of user records. */ + ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) - + PAGE_HEAP_NO_USER_LOW; + /* Multiply by uncompressed of size stored per record */ + if (!page_is_leaf(page)) + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + else if (index->is_clust()) + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + else + trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE; + /* Add the space occupied by BLOB pointers. */ + trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_a(page_zip->m_end > PAGE_DATA); + compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA); + ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); + + mtr->init(block); + mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV); + + if (trailer_size) + mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size, + trailer_size); +} + +/******************************************************//** +Determine how many externally stored columns are contained +in existing records with smaller heap_no than rec. */ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + const rec_t* rec, /*!< in: compact physical record + on a B-tree leaf page */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/**********************************************************************//** +Encode the length of a fixed-length column. +@return buf + length of encoded val */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + byte* buf, /*!< in: pointer to buffer where to write */ + ulint val) /*!< in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/**********************************************************************//** +Write the index information for the compressed page. 
+@return used size of buf */ +ulint +page_zip_fields_encode( +/*===================*/ + ulint n, /*!< in: number of fields + to compress */ + const dict_index_t* index, /*!< in: index comprising + at least n fields */ + ulint trx_id_pos, + /*!< in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + const dict_col_t* column + = dict_field_get_col(field); + + if (DATA_BIG_COL(column)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = (byte) val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + + if (fixed_sum && UNIV_UNLIKELY + (fixed_sum + field->fixed_len + > DICT_MAX_FIXED_COL_LEN)) { + /* Write out the length of the + preceding non-nullable fields, + to avoid exceeding the maximum + length of a fixed-length column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + col++; + } + + trx_id_col = col; + fixed_sum = field->fixed_len; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode( + buf, ulint(field->fixed_len) << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + i = trx_id_col; + } else { + /* Write out the number of nullable fields */ + i = index->n_nullable; + } + + if (i < 128) { + *buf++ = (byte) i; + } else { + *buf++ = (byte) (0x80 | i >> 8); + *buf++ = (byte) i; + } + + ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2); + return((ulint) (buf - buf_start)); +} + +/**********************************************************************//** +Populate the dense page directory from the sparse directory. 
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /*!< in: compact page */ + byte* buf, /*!< in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /*!< in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY(!page_has_prev(page))) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < srv_page_size - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); + compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK + & (PAGE_ZIP_DIR_SLOT_MASK + 1))); + compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK + >= UNIV_ZIP_SIZE_MAX - 1); + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (info_bits & REC_INFO_DELETED_FLAG) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. */ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(ulint(rec_get_status(rec)) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(ulint(rec_get_status(rec)) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +extern "C" { + +/**********************************************************************//** +Allocate memory for zlib. */ +static +void* +page_zip_zalloc( +/*============*/ + void* opaque, /*!< in/out: memory heap */ + uInt items, /*!< in: number of items to allocate */ + uInt size) /*!< in: size of an item in bytes */ +{ + return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size)); +} + +/**********************************************************************//** +Deallocate memory for zlib. 
*/ +static +void +page_zip_free( +/*==========*/ + void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */ + void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */ +{ +} + +} /* extern "C" */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + z_stream* strm = static_cast<z_stream*>(stream); + + strm->zalloc = page_zip_zalloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/** Symbol for enabling compression and decompression diagnostics */ +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/** Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +static bool page_zip_compress_dbg; +/** Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +static unsigned page_zip_compress_log; + +/**********************************************************************//** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +static +int +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/*!< in: log file, or NULL */ + z_streamp strm, /*!< in/out: compressed stream for deflate() */ + int flush) /*!< in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + if (fwrite(strm->next_in, 1, strm->avail_in, logfile) + != strm->avail_in) { + perror("fwrite"); + } + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +/** Debug wrapper for the zlib compression routine deflate(). +Log the operation if page_zip_compress_dbg is set. +@param strm in/out: compressed stream +@param flush in: flushing method +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +/** Declaration of the logfile parameter */ +# define FILE_LOGFILE FILE* logfile, +/** The logfile parameter */ +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +/** Empty declaration of the logfile parameter */ +# define FILE_LOGFILE +/** Missing logfile parameter */ +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/**********************************************************************//** +Compress the records of a node pointer page. 
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_node_ptrs( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + rec_offs* offsets = NULL; + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Only leaf nodes may contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + ut_ad(!c_stream->avail_in); + + /* Compress the data bytes, except node_ptr. */ + c_stream->next_in = (byte*) rec; + c_stream->avail_in = static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, REC_NODE_PTR_SIZE); + c_stream->next_in += REC_NODE_PTR_SIZE; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress the records of a leaf node of a secondary index. +@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_sec( +/*==================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense) /*!< in: size of recs[] */ +{ + int err = Z_OK; + + ut_ad(n_dense > 0); + + do { + const rec_t* rec = *recs++; + + /* Compress everything up to this record. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (UNIV_LIKELY(c_stream->avail_in != 0)) { + MEM_CHECK_DEFINED(c_stream->next_in, + c_stream->avail_in); + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + c_stream->next_in = (byte*) rec; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress a record of a leaf node of a clustered index that contains +externally stored columns. 
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust_ext( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col, /*!< in: position of of DB_TRX_ID */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + byte** externs, /*!< in/out: pointer to the next + available BLOB pointer */ + ulint* n_blobs) /*!< in/out: number of + externally stored columns */ +{ + int err; + ulint i; + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, i)); + /* Store trx_id and roll_ptr + in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + i++; + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + if (UNIV_LIKELY(c_stream->avail_in != 0)) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the page + modification log. */ + + if (UNIV_UNLIKELY + (c_stream->avail_out + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + return(Z_BUF_ERROR); + } + + ut_ad(*externs == c_stream->next_out + + c_stream->avail_out + + 1/* end of modif. log */); + + c_stream->next_in + += BTR_EXTERN_FIELD_REF_SIZE; + + /* Skip deleted records. */ + if (UNIV_LIKELY_NULL + (page_zip_dir_find_low( + storage, deleted, + page_offset(rec)))) { + continue; + } + + (*n_blobs)++; + c_stream->avail_out + -= BTR_EXTERN_FIELD_REF_SIZE; + *externs -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Copy the BLOB pointer */ + memcpy(*externs, c_stream->next_in + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + return(Z_OK); +} + +/**********************************************************************//** +Compress the records of a leaf node of a clustered index. 
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust( +/*====================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* n_blobs, /*!< in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /*!< index of the trx_id column */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + rec_offs* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. */ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + if (rec_offs_any_extern(offsets)) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. */ + c_stream->avail_in = static_cast<uInt>( + rec + rec_offs_data_size(offsets) - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err);} + +/** Attempt to compress a ROW_FORMAT=COMPRESSED page. 
+@retval true on success +@retval false on failure; block->page.zip will be left intact. */ +bool +page_zip_compress( + buf_block_t* block, /*!< in/out: buffer block */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: commpression level */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + z_stream c_stream; + int err; + byte* fields; /*!< index field information */ + byte* buf; /*!< compressed payload of the + page */ + byte* buf_end; /* end of buf */ + ulint n_dense; + ulint slot_size; /* amount of uncompressed bytes + per record */ + const rec_t** recs; /*!< dense page directory, + sorted by address */ + mem_heap_t* heap; + ulint trx_id_col = ULINT_UNDEFINED; + ulint n_blobs = 0; + byte* storage; /* storage of uncompressed + columns */ + const ulonglong ns = my_interval_timer(); +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. */ + my_bool cmp_per_index_enabled; + cmp_per_index_enabled = srv_cmp_per_index_enabled; + + page_t* page = block->page.frame; + page_zip_des_t* page_zip = &block->page.zip; + + ut_a(page_is_comp(page)); + ut_a(fil_page_index_page_check(page)); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(!dict_index_is_ibuf(index)); + + MEM_CHECK_DEFINED(page, srv_page_size); + + /* Check the data that will be omitted. */ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (page_is_empty(page)) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + const ulint n_fields = page_is_leaf(page) + ? dict_index_get_n_fields(index) + : dict_index_get_n_unique_in_tree_nonleaf(index); + index_id_t ind_id = index->id; + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ib::info() << "compress " + << static_cast<void*>(page_zip) << " " + << static_cast<const void*>(page) << " " + << page_is_leaf(page) << " " + << n_fields << " " << n_dense; + } + + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. */ + if (fwrite(page, 1, srv_page_size, logfile) + != srv_page_size) { + perror("fwrite"); + } + /* Record the compressed size as zero. + This will be overwritten at successful exit. 
*/ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed++; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + MONITOR_INC(MONITOR_PAGE_COMPRESS); + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + srv_page_size * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = static_cast<const rec_t**>( + mem_heap_zalloc(heap, n_dense * sizeof *recs)); + + fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2)); + + buf = static_cast<byte*>( + mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA)); + + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, static_cast<int>(level), + Z_DEFLATED, static_cast<int>(srv_page_size_shift), + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + + /* Subtract the space reserved for uncompressed data. */ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1); + + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = index->db_trx_id(); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= uInt(n_dense * slot_size); + c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index, + trx_id_col, fields)); + c_stream.next_in = fields; + + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. 
*/ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in = static_cast<uInt>( + page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page)); + ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR); + + MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].compressed_usec + += time_diff; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed_usec + += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + return false; + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ + MEM_MAKE_DEFINED(buf, c_stream.total_out); +#endif /* HAVE_valgrind && !memory_sanitizer */ + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. */ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out); + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1); + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_zip_compress_write_log(block, index, mtr); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. 
*/ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, srv_page_size, SEEK_SET); + if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) { + perror("fwrite"); + } + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[ind_id].compressed_ok++; + page_zip_stat_per_index[ind_id].compressed_usec += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); + } + + return true; +} + +/**********************************************************************//** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /*!< in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + index->zip_pad.mutex.~mutex(); + mem_heap_free(index->heap); + + dict_mem_table_free(table); + } +} + +/**********************************************************************//** +Read the index information for the compressed page. +@return own: dummy index describing the page, or NULL on error */ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + const byte* buf, /*!< in: index information */ + const byte* end, /*!< in: end of buf */ + ulint* trx_id_col,/*!< in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ + bool is_spatial)/*< in: is spatial index or not */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_table_t::create({C_STRING_WITH_LEN("ZIP_DUMMY")}, + nullptr, n, 0, DICT_TF_COMPACT, 0); + index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n); + index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. */ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. 
*/ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { +fail: + page_zip_fields_free(index); + return NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. */ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + goto fail; + } else { + index->n_nullable = static_cast<unsigned>(val) + & dict_index_t::MAX_N_FIELDS; + } + } + + /* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */ + index->n_core_fields = index->n_fields; + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + + ut_ad(b == end); + + if (is_spatial) { + index->type |= DICT_SPATIAL; + } + + return(index); +} + +/**********************************************************************//** +Populate the sparse page directory from the dense directory. +@return TRUE on success, FALSE on failure */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ibool +page_zip_dir_decode( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + page_t* page, /*!< in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /*!< out: dense page directory sorted by + ascending address (and heap_no) */ + ulint n_dense)/*!< in: number of user records, and + size of recs[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. */ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1U); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + std::sort(recs, recs + n_dense); + return(TRUE); +} + +/**********************************************************************//** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. 
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + page_t* page, /*!< in/out: uncompressed page */ + ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_DEL) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. */ + for (;;) { + if (UNIV_UNLIKELY(!offs) + || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + + page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n", + (ulong) offs)); + return(FALSE); + } + + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + if (++i == n) { + break; + } + + offs = page_zip_dir_get(page_zip, i); + rec_set_next_offs_new(rec, offs); + } + + /* Terminate the free list. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + rec_set_next_offs_new(rec, 0); + + return(TRUE); +} + +/**********************************************************************//** +Apply the modification log to a record containing externally stored +columns. Do not copy the fields that are stored separately. +@return pointer to modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log_ext( +/*===================*/ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col, /*!< in: position of of DB_TRX_ID */ + const byte* data, /*!< in: modification log */ + const byte* end) /*!< in: end of modification log */ +{ + ulint i; + ulint len; + byte* next_out = rec; + + /* Check if there are any externally stored columns. + For each externally stored column, skip the + BTR_EXTERN_FIELD_REF. 
*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, + i, &len); + if (UNIV_UNLIKELY(dst - next_out >= end - data) + || UNIV_UNLIKELY + (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) + || rec_offs_nth_extern(offsets, i)) { + page_zip_fail(("page_zip_apply_log_ext:" + " trx_id len %lu," + " %p - %p >= %p - %p\n", + (ulong) len, + (const void*) dst, + (const void*) next_out, + (const void*) end, + (const void*) data)); + return(NULL); + } + + memcpy(next_out, data, ulint(dst - next_out)); + data += ulint(dst - next_out); + next_out = dst + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + + len += ulint(dst - next_out) + - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext:" + " ext %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(next_out, data, len); + data += len; + next_out += len + + BTR_EXTERN_FIELD_REF_SIZE; + } + } + + /* Copy the last bytes of the record. */ + len = ulint(rec_get_end(rec, offsets) - next_out); + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext:" + " last %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(next_out, data, len); + data += len; + + return(data); +} + +/**********************************************************************//** +Apply the modification log to an uncompressed page. +Do not copy the fields that are stored separately. +@return pointer to end of modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log( +/*===============*/ + const byte* data, /*!< in: modification log */ + ulint size, /*!< in: maximum length of the log, in bytes */ + rec_t** recs, /*!< in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/*!< in: size of recs[] */ + ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */ + ulint trx_id_col,/*!< in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /*!< in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /*!< in: index of the page */ + rec_offs* offsets)/*!< in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. 
*/ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. */ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + rec_offs* offs; + offs = rec_get_offsets(rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + + compile_time_assert(REC_STATUS_NODE_PTR == TRUE); + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + /* Silence a debug assertion in rec_offs_make_valid(). + This will be overwritten in page_zip_set_extra_bytes(), + called by page_zip_decompress_low(). */ + ut_d(rec[-REC_NEW_INFO_BITS] = 0); + rec_offs_make_valid(rec, index, n_core != 0, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. */ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log:" + " %lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log:" + " trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. 
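+
+	For instance (sizes invented for illustration): if the fields
+	preceding DB_TRX_ID occupy l = 12 bytes, those 12 bytes were
+	just copied from the log, the 13 bytes of DB_TRX_ID,DB_ROLL_PTR
+	are skipped (they are restored from the uncompressed trailer
+	afterwards), and the remaining len bytes up to
+	rec_get_end(rec, offsets) are copied from the log below.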
*/ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = ulint(rec_get_end(rec, offsets) - b); + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log:" + " clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/**********************************************************************//** +Set the heap_no in a record, and skip the fixed-size record header +that is not included in the d_stream. +@return TRUE on success, FALSE if d_stream does not end at rec */ +static +ibool +page_zip_decompress_heap_no( +/*========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + ulint& heap_status) /*!< in/out: heap_no and status bits */ +{ + if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) { + /* n_dense has grown since the page was last compressed. */ + return(FALSE); + } + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a node pointer page. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + rec_offs* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>( + n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE)); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < srv_page_size + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. 
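+
+	REC_NODE_PTR_SIZE is 4 bytes (as asserted elsewhere in this
+	file), so inflate() below is handed exactly
+	rec_offs_data_size(offsets) - 4 bytes of output space; the
+	child page number itself is zero-filled here and restored
+	later in this function from the array of node pointers that is
+	kept uncompressed just below the dense page directory at the
+	end of the page.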
*/ + d_stream->avail_out =static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in); +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, 0, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index)) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index)), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip_dir_start_low(page_zip, n_dense); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a leaf node of a secondary index. 
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_sec( +/*====================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + rec_offs* offsets) /*!< in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uint>( + n_dense * PAGE_ZIP_DIR_SLOT_SIZE); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = static_cast<uint>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_sec:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + + ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in)); + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + index->n_fields, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, FALSE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + /* There are no uncompressed columns on leaf pages of + secondary indexes. 
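+
+	Consequently the only per-record trailer overhead here is the
+	dense directory slot that was subtracted from avail_in at the
+	top of this function.  A rough sketch of the per-record
+	reservation by page type (the helper name is invented for
+	illustration; 2, 4, 6 and 7 correspond to
+	PAGE_ZIP_DIR_SLOT_SIZE, REC_NODE_PTR_SIZE, DATA_TRX_ID_LEN and
+	DATA_ROLL_PTR_LEN, and BLOB pointers of 20 bytes each come on
+	top of this on clustered index leaf pages):
+
+	    inline unsigned trailer_bytes_per_rec(bool leaf, bool clust)
+	    {
+	        if (!leaf) return 2 + 4;     // dir slot + node pointer
+	        if (clust) return 2 + 6 + 7; // dir slot + DB_TRX_ID + DB_ROLL_PTR
+	        return 2;                    // secondary index leaf: dir slot only
+	    }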
*/ + + return(TRUE); +} + +/**********************************************************************//** +Decompress a record of a leaf node of a clustered index that contains +externally stored columns. +@return TRUE on success */ +static +ibool +page_zip_decompress_clust_ext( +/*==========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col) /*!< in: position of of DB_TRX_ID */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, i, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " len[%lu] = %lu\n", + (ulong) i, (ulong) len)); + return(FALSE); + } + + if (rec_offs_nth_extern(offsets, i)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " DB_TRX_ID at %lu is ext\n", + (ulong) i)); + return(FALSE); + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear the BLOB pointer in case + the record will be deleted and the + space will not be reused. Note that + the final initialization of the BLOB + pointers (copying from "externs" + or clearing) will have to take place + only after the page modification log + has been applied. Otherwise, we + could end up with an uninitialized + BLOB pointer when a record is deleted, + reallocated and deleted. */ + memset(d_stream->next_out, 0, + BTR_EXTERN_FIELD_REF_SIZE); + d_stream->next_out + += BTR_EXTERN_FIELD_REF_SIZE; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Compress the records of a leaf node of a clustered index. 
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_clust( +/*======================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint trx_id_col, /*!< index of the trx_id column */ + rec_offs* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err; + ulint slot; + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + const byte* storage; + const byte* externs; + + ut_a(dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>(n_dense) + * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out =static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < srv_page_size + - PAGE_ZIP_START - PAGE_DIR); + err = inflate(d_stream, Z_SYNC_FLUSH); + switch (err) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (UNIV_LIKELY(!d_stream->avail_out)) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + /* This is a leaf page in a clustered index. */ + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately. */ + + if (rec_offs_any_extern(offsets)) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. 
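+
+	That is, the bytes between d_stream->next_out (which already
+	points past the zero-filled DB_TRX_ID,DB_ROLL_PTR or BLOB
+	pointer bytes handled above) and rec_get_end(rec, offsets).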
*/ + d_stream->avail_out = static_cast<uInt>( + rec_get_end(rec, offsets) - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + ulint(page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) + - 1U) + - d_stream->next_out)); + } + + ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in)); + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + index->n_fields, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data); + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip_dir_start_low(page_zip, n_dense); + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + bool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. 
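+
+	The rule applied below: a record that is still in use (not
+	found on the free list by page_zip_dir_find_free()) has each
+	20-byte pointer copied back from the "externs" area, which is
+	filled downwards from just below the DB_TRX_ID,DB_ROLL_PTR
+	array, and page_zip->n_blobs is incremented for each pointer;
+	a record on the free list has its pointers cleared instead.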
*/ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust:" + " %p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record: + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_low( +/*====================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /*!< dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + rec_offs* offsets; + + ut_ad(page_zip_simple_validate(page_zip)); + MEM_CHECK_ADDRESSABLE(page, srv_page_size); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size); + + recs = static_cast<rec_t**>( + mem_heap_alloc(heap, n_dense * sizeof *recs)); + + if (all) { + /* Copy the page header. */ + memcpy_aligned<2>(page, page_zip->data, PAGE_DATA); + } else { + /* Check that the bytes that we skip are identical. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(FIL_PAGE_TYPE + page, + FIL_PAGE_TYPE + page_zip->data, + PAGE_HEADER - FIL_PAGE_TYPE)); + ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page, + PAGE_HEADER + PAGE_LEVEL + page_zip->data, + PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL))); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + /* Copy the mutable parts of the page header. */ + memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE); + memcpy_aligned<2>(PAGE_HEADER + page, + PAGE_HEADER + page_zip->data, + PAGE_LEVEL - PAGE_N_DIR_SLOTS); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the page headers match after copying. 
*/ + ut_a(!memcmp(page, page_zip->data, PAGE_DATA)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + +#ifdef UNIV_ZIP_DEBUG + /* Clear the uncompressed page, except the header. */ + memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA); +#endif /* UNIV_ZIP_DEBUG */ + MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA); + + /* Copy the page directory. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. */ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (page_is_empty(page)) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1 + + page, supremum_extra_data, + sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. */ + d_stream.avail_in = static_cast<uInt>( + page_zip_get_size(page_zip) - (PAGE_DATA + 1)); + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START); + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift)) + != Z_OK)) { + ut_error; + } + + /* Decode the zlib header and the index information. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL, + fil_page_get_type(page) == FIL_PAGE_RTREE); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + offsets = static_cast<rec_offs*>( + mem_heap_alloc(heap, n * sizeof(ulint))); + + rec_offs_set_n_alloc(offsets, n); + } + + /* Decompress the records in heap_no order. */ + if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. 
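+
+	(This branch is taken because trx_id_col, determined via
+	page_zip_fields_decode() above, is defined only for clustered
+	index leaf pages; it gives the position of DB_TRX_ID within
+	the index.)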
*/ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + MEM_CHECK_DEFINED(page, srv_page_size); + + page_zip_fields_free(index); + mem_heap_free(heap); + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. +@return TRUE on success, FALSE on failure */ +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + const ulonglong ns = my_interval_timer(); + + if (!page_zip_decompress_low(page_zip, page, all)) { + return(FALSE); + } + + const uint64_t time_diff = (my_interval_timer() - ns) / 1000; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + /* Update the stat counter for LRU policy. */ + buf_LRU_stat_inc_unzip(); + + MONITOR_INC(MONITOR_PAGE_DECOMPRESS); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Dump a block of memory on the standard error stream. */ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /*!< in: name of the data structure */ + const void* buf, /*!< in: data */ + ulint size) /*!< in: length of the data, in bytes */ +{ + const byte* s = static_cast<const byte*>(buf); + ulint addr; + const ulint width = 32; /* bytes per line */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +/** Dump a block of memory on the standard error stream. +@param buf in: data +@param size in: length of the data, in bytes */ +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/** Flag: make page_zip_validate() compare page headers only */ +bool page_zip_validate_header_only; + +/**********************************************************************//** +Check that the compressed and decompressed pages match. 
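+This is done by decompressing page_zip into a temporary page and comparing it
+with the given uncompressed page; when sloppy validation is requested, a
+mismatch consisting only of the REC_INFO_MIN_REC_FLAG is tolerated.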
+@return TRUE if valid, FALSE if not */ +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_ROOT_AUTO_INC) + /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch + on the clustered index root page (page number 3 in .ibd files). + That allows concurrent readers (holding buf_block_t::lock S-latch). + Because we do not know what type of a latch our caller is holding, + we will ignore the field on clustered index root pages in order + to avoid false positives. */ + || (page_get_page_no(page) != 3/* clustered index root page */ + && memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], + &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8)) + || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END], + &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END], + PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, srv_page_size); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + srv_page_size aligned. */ + page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + temp_page_zip = *page_zip; + valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. Let us tolerate that difference when we + are performing a sloppy validation. 
*/ + + rec_offs* offsets; + mem_heap_t* heap; + const rec_t* rec; + const rec_t* trec; + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5/*REC_NEW_INFO_BITS*/; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + srv_page_size - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. */ + page_zip_fail(("page_zip_validate:" + " min_rec_flag" + " (%s" UINT32PF "," UINT32PF + ",0x%02x)\n", + sloppy ? "ignored, " : "", + page_get_space_id(page), + page_get_page_no(page), + page[offset])); + /* We don't check for spatial index, since + the "minimum record" could be deleted when + doing rtr_update_mbr_field. + GIS_FIXME: need to validate why + rtr_update_mbr_field.() could affect this */ + if (index && dict_index_is_spatial(index)) { + valid = true; + } else { + valid = sloppy; + } + goto func_exit; + } + } + + /* Compare the pointers in the PAGE_FREE list. */ + rec = page_header_get_ptr(page, PAGE_FREE); + trec = page_header_get_ptr(temp_page, PAGE_FREE); + + while (rec || trec) { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate:" + " PAGE_FREE list: %u!=%u\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + goto func_exit; + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } + + /* Compare the records. */ + heap = NULL; + offsets = NULL; + rec = page_rec_get_next_low( + page + PAGE_NEW_INFIMUM, TRUE); + trec = page_rec_get_next_low( + temp_page + PAGE_NEW_INFIMUM, TRUE); + const ulint n_core = (index && page_is_leaf(page)) + ? index->n_fields : 0; + + do { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate:" + " record list: 0x%02x!=0x%02x\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + break; + } + + if (index) { + /* Compare the data. */ + offsets = rec_get_offsets( + rec, index, offsets, n_core, + ULINT_UNDEFINED, &heap); + + if (memcmp(rec - rec_offs_extra_size(offsets), + trec - rec_offs_extra_size(offsets), + rec_offs_size(offsets))) { + page_zip_fail( + ("page_zip_validate:" + " record content: 0x%02x", + (unsigned) page_offset(rec))); + valid = FALSE; + break; + } + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } while (rec || trec); + + if (heap) { + mem_heap_free(heap); + } + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, srv_page_size); + page_zip_hexdump(temp_page, srv_page_size); + } + aligned_free(temp_page); + return(valid); +} + +/**********************************************************************//** +Check that the compressed and decompressed pages match. 
+@return TRUE if valid, FALSE if not */ +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ +{ + return(page_zip_validate_low(page_zip, page, index, + recv_recovery_is_on())); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Assert that the compressed and decompressed page headers match. +@return TRUE */ +static +ibool +page_zip_header_cmp( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const byte* page) /*!< in: uncompressed page */ +{ + ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. +@return end of modification log */ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + buf_block_t* block, /*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + const dict_index_t*index, /*!< in: record descriptor */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + ulint create, /*!< in: nonzero=insert, zero=update */ + ulint trx_id_col, /*!< in: position of DB_TRX_ID */ + ulint heap_no, /*!< in: heap number of rec */ + byte* storage, /*!< in: end of dense page directory */ + byte* data, /*!< in: end of modification log */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + + ut_ad(rec_offs_validate(rec, index, offsets)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. 
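+
+	In other words, the assertion below only checks that the bytes
+	about to be appended to the modification log (the record's data
+	size minus the 13 bytes of DB_TRX_ID,DB_ROLL_PTR and minus one
+	20-byte reference per externally stored column) will not run
+	into the existing BLOB pointer area; when create != 0, room for
+	the n_ext new pointers is made by the memmove() further down,
+	which shifts the already stored pointers towards lower
+	addresses.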
*/ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * FIELD_REF_SIZE + < externs - FIELD_REF_SIZE * page_zip->n_blobs); + + if (n_ext) { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * FIELD_REF_SIZE; + + if (create) { + page_zip->n_blobs = (page_zip->n_blobs + n_ext) + & ((1U << 12) - 1); + ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE); + if (ulint len = ulint(externs - ext_end)) { + byte* ext_start = ext_end + - n_ext * FIELD_REF_SIZE; + memmove(ext_start, ext_end, len); + mtr->memmove(*block, + ext_start - page_zip->data, + ext_end - page_zip->data, len); + } + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - start); + memcpy(data, start, ulint(src - start)); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + constexpr ulint sys_len = DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + byte* sys = storage - sys_len * (heap_no - 1); + memcpy(sys, src, sys_len); + i++; /* skip also roll_ptr */ + mtr->zmemcpy(*block, sys - page_zip->data, sys_len); + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len >= FIELD_REF_SIZE); + src += len - FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, ulint(src - start)); + data += src - start; + start = src + FIELD_REF_SIZE; + + /* Store the BLOB pointer. */ + externs -= FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, FIELD_REF_SIZE); + mtr->zmemcpy(*block, externs - page_zip->data, + FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - ulint(start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/** Write an entire record to the ROW_FORMAT=COMPRESSED page. +The data must already have been written to the uncompressed page. 
+@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in] rec record in the uncompressed page +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] create nonzero=insert, zero=update +@param[in,out] mtr mini-transaction */ +void page_zip_write_rec(buf_block_t *block, const byte *rec, + const dict_index_t *index, const rec_offs *offsets, + ulint create, mtr_t *mtr) +{ + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + byte s = *slot; + /* Copy the delete mark. */ + if (rec_get_deleted_flag(rec, TRUE)) { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. + On non-leaf pages, the delete-mark flag is garbage. */ + ut_ad(!index->is_primary() || !page_is_leaf(page) + || row_get_rec_trx_id(rec, index, offsets)); + s |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8)); + } + + if (s != *slot) { + *slot = s; + mtr->zmemcpy(*block, slot - page_zip->data, 1); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. */ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip_dir_start(page_zip); + + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + block, + rec, index, offsets, create, + index->db_trx_id(), heap_no, + storage, data, mtr); + } else { + /* Locate trx_id and roll_ptr. 
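+
+	As a concrete example of the addressing below (user records
+	have heap numbers starting at 2): the record with heap_no 2
+	keeps its 13 bytes of DB_TRX_ID,DB_ROLL_PTR at storage - 13,
+	heap_no 3 at storage - 26, and so on, where storage points
+	just below the dense page directory; the other field bytes of
+	the record are appended to the modification log.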
*/ + ulint len; + const byte* src + = rec_get_nth_field(rec, offsets, + index->db_trx_id(), + &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + index->db_roll_ptr(), &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, ulint(src - rec)); + data += src - rec; + + /* Store trx_id and roll_ptr. */ + constexpr ulint sys_len + = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + byte* sys = storage - sys_len * (heap_no - 1); + memcpy(sys, src, sys_len); + + src += sys_len; + mtr->zmemcpy(*block, sys - page_zip->data, + sys_len); + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - ulint(src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + ulint len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr, + rec + len, REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + mtr->zmemcpy(*block, page_zip->m_end, + data - page_zip->data - page_zip->m_end); + page_zip->m_end = uint16_t(data - page_zip->data); + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
*/ +void +page_zip_write_blob_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const byte* field; + byte* externs; + const page_t* const page = block->page.frame; + page_zip_des_t* const page_zip = &block->page.zip; + ulint blob_no; + ulint len; + + ut_ad(page_align(rec) == page); + ut_ad(index != NULL); + ut_ad(offsets != NULL); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field, + BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. */ +void +page_zip_write_node_ptr( +/*====================*/ + buf_block_t* block, /*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + byte* field; + byte* storage; + page_zip_des_t* const page_zip = &block->page.zip; + + ut_d(const page_t* const page = block->page.frame); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, size); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + compile_time_assert(REC_NODE_PTR_SIZE == 4); + mach_write_to_4(field, ptr); + mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE); +} + +/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record. 
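+When more than four leading bytes of the new DB_TRX_ID,DB_ROLL_PTR match the
+values already stored for the preceding heap number, a MEMMOVE record plus a
+shorter WRITE record is emitted instead of one full 13-byte WRITE; see the
+size arithmetic in the comment inside the function.  For example (numbers
+chosen only to illustrate that arithmetic), with len = 6 matching bytes the
+redo volume is x + 17 - 6 = x + 11 bytes instead of x + 13.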
+@param[in,out] block ROW_FORMAT=COMPRESSED page +@param[in,out] rec record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] trx_id_field field number of DB_TRX_ID (number of PK fields) +@param[in] trx_id DB_TRX_ID value (transaction identifier) +@param[in] roll_ptr DB_ROLL_PTR value (undo log pointer) +@param[in,out] mtr mini-transaction */ +void +page_zip_write_trx_id_and_roll_ptr( + buf_block_t* block, + byte* rec, + const rec_offs* offsets, + ulint trx_id_col, + trx_id_t trx_id, + roll_ptr_t roll_ptr, + mtr_t* mtr) +{ + page_zip_des_t* const page_zip = &block->page.zip; + + ut_d(const page_t* const page = block->page.frame); + ut_ad(page_align(rec) == page); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + const ulint heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len; + + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + ulint len; + byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + compile_time_assert(DATA_TRX_ID_LEN == 6); + mach_write_to_6(field, trx_id); + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + len = 0; + if (heap_no > PAGE_HEAP_NO_USER_LOW) { + byte* prev = storage + sys_len; + for (; len < sys_len && prev[len] == field[len]; len++); + if (len > 4) { + /* We save space by replacing a single record + + WRITE,offset(storage),byte[13] + + with up to two records: + + MEMMOVE,offset(storage),len(1 byte),+13(1 byte), + WRITE|0x80,0,byte[13-len] + + The single WRITE record would be x+13 bytes long (x>2). + The MEMMOVE record would be x+1+1 = x+2 bytes, and + the second WRITE would be 1+1+13-len = 15-len bytes. + + The total size is: x+13 versus x+2+15-len = x+17-len. + To save space, we must have len>4. */ + memcpy(storage, prev, len); + mtr->memmove(*block, ulint(storage - page_zip->data), + ulint(storage - page_zip->data) + sys_len, + len); + storage += len; + field += len; + if (UNIV_LIKELY(len < sys_len)) { + goto write; + } + } else { + len = 0; + goto write; + } + } else { +write: + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field, + sys_len - len); + } +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage - len, field - len, sys_len)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. 
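+Only the parts that are duplicated in the trailer of the compressed page are
+cleared: the node pointer on non-leaf pages, or DB_TRX_ID and DB_ROLL_PTR on
+clustered index leaf pages, in both the record and the corresponding trailer
+array, plus any BLOB pointers inside the record itself.  Nothing needs to be
+cleared for secondary index leaf page records.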
*/ +static +void +page_zip_clear_rec( +/*===============*/ + buf_block_t* block, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint heap_no; + byte* storage; + byte* field; + ulint len; + + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t* const page_zip = &block->page.zip; + + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. */ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, block->page.frame)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if (!page_is_leaf(block->page.frame)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. */ + storage = page_zip_dir_start(page_zip); + ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + storage -= (heap_no - 1) * REC_NODE_PTR_SIZE; + len = REC_NODE_PTR_SIZE; +clear_page_zip: + memset(storage, 0, len); + mtr->memset(*block, storage - page_zip->data, len, 0); + } else if (index->is_clust()) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. */ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + field = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + storage = page_zip_dir_start(page_zip) + - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + goto clear_page_zip; + } else { + ut_ad(!rec_offs_any_extern(offsets)); + } +} + +/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record. 
+@param[in,out] block buffer block +@param[in,out] rec record on a physical index page +@param[in] flag the value of the delete-mark flag +@param[in,out] mtr mini-transaction */ +void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag, + mtr_t *mtr) +{ + ut_ad(page_align(rec) == block->page.frame); + byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec)); + byte b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + else + b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8)); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(&block->page.zip, block->page.frame, nullptr)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +void +page_zip_rec_set_owned( +/*===================*/ + buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag, /*!< in: the owned flag (nonzero=TRUE) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t *const page_zip= &block->page.zip; + byte *slot= page_zip_dir_find(page_zip, page_offset(rec)); + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + byte b= *slot; + if (flag) + b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + else + b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8)); + mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1); +} + +/**********************************************************************//** +Insert a record to the dense page directory. */ +void +page_zip_dir_insert( +/*================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + uint16_t free_rec,/*!< in: record from which rec was + allocated, or 0 */ + byte* rec, /*!< in: record to insert */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(page_align(cursor->rec) == cursor->block->page.frame); + ut_ad(page_align(rec) == cursor->block->page.frame); + page_zip_des_t *const page_zip= &cursor->block->page.zip; + + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(cursor->rec != rec); + ut_ad(page_rec_get_next_const(cursor->rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(cursor->rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(cursor->rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1U); + + if (UNIV_UNLIKELY(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. 
*/ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(page_offset(rec) >= free_rec); + slot_free = page_zip_dir_find(page_zip, free_rec); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + if (const ulint slot_len = ulint(slot_rec - slot_free)) { + /* Shift the dense directory to allocate place for rec. */ + memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, slot_len); + mtr->memmove(*cursor->block, (slot_free - page_zip->data) + - PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } + + /* Write the entry for the inserted record. + The "owned" flag must be zero. */ + uint16_t offs = page_offset(rec); + if (rec_get_deleted_flag(rec, true)) { + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs); + mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data + - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE); +} + +/** Shift the dense page directory and the array of BLOB pointers +when a record is deleted. +@param[in,out] block index page +@param[in,out] rec record being deleted +@param[in] index the index that the page belongs to +@param[in] offsets rec_get_offsets(rec, index) +@param[in] free previous start of the free list +@param[in,out] mtr mini-transaction */ +void page_zip_dir_delete(buf_block_t *block, byte *rec, + const dict_index_t *index, const rec_offs *offsets, + const byte *free, mtr_t *mtr) +{ + ut_ad(page_align(rec) == block->page.frame); + page_zip_des_t *const page_zip= &block->page.zip; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets)); + MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + mach_write_to_2(rec - REC_NEXT, + free ? static_cast<uint16_t>(free - rec) : 0); + byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, page_free, page_offset(rec)); + byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, garbage, rec_offs_size(offsets) + + mach_read_from_2(garbage)); + compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2); + memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4); + byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot_rec); + uint16_t n_recs= page_get_n_recs(block->page.frame); + ut_ad(n_recs); + ut_ad(n_recs > 1 || page_get_page_no(block->page.frame) == index->page); + /* This could not be done before page_zip_dir_find(). */ + byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + + block->page.frame); + mtr->write<2>(*block, page_n_recs, n_recs - 1U); + memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs, + 2); + + byte *slot_free; + + if (UNIV_UNLIKELY(!free)) + /* Make the last slot the start of the free list. 
*/ + slot_free= page_zip->data + page_zip_get_size(page_zip) - + PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) - + PAGE_HEAP_NO_USER_LOW); + else + { + slot_free= page_zip_dir_find_free(page_zip, page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free+= PAGE_ZIP_DIR_SLOT_SIZE; + } + + const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0; + if (slot_len) + { + memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_len); + mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free - page_zip->data, slot_len); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + mtr->zmemcpy(*block, slot_free - page_zip->data, 2); + + if (const ulint n_ext= rec_offs_n_extern(offsets)) + { + ut_ad(index->is_primary()); + ut_ad(page_is_leaf(block->page.frame)); + + /* Shift and zero fill the array of BLOB pointers. */ + ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + byte *externs= page_zip->data + page_zip_get_size(page_zip) - + (page_dir_get_n_heap(block->page.frame) - PAGE_HEAP_NO_USER_LOW) * + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE; + byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE; + + /* Shift and zero fill the array. */ + if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) * + BTR_EXTERN_FIELD_REF_SIZE) + { + memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len); + mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE, + ext_end - page_zip->data, ext_len); + } + memset(ext_end, 0, n_ext * FIELD_REF_SIZE); + mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0); + page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1); + } + + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. */ + rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */ + + page_zip_clear_rec(block, rec, index, offsets, mtr); +} + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, redo log will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return error code +@retval DB_FAIL on overflow; the block_zip will be left intact */ +dberr_t +page_zip_reorganize( + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint z_level,/*!< in: compression level */ + mtr_t* mtr, /*!< in: mini-transaction */ + bool restore)/*!< whether to restore on failure */ +{ + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(block->page.zip.data); + ut_ad(page_is_comp(page)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(!index->table->is_temporary()); + /* Note that page_zip_validate(page_zip, page, index) may fail here. 
*/ + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data, + page_zip_get_size(buf_block_get_page_zip(block))); + + /* Disable logging */ + mtr_log_t log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + temp_block = buf_block_alloc(); + btr_search_drop_page_hash_index(block, false); + temp_page = temp_block->page.frame; + + /* Copy the old page to temporary space */ + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_page, block->page.frame, + srv_page_size); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, true); + if (index->is_spatial()) { + mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE); + memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE, + page + FIL_PAGE_TYPE, 2); + memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8); + memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8); + } + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + dberr_t err = page_copy_rec_list_end_no_locks( + block, temp_block, page_get_infimum_rec(temp_page), + index, mtr); + + /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */ + memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8); + /* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */ + ut_ad(err != DB_SUCCESS + || index->is_clust() || !page_is_leaf(temp_page) + || page_get_max_trx_id(page) != 0); + /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than + clustered index root pages. */ + ut_ad(err != DB_SUCCESS + || page_get_max_trx_id(page) == 0 + || (index->is_clust() + ? !page_has_siblings(temp_page) + : page_is_leaf(temp_page))); + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(block, index, z_level, mtr)) { + if (restore) { + /* Restore the old page and exit. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the bytes that we skip are identical. */ + ut_a(!memcmp(page, temp_page, PAGE_HEADER)); + ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page, + PAGE_HEADER + PAGE_N_RECS + temp_page, + PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS))); + ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page, + srv_page_size - FIL_PAGE_DATA_END + + temp_page, + FIL_PAGE_DATA_END)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page, + PAGE_N_RECS - PAGE_N_DIR_SLOTS); + memcpy(PAGE_DATA + page, PAGE_DATA + temp_page, + srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(page, temp_page, srv_page_size)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + + err = DB_FAIL; + } else { + lock_move_reorganize_page(block, temp_block); + } + + buf_block_free(temp_block); + return err; +} + +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. 
*/ +void +page_zip_copy_recs( + buf_block_t* block, /*!< in/out: buffer block */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + page_t* page = block->page.frame; + page_zip_des_t* page_zip = &block->page.zip; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(!index->table->is_temporary()); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, index, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + MEM_CHECK_ADDRESSABLE(page, srv_page_size); + MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip)); + MEM_CHECK_DEFINED(src, srv_page_size); + MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. */ + compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src, + srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END)); + memcpy_aligned<2>(PAGE_HEADER + page_zip->data, + PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy_aligned<2>(PAGE_DATA + page_zip->data, + PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + if (dict_index_is_clust(index)) { + /* Reset the PAGE_ROOT_AUTO_INC field when copying + from a root page. */ + memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + page, 0, 8); + memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC + + page_zip->data, 0, 8); + } else { + /* The PAGE_MAX_TRX_ID must be nonzero on leaf pages + of secondary indexes, and 0 on others. */ + ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src)); + } + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + new (page_zip) page_zip_des_t(*src_zip, false); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index)) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(!page_has_prev(src)) + && UNIV_LIKELY(page_has_prev(page))) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] + &= byte(~REC_INFO_MIN_REC_FLAG); + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_zip_compress_write_log(block, index, mtr); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the compressed page checksum. 
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of an XOR of 3 CRC-32C
+@return page checksum */
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler)
+{
+	uLong adler;
+	const Bytef* s = static_cast<const byte*>(data);
+
+	/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+	and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+	ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	if (!use_adler) {
+		return my_crc32c(0, s + FIL_PAGE_OFFSET,
+				 FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+			^ my_crc32c(0, s + FIL_PAGE_TYPE, 2)
+			^ my_crc32c(0, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				    size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	} else {
+		adler = adler32(0L, s + FIL_PAGE_OFFSET,
+				FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+		adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+		adler = adler32(
+			adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			static_cast<uInt>(size)
+			- FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		return(uint32_t(adler));
+	}
+}
+
+/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
+@param data ROW_FORMAT=COMPRESSED page
+@param size size of the page, in bytes
+@return whether the stored checksum matches innodb_checksum_algorithm */
+bool page_zip_verify_checksum(const byte *data, size_t size)
+{
+	if (buf_is_zeroes(span<const byte>(data, size))) {
+		return true;
+	}
+
+	const uint32_t stored = mach_read_from_4(
+		data + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	uint32_t calc = page_zip_calc_checksum(data, size, false);
+
+#ifdef UNIV_INNOCHECKSUM
+	extern FILE* log_file;
+	extern uint32_t cur_page_num;
+
+	if (log_file) {
+		fprintf(log_file, "page::" UINT32PF ";"
+			" checksum: calculated = " UINT32PF ";"
+			" recorded = " UINT32PF "\n", cur_page_num,
+			calc, stored);
+	}
+#endif /* UNIV_INNOCHECKSUM */
+
+	if (stored == calc) {
+		return(TRUE);
+	}
+
+#ifndef UNIV_INNOCHECKSUM
+	switch (srv_checksum_algorithm) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		break;
+	default:
+		if (stored == BUF_NO_CHECKSUM_MAGIC) {
+			return(TRUE);
+		}
+
+		return stored == page_zip_calc_checksum(data, size, true);
+	}
+#endif /* !UNIV_INNOCHECKSUM */
+
+	return FALSE;
+}
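
Note (editor's illustration, not part of the patch): the checksum layout implemented by page_zip_calc_checksum() above can be restated compactly. The 4-byte checksum field at the start of the page, the 8-byte FIL_PAGE_LSN field, and the 8 bytes starting at FIL_PAGE_FILE_FLUSH_LSN are skipped; the remaining three byte ranges are combined. The sketch below is a minimal standalone approximation that hard-codes the usual FIL_PAGE_* offsets (4, 16, 24, 26, 34) and uses plain zlib: the Adler-32 branch mirrors the legacy algorithm, while the CRC branch only illustrates the XOR-of-three structure, because the server uses CRC-32C via my_crc32c(), a different polynomial from zlib's crc32(), so its values will not match real page checksums. The helper name zip_checksum_sketch is hypothetical.

/* Standalone sketch (assumed helper, not part of page0zip.cc).
   Combines the same three byte ranges as page_zip_calc_checksum():
   [4,16)  page number and the FIL_PAGE_PREV/FIL_PAGE_NEXT pointers,
   [24,26) FIL_PAGE_TYPE,
   [34,n)  everything from the space id to the end of the compressed page.
   The checksum field [0,4), the LSN [16,24) and bytes [26,34) are skipped. */
#include <zlib.h>
#include <cstdint>
#include <cstddef>

uint32_t zip_checksum_sketch(const unsigned char* page, size_t size,
                             bool use_adler)
{
	if (use_adler) {
		/* Legacy "innodb" algorithm: one Adler-32 chained
		over the three ranges. */
		uLong a = adler32(0L, page + 4, 12);
		a = adler32(a, page + 24, 2);
		a = adler32(a, page + 34, static_cast<uInt>(size - 34));
		return static_cast<uint32_t>(a);
	}
	/* Structure of the CRC variant: three independent checksums
	XORed together.  NOTE: zlib crc32() is not CRC-32C, so this
	does not reproduce the server's my_crc32c() values. */
	return static_cast<uint32_t>(crc32(0L, page + 4, 12))
		^ static_cast<uint32_t>(crc32(0L, page + 24, 2))
		^ static_cast<uint32_t>(crc32(0L, page + 34,
					      static_cast<uInt>(size - 34)));
}

A caller comparing against a stored value would read the big-endian 32-bit word at offset 0 (FIL_PAGE_SPACE_OR_CHKSUM), which is what page_zip_verify_checksum() does with mach_read_from_4() before falling back to the Adler-32 variant for non-strict checksum settings.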