From a68fb2d8219f6bccc573009600e9f23e89226a5e Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sat, 4 May 2024 20:04:16 +0200
Subject: Adding upstream version 1:10.6.11.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 storage/rocksdb/rdb_datadic.h | 1639 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1639 insertions(+)
 create mode 100644 storage/rocksdb/rdb_datadic.h

(limited to 'storage/rocksdb/rdb_datadic.h')

diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
new file mode 100644
index 00000000..fb368a17
--- /dev/null
+++ b/storage/rocksdb/rdb_datadic.h
@@ -0,0 +1,1639 @@
+/*
+   Copyright (c) 2012,2013 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
+#pragma once
+
+/* C++ standard header files */
+#include <cstdlib>
+#include <algorithm>
+#include <atomic>
+#include <map>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include <array>
+
+/* C standard header files */
+#ifndef _WIN32
+#include <arpa/inet.h>
+#endif
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+#include "./properties_collector.h"
+#include "./rdb_buff.h"
+#include "./rdb_utils.h"
+
+namespace myrocks {
+
+class Rdb_dict_manager;
+class Rdb_key_def;
+class Rdb_field_packing;
+class Rdb_cf_manager;
+class Rdb_ddl_manager;
+
+const uint32_t GTID_BUF_LEN = 60;
+
+class Rdb_convert_to_record_key_decoder {  // no data members; all ops static
+ public:
+  Rdb_convert_to_record_key_decoder() = default;
+  Rdb_convert_to_record_key_decoder(
+      const Rdb_convert_to_record_key_decoder &decoder) = delete;  // no copy
+  Rdb_convert_to_record_key_decoder &operator=(
+      const Rdb_convert_to_record_key_decoder &decoder) = delete;  // no assign
+  static int decode(uchar *const buf, uint *offset, Rdb_field_packing *fpi,
+                    TABLE *table, Field *field, bool has_unpack_info,
+                    Rdb_string_reader *reader,
+                    Rdb_string_reader *unpack_reader);  // NOTE(review): return codes defined in the .cc - confirm
+  static int skip(const Rdb_field_packing *fpi, const Field *field,
+                  Rdb_string_reader *reader, Rdb_string_reader *unpack_reader);
+
+ private:
+  static int decode_field(Rdb_field_packing *fpi, Field *field,
+                          Rdb_string_reader *reader,
+                          const uchar *const default_value,
+                          Rdb_string_reader *unpack_reader);  // internal helper; presumably used by decode() - confirm
+};
+
+/*
+  @brief
+  Field packing context.
+  The idea is to ensure that a call to rdb_index_field_pack_t function
+  is followed by a call to rdb_make_unpack_info_t.
+
+  @detail
+  For some datatypes, unpack_info is produced as a side effect of
+  rdb_index_field_pack_t function call.
+  For other datatypes, packing is just calling make_sort_key(), while
+  rdb_make_unpack_info_t is a custom function.
+  In order to accommodate both cases, we require both calls to be made and
+  unpack_info is passed as context data between the two.
+*/
+class Rdb_pack_field_context {
+ public:
+  // Destination for unpack_info; NULL means we're not producing unpack_info.
+  Rdb_string_writer *writer;
+
+  explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
+      : writer{writer_arg} {}
+
+  Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
+  Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
+};
+
+class Rdb_key_field_iterator {  // NOTE(review): member roles below are inferred from names - confirm in the .cc
+ private:
+  Rdb_field_packing *m_pack_info;  // presumably the per-key-part packing descriptors
+  int m_iter_index;
+  int m_iter_end;
+  TABLE *m_table;
+  Rdb_string_reader *m_reader;      // reader passed to the constructor as `reader`
+  Rdb_string_reader *m_unp_reader;  // reader passed to the constructor as `unp_reader`
+  uint m_curr_bitmap_pos;
+  const MY_BITMAP *m_covered_bitmap;
+  uchar *m_buf;
+  bool m_has_unpack_info;
+  const Rdb_key_def *m_key_def;
+  bool m_secondary_key;
+  bool m_hidden_pk_exists;
+  bool m_is_hidden_pk;
+  bool m_is_null;  // presumably what get_is_null() reports
+  Field *m_field;  // presumably what get_field() reports
+  uint m_offset;
+  Rdb_field_packing *m_fpi;
+
+ public:
+  Rdb_key_field_iterator(const Rdb_key_field_iterator &) = delete;  // no copy
+  Rdb_key_field_iterator &operator=(const Rdb_key_field_iterator &) = delete;
+  Rdb_key_field_iterator(const Rdb_key_def *key_def,
+                         Rdb_field_packing *pack_info,
+                         Rdb_string_reader *reader,
+                         Rdb_string_reader *unp_reader, TABLE *table,
+                         bool has_unpack_info, const MY_BITMAP *covered_bitmap,
+                         uchar *buf);
+
+  int next();  // NOTE(review): return-code convention not visible in this header
+  bool has_next();
+  bool get_is_null() const;
+  Field *get_field() const;
+  int get_field_index() const;
+  void *get_dst() const;
+};
+
+struct Rdb_collation_codec;
+struct Rdb_index_info;
+
+/*
+  C-style "virtual table" allowing different handling of packing logic based
+  on the field type. See Rdb_field_packing::setup() implementation.
+  */
+using rdb_make_unpack_info_t = void (*)(const Rdb_collation_codec *codec,
+                                        const Field *field,
+                                        Rdb_pack_field_context *pack_ctx);  // called after the matching pack function
+using rdb_index_field_unpack_t = int (*)(Rdb_field_packing *fpi, Field *field,
+                                         uchar *field_ptr,
+                                         Rdb_string_reader *reader,
+                                         Rdb_string_reader *unpack_reader);  // returns UNPACK_SUCCESS or UNPACK_FAILURE
+using rdb_index_field_skip_t = int (*)(const Rdb_field_packing *fpi,
+                                       const Field *field,
+                                       Rdb_string_reader *reader);  // NOTE(review): return convention not shown here
+using rdb_index_field_pack_t = void (*)(Rdb_field_packing *fpi, Field *field,
+                                        uchar *buf, uchar **dst,
+                                        Rdb_pack_field_context *pack_ctx);  // may produce unpack_info as a side effect
+
+const uint RDB_INVALID_KEY_LEN = uint(-1);
+
+/* How much one checksum occupies when stored in the record */
+const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);
+
+/*
+  How much the checksum data occupies in record, in total.
+  It is storing two checksums plus 1 tag-byte.
+*/
+const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;
+
+/*
+  Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
+  checksums.
+*/
+const char RDB_CHECKSUM_DATA_TAG = 0x01;
+
+/*
+  Unpack data is variable length. The header is 1 tag-byte plus a two byte
+  length field. The length field includes the header as well.
+*/
+const char RDB_UNPACK_DATA_TAG = 0x02;
+const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
+const size_t RDB_UNPACK_HEADER_SIZE =
+    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;
+
+/*
+  This header format is 1 tag-byte plus a two byte length field plus a two byte
+  covered bitmap. The length field includes the header size.
+*/
+const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
+const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
+const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
+const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
+    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
+    RDB_COVERED_BITMAP_SIZE;
+
+/*
+  Data dictionary index info field sizes.
+*/
+const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
+const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
+const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
+const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
+const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
+
+// Possible return values for rdb_index_field_unpack_t functions.
+enum {
+  UNPACK_SUCCESS = 0,
+  UNPACK_FAILURE = 1,
+};
+
+/*
+  An object of this class represents information about an index in an SQL
+  table. It provides services to encode and decode index tuples.
+
+  Note: a table (as in, on-disk table) has a single Rdb_key_def object which
+  is shared across multiple TABLE* objects and may be used simultaneously from
+  different threads.
+
+  There are several data encodings:
+
+  === SQL LAYER ===
+  SQL layer uses two encodings:
+
+  - "Table->record format". This is the format that is used for the data in
+     the record buffers, table->record[i]
+
+  - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
+    lookup functions, like handler::index_read_map().
+
+  === Inside RocksDB ===
+  Primary Key is stored as a mapping:
+
+    index_tuple -> StoredRecord
+
+  StoredRecord is in Table->record format, except for blobs, which are stored
+  in-place. See ha_rocksdb::convert_record_to_storage_format for details.
+
+  Secondary indexes are stored as one of two variants:
+
+    index_tuple -> unpack_info
+    index_tuple -> empty_string
+
+  index_tuple here is the form of key that can be compared with memcmp(), aka
+  "mem-comparable form".
+
+  unpack_info is extra data that allows restoring the original value from its
+  mem-comparable form. It is present only if the index supports index-only
+  reads.
+*/
+
+class Rdb_key_def {
+ public:
+  /* Convert a key from KeyTupleFormat to mem-comparable form */
+  uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
+                        uchar *const packed_tuple, uchar *const record_buffer,
+                        const uchar *const key_tuple,
+                        const key_part_map &keypart_map) const;
+
+  uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
+                    uchar *tuple, uchar *const packed_tuple,
+                    uchar *const pack_buffer,
+                    Rdb_string_writer *const unpack_info,
+                    uint *const n_null_fields) const;
+  /* Convert a key from Table->record format to mem-comparable form */
+  uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
+                   const uchar *const record, uchar *const packed_tuple,
+                   Rdb_string_writer *const unpack_info,
+                   const bool should_store_row_debug_checksums,
+                   const longlong hidden_pk_id = 0, uint n_key_parts = 0,
+                   uint *const n_null_fields = nullptr,
+                   const char *const ttl_bytes = nullptr) const;
+  /* Pack the hidden primary key into mem-comparable form. */
+  uint pack_hidden_pk(const longlong hidden_pk_id,
+                      uchar *const packed_tuple) const;
+  int unpack_record(TABLE *const table, uchar *const buf,
+                    const rocksdb::Slice *const packed_key,
+                    const rocksdb::Slice *const unpack_info,
+                    const bool verify_row_debug_checksums) const;
+
+  static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
+  int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
+                   std::size_t *const column_index) const;
+
+  size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
+
+  /* Get the key that is the "infimum" for this index */
+  inline void get_infimum_key(uchar *const key, uint *const size) const {
+    rdb_netbuf_store_index(key, m_index_number);  // key := this index's number
+    *size = INDEX_NUMBER_SIZE;  // key size reported to the caller
+  }
+
+  /* Get the key that is a "supremum" for this index */
+  inline void get_supremum_key(uchar *const key, uint *const size) const {
+    rdb_netbuf_store_index(key, m_index_number + 1);  // key := next index number
+    *size = INDEX_NUMBER_SIZE;  // key size reported to the caller
+  }
+
+  /*
+    Get the first key that you need to position at to start iterating.
+    Stores into *key a "supremum" or "infimum" key value for the index.
+    @parameters key    OUT  Big Endian, value is m_index_number or
+                            m_index_number + 1
+    @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
+    @return Number of bytes in the key that are usable for bloom filter use.
+  */
+  inline int get_first_key(uchar *const key, uint *const size) const {
+    if (!m_is_reverse_cf) {
+      // Forward column family: the key is the infimum, whose value is
+      // m_index_number itself, so its full size is returned.
+      get_infimum_key(key, size);
+      return INDEX_NUMBER_SIZE;
+    }
+
+    // Reverse column family: the key is the supremum
+    // (m_index_number + 1).
+    get_supremum_key(key, size);
+
+    // Count how many leading bytes of the supremum are the same as the
+    // stored bytes of m_index_number.
+    uchar index_number_bytes[INDEX_NUMBER_SIZE];
+    rdb_netbuf_store_index(index_number_bytes, m_index_number);
+    uchar *const key_end = key + INDEX_NUMBER_SIZE;
+    return static_cast<int>(
+        std::mismatch(key, key_end, index_number_bytes).first - key);
+  }
+
+  /*
+    The same as get_first_key, but get the key for the last entry in the index
+    @parameters key    OUT  Big Endian, value is m_index_number or
+                            m_index_number + 1
+    @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
+
+    @return Number of bytes in the key that are usable for bloom filter use.
+  */
+  inline int get_last_key(uchar *const key, uint *const size) const {
+    if (m_is_reverse_cf) {
+      // Reverse column family: the key is the infimum, whose value is
+      // m_index_number itself, so its full size is returned.
+      get_infimum_key(key, size);
+      return INDEX_NUMBER_SIZE;
+    }
+
+    // Forward column family: the key is the supremum
+    // (m_index_number + 1).
+    get_supremum_key(key, size);
+
+    // Count how many leading bytes of the supremum are the same as the
+    // stored bytes of m_index_number.
+    uchar index_number_bytes[INDEX_NUMBER_SIZE];
+    rdb_netbuf_store_index(index_number_bytes, m_index_number);
+    uchar *const key_end = key + INDEX_NUMBER_SIZE;
+    return static_cast<int>(
+        std::mismatch(key, key_end, index_number_bytes).first - key);
+  }
+
+  /* Make a key that is right after the given key. */
+  static int successor(uchar *const packed_tuple, const uint len);
+
+  /* Make a key that is right before the given key. */
+  static int predecessor(uchar *const packed_tuple, const uint len);
+
+  /*
+    This can be used to compare prefixes.
+    if  X is a prefix of Y, then we consider that X = Y.
+  */
+  // b describes the lookup key, which can be a prefix of a.
+  // b might be outside of the index_number range, if successor() is called.
+  int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
+    DBUG_ASSERT(covers_key(a));
+    // Only the shared leading bytes are compared, so a prefix compares equal.
+    const size_t common_len = std::min(a.size(), b.size());
+    return memcmp(a.data(), b.data(), common_len);
+  }
+
+  /* Check if given mem-comparable key belongs to this index */
+  bool covers_key(const rocksdb::Slice &slice) const {
+    // A key shorter than the index-number prefix cannot belong to this index.
+    const bool has_full_prefix = slice.size() >= INDEX_NUMBER_SIZE;
+    // Otherwise the key is ours exactly when its leading INDEX_NUMBER_SIZE
+    // bytes equal the stored form of this index's number.
+    return has_full_prefix &&
+           memcmp(slice.data(), m_index_number_storage_form,
+                  INDEX_NUMBER_SIZE) == 0;
+  }
+
+  void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
+
+  bool covers_lookup(const rocksdb::Slice *const unpack_info,
+                     const MY_BITMAP *const map) const;
+
+  inline bool use_covered_bitmap_format() const {
+    if (m_index_type != INDEX_TYPE_SECONDARY) return false;
+    return m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
+  }
+
+  /* Indicates that all key parts can be unpacked to cover a secondary lookup */
+  bool can_cover_lookup() const;
+
+  /*
+    Return true if the passed mem-comparable key
+    - is from this index, and
+    - it matches the passed key prefix (the prefix is also in mem-comparable
+      form)
+  */
+  bool value_matches_prefix(const rocksdb::Slice &value,
+                            const rocksdb::Slice &prefix) const {
+    return covers_key(value) ? (cmp_full_keys(value, prefix) == 0) : false;
+  }
+
+  uint32 get_keyno() const { return m_keyno; }  // number of this index in the table
+
+  uint32 get_index_number() const { return m_index_number; }  // global number of this index
+
+  GL_INDEX_ID get_gl_index_id() const {
+    // Pair the column family handle's ID with this index's number.
+    return {m_cf_handle->GetID(), m_index_number};
+  }
+
+  int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
+                           const uint part_num) const;
+
+  /* Must only be called for secondary keys: */
+  uint get_primary_key_tuple(const TABLE *const tbl,
+                             const Rdb_key_def &pk_descr,
+                             const rocksdb::Slice *const key,
+                             uchar *const pk_buffer) const;
+
+  uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
+                           uchar *sk_buffer, uint *n_null_fields) const;
+
+  /* Return max length of mem-comparable form */
+  uint max_storage_fmt_length() const { return m_maxlength; }
+
+  uint get_key_parts() const { return m_key_parts; }
+
+  uint get_ttl_field_index() const { return m_ttl_field_index; }
+
+  /*
+    Get a field object for key part #part_no
+
+    @detail
+      SQL layer thinks unique secondary indexes and indexes in partitioned
+      tables are not "Extended" with Primary Key columns.
+
+      Internally, we always extend all indexes with PK columns. This function
+      uses our definition of how the index is Extended.
+  */
+  inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
+
+  const std::string &get_name() const { return m_name; }
+
+  const rocksdb::SliceTransform *get_extractor() const {
+    return m_prefix_extractor.get();
+  }
+
+  static size_t get_unpack_header_size(char tag);
+
+  Rdb_key_def &operator=(const Rdb_key_def &) = delete;
+  Rdb_key_def(const Rdb_key_def &k);
+  Rdb_key_def(uint indexnr_arg, uint keyno_arg,
+              rocksdb::ColumnFamilyHandle *cf_handle_arg,
+              uint16_t index_dict_version_arg, uchar index_type_arg,
+              uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
+              bool is_per_partition_cf, const char *name,
+              Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
+              uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
+  ~Rdb_key_def();
+
+  enum {
+    INDEX_NUMBER_SIZE = 4,
+    VERSION_SIZE = 2,
+    CF_NUMBER_SIZE = 4,
+    CF_FLAG_SIZE = 4,
+    PACKED_SIZE = 4,  // one int
+  };
+
+  // bit flags for combining bools when writing to disk
+  enum {
+    REVERSE_CF_FLAG = 1,
+    AUTO_CF_FLAG = 2,  // Deprecated
+    PER_PARTITION_CF_FLAG = 4,
+  };
+
+  // bit flags which denote myrocks specific fields stored in the record
+  // currently only used for TTL.
+  enum INDEX_FLAG {
+    TTL_FLAG = 1 << 0,
+
+    // MAX_FLAG marks where the actual record starts
+    // This flag always needs to be set to the last index flag enum.
+    MAX_FLAG = TTL_FLAG << 1,
+  };
+
+  // Set of flags to ignore when comparing two CF-s and determining if
+  // they're same.
+  static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
+
+  // Data dictionary types
+  enum DATA_DICT_TYPE {
+    DDL_ENTRY_INDEX_START_NUMBER = 1,
+    INDEX_INFO = 2,
+    CF_DEFINITION = 3,
+    BINLOG_INFO_INDEX_NUMBER = 4,
+    DDL_DROP_INDEX_ONGOING = 5,
+    INDEX_STATISTICS = 6,
+    MAX_INDEX_ID = 7,
+    DDL_CREATE_INDEX_ONGOING = 8,
+    AUTO_INC = 9,
+    // MariaDB: 10 through 12 are already taken in upstream
+    TABLE_VERSION = 20, // MariaDB: table version record
+    END_DICT_INDEX_ID = 255
+  };
+
+  // Data dictionary schema version. Introduce newer versions
+  // if changing schema layout
+  enum {
+    DDL_ENTRY_INDEX_VERSION = 1,
+    CF_DEFINITION_VERSION = 1,
+    BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
+    DDL_DROP_INDEX_ONGOING_VERSION = 1,
+    MAX_INDEX_ID_VERSION = 1,
+    DDL_CREATE_INDEX_ONGOING_VERSION = 1,
+    AUTO_INCREMENT_VERSION = 1,
+    // Version for index stats is stored in IndexStats struct
+  };
+
+  // Index info version.  Introduce newer versions when changing the
+  // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
+  // latest version number.
+  enum {
+    INDEX_INFO_VERSION_INITIAL = 1,  // Obsolete
+    INDEX_INFO_VERSION_KV_FORMAT,
+    INDEX_INFO_VERSION_GLOBAL_ID,
+    // There is no change to data format in this version, but this version
+    // verifies KV format version, whereas previous versions do not. A version
+    // bump is needed to prevent older binaries from skipping the KV version
+    // check inadvertently.
+    INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
+    // This changes the data format to include a 8 byte TTL duration for tables
+    INDEX_INFO_VERSION_TTL,
+    // This changes the data format to include a bitmap before the TTL duration
+    // which will indicate in the future whether TTL or other special fields
+    // are turned on or off.
+    INDEX_INFO_VERSION_FIELD_FLAGS,
+    // This normally points to the latest version (currently it does).
+    INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
+  };
+
+  // MyRocks index types
+  enum {
+    INDEX_TYPE_PRIMARY = 1,
+    INDEX_TYPE_SECONDARY = 2,
+    INDEX_TYPE_HIDDEN_PRIMARY = 3,
+  };
+
+  // Key/Value format version for each index type
+  enum {
+    PRIMARY_FORMAT_VERSION_INITIAL = 10,
+    // This change includes:
+    //  - For columns that can be unpacked with unpack_info, PK
+    //    stores the unpack_info.
+    //  - DECIMAL datatype is no longer stored in the row (because
+    //    it can be decoded from its mem-comparable form)
+    //  - VARCHAR-columns use endspace-padding.
+    PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
+    // This change includes:
+    //  - Binary encoded variable length fields have a new format that avoids
+    //    an inefficiency where data that was a multiple of 8 bytes in length
+    //    had an extra 9 bytes of encoded data.
+    PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
+    // This change includes support for TTL
+    //  - This means that when TTL is specified for the table an 8-byte TTL
+    //    field is prepended in front of each value.
+    PRIMARY_FORMAT_VERSION_TTL = 13,
+    PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
+
+    SECONDARY_FORMAT_VERSION_INITIAL = 10,
+    // This changes the SK format to include unpack_info.
+    SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
+    // This change includes:
+    //  - Binary encoded variable length fields have a new format that avoids
+    //    an inefficiency where data that was a multiple of 8 bytes in length
+    //    had an extra 9 bytes of encoded data.
+    SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
+    // This change includes support for TTL
+    //  - This means that when TTL is specified for the table an 8-byte TTL
+    //    field is prepended in front of each value.
+    SECONDARY_FORMAT_VERSION_TTL = 13,
+    SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
+    // This change includes support for covering SK lookups for varchars.  A
+    // 2-byte bitmap is added after the tag-byte to unpack_info only for
+    // records which have covered varchar columns. Currently waiting before
+    // enabling in prod.
+    SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
+  };
+
+  void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
+
+  static uint extract_ttl_duration(const TABLE *const table_arg,
+                                   const Rdb_tbl_def *const tbl_def_arg,
+                                   uint64 *ttl_duration);
+  static uint extract_ttl_col(const TABLE *const table_arg,
+                              const Rdb_tbl_def *const tbl_def_arg,
+                              std::string *ttl_column, uint *ttl_field_index,
+                              bool skip_checks = false);
+  inline bool has_ttl() const { return m_ttl_duration > 0; }  // true iff the TTL duration is non-zero
+
+  static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
+  static uint32 calculate_index_flag_offset(uint32 index_flags,
+                                            enum INDEX_FLAG flag,
+                                            uint *const field_length = nullptr);
+  void write_index_flag_field(Rdb_string_writer *const buf,
+                              const uchar *const val,
+                              enum INDEX_FLAG flag) const;
+
+  static const std::string gen_qualifier_for_table(
+      const char *const qualifier, const std::string &partition_name = "");
+  static const std::string gen_cf_name_qualifier_for_partition(
+      const std::string &s);
+  static const std::string gen_ttl_duration_qualifier_for_partition(
+      const std::string &s);
+  static const std::string gen_ttl_col_qualifier_for_partition(
+      const std::string &s);
+
+  static const std::string parse_comment_for_qualifier(
+      const std::string &comment, const TABLE *const table_arg,
+      const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
+      const char *const qualifier);
+
+  rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; }
+
+  /* Check if keypart #kp can be unpacked from index tuple */
+  inline bool can_unpack(const uint kp) const;
+  /* Check if keypart #kp needs unpack info */
+  inline bool has_unpack_info(const uint kp) const;
+
+  /* Check if given table has a hidden primary key */
+  static bool table_has_hidden_pk(const TABLE *const table);
+
+  void report_checksum_mismatch(const bool is_key, const char *const data,
+                                const size_t data_size) const;
+
+  /* Check if the index format version is at least pk_min if it is a PK,
+     or at least sk_min if it is an SK. */
+  bool index_format_min_check(const int pk_min, const int sk_min) const;
+
+  static void pack_with_make_sort_key(
+      Rdb_field_packing *const fpi, Field *const field,
+      uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
+      Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
+
+  static void pack_with_varchar_encoding(
+      Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
+      Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
+
+  static void pack_with_varchar_space_pad(
+      Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
+      Rdb_pack_field_context *const pack_ctx);
+
+  static int unpack_integer(Rdb_field_packing *const fpi, Field *const field,
+                            uchar *const to, Rdb_string_reader *const reader,
+                            Rdb_string_reader *const unp_reader
+                                MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_double(
+      Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
+      Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+      Rdb_string_reader *const reader,
+      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_float(
+      Rdb_field_packing *const fpi,
+      Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+      Rdb_string_reader *const reader,
+      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field,
+                               uchar *const to, Rdb_string_reader *const reader,
+                               Rdb_string_reader *const unp_reader
+                                   MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_binary_or_utf8_varchar(
+      Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+      Rdb_string_reader *const reader,
+      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_binary_or_utf8_varchar_space_pad(
+      Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+      Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
+
+  static int unpack_newdate(
+      Rdb_field_packing *const fpi,
+      Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+      Rdb_string_reader *const reader,
+      Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field,
+                             uchar *dst, Rdb_string_reader *const reader,
+                             Rdb_string_reader *const unp_reader
+                                 MY_ATTRIBUTE((__unused__)));
+
+  static int unpack_unknown_varchar(Rdb_field_packing *const fpi,
+                                    Field *const field, uchar *dst,
+                                    Rdb_string_reader *const reader,
+                                    Rdb_string_reader *const unp_reader);
+
+  static int unpack_simple_varchar_space_pad(
+      Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+      Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
+
+  static int unpack_simple(Rdb_field_packing *const fpi,
+                           Field *const field MY_ATTRIBUTE((__unused__)),
+                           uchar *const dst, Rdb_string_reader *const reader,
+                           Rdb_string_reader *const unp_reader);
+
+  static int unpack_unknown(Rdb_field_packing *const fpi, Field *const field,
+                            uchar *const dst, Rdb_string_reader *const reader,
+                            Rdb_string_reader *const unp_reader);
+
+  static int unpack_floating_point(uchar *const dst,
+                                   Rdb_string_reader *const reader,
+                                   const size_t size, const int exp_digit,
+                                   const uchar *const zero_pattern,
+                                   const uchar *const zero_val,
+                                   void (*swap_func)(uchar *, const uchar *));
+
+  static void make_unpack_simple_varchar(
+      const Rdb_collation_codec *const codec, const Field *const field,
+      Rdb_pack_field_context *const pack_ctx);
+
+  static void make_unpack_simple(const Rdb_collation_codec *const codec,
+                                 const Field *const field,
+                                 Rdb_pack_field_context *const pack_ctx);
+
+  static void make_unpack_unknown(
+      const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
+      const Field *const field, Rdb_pack_field_context *const pack_ctx);
+
+  static void make_unpack_unknown_varchar(
+      const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
+      const Field *const field, Rdb_pack_field_context *const pack_ctx);
+
+  static void dummy_make_unpack_info(
+      const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
+      const Field *field MY_ATTRIBUTE((__unused__)),
+      Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__)));
+
+  static int skip_max_length(const Rdb_field_packing *const fpi,
+                             const Field *const field
+                                 MY_ATTRIBUTE((__unused__)),
+                             Rdb_string_reader *const reader);
+
+  static int skip_variable_length(const Rdb_field_packing *const fpi,
+                                  const Field *const field,
+                                  Rdb_string_reader *const reader);
+
+  static int skip_variable_space_pad(const Rdb_field_packing *const fpi,
+                                     const Field *const field,
+                                     Rdb_string_reader *const reader);
+
+  inline bool use_legacy_varbinary_format() const {  // legacy iff the min check below fails
+    return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
+                                   SECONDARY_FORMAT_VERSION_UPDATE2);
+  }
+
+  static inline bool is_unpack_data_tag(char c) {
+    return (c == RDB_UNPACK_COVERED_DATA_TAG) || (c == RDB_UNPACK_DATA_TAG);
+  }
+
+ private:
+#ifdef DBUG_OFF
+  inline bool is_storage_available(const int &offset, const int &needed) const {
+    return true;
+  }
+#else
+  inline bool is_storage_available(const int offset, const int needed) const {
+    const int remaining = static_cast<int>(max_storage_fmt_length()) - offset;
+    return remaining >= needed;
+  }
+#endif  // DBUG_OFF
+
+  /* Global number of this index (used as prefix in StorageFormat) */
+  const uint32 m_index_number;
+
+  uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
+
+  rocksdb::ColumnFamilyHandle *m_cf_handle;
+
+  static void pack_legacy_variable_format(const uchar *src, size_t src_len,
+                                          uchar **dst);
+
+  static void pack_variable_format(const uchar *src, size_t src_len,
+                                   uchar **dst);
+
+  static uint calc_unpack_legacy_variable_format(uchar flag, bool *done);
+
+  static uint calc_unpack_variable_format(uchar flag, bool *done);
+
+ public:
+  uint16_t m_index_dict_version;
+  uchar m_index_type;
+  /* KV format version for the index id */
+  uint16_t m_kv_format_version;
+  /* If true, the column family stores data in the reverse order */
+  bool m_is_reverse_cf;
+
+  /* If true, then column family is created per partition. */
+  bool m_is_per_partition_cf;
+
+  std::string m_name;
+  mutable Rdb_index_stats m_stats;
+
+  /*
+    Bitmap containing information about whether TTL or other special fields
+    are enabled for the given index.
+  */
+  uint32 m_index_flags_bitmap;
+
+  /*
+    How much space in bytes the index flag fields occupy.
+  */
+  uint32 m_total_index_flags_length;
+
+  /*
+    Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
+  */
+  uint32 m_ttl_rec_offset;
+
+  /* Default TTL duration */
+  uint64 m_ttl_duration;
+
+  /* TTL column (if defined by user, otherwise implicit TTL is used) */
+  std::string m_ttl_column;
+
+ private:
+  /* Number of key parts in the primary key */
+  uint m_pk_key_parts;
+
+  /*
+     pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
+     primary key.  Y==-1 means this column is not present in the primary key.
+  */
+  uint *m_pk_part_no;
+
+  /* Array of index-part descriptors. */
+  Rdb_field_packing *m_pack_info;
+
+  uint m_keyno; /* number of this index in the table */
+
+  /*
+    Number of key parts in the index (including "index extension"). This is how
+    many elements are in the m_pack_info array.
+  */
+  uint m_key_parts;
+
+  /*
+    If TTL column is part of the PK, offset of the column within pk.
+    Default is UINT_MAX to denote that TTL col is not part of PK.
+  */
+  uint m_ttl_pk_key_part_offset;
+
+  /*
+    Index of the TTL column in table->s->fields, if it exists.
+    Default is UINT_MAX to denote that it does not exist.
+  */
+  uint m_ttl_field_index;
+
+  /* Prefix extractor for the column family of the key definition */
+  std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
+
+  /* Maximum length of the mem-comparable form. */
+  uint m_maxlength;
+
+  /* mutex to protect setup */
+  mysql_mutex_t m_mutex;
+};
+
+// "Simple" collations (those specified in strings/ctype-simple.c) are simple
+// because their strnxfrm function maps one byte to one byte. However, the
+// mapping is not injective, so the inverse function will take in an extra
+// index parameter containing information to disambiguate what the original
+// character was.
+//
+// The m_enc* members are for encoding. Generally, we want encoding to be:
+//      src -> (dst, idx)
+//
+// Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
+// idx.
+//
+// For the inverse, we have:
+//      (dst, idx) -> src
+//
+// We have m_dec_idx[idx][dst] = src to get our original character back.
+//
+struct Rdb_collation_codec {
+  const my_core::CHARSET_INFO *m_cs;  // presumably the collation served — verify
+  // The first element unpacks VARCHAR(n), the second one - CHAR(n).
+  std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
+  std::array<rdb_index_field_unpack_t, 2> m_unpack_func;
+
+  std::array<uchar, 256> m_enc_idx;  // encoding side: m_enc_idx[src] -> idx
+  std::array<uchar, 256> m_enc_size;
+
+  std::array<uchar, 256> m_dec_size;
+  std::vector<std::array<uchar, 256>> m_dec_idx;  // m_dec_idx[idx][dst] -> src
+};
+
+extern mysql_mutex_t rdb_collation_data_mutex;  // presumably guards rdb_collation_data — verify
+extern mysql_mutex_t rdb_mem_cmp_space_mutex;
+extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
+    rdb_collation_data;  // codec pointers; NOTE(review): looks indexed by charset number — confirm
+
+class Rdb_field_packing {  // one index-part descriptor (Rdb_key_def::m_pack_info)
+ public:
+  Rdb_field_packing(const Rdb_field_packing &) = delete;
+  Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
+  Rdb_field_packing() = default;
+
+  /* Length of mem-comparable image of the field, in bytes */
+  int m_max_image_len;
+
+  /* Length of image in the unpack data */
+  int m_unpack_data_len;
+  int m_unpack_data_offset;
+
+  bool m_maybe_null; /* TRUE <=> NULL-byte is stored */
+
+  /*
+    Valid only for VARCHAR fields.
+  */
+  const CHARSET_INFO *m_varchar_charset;
+  bool m_use_legacy_varbinary_format;
+
+  // (Valid when Variable Length Space Padded Encoding is used):
+  uint m_segment_size;  // size of segment used
+
+  // number of bytes used to store number of trimmed (or added)
+  // spaces in the unpack_info
+  bool m_unpack_info_uses_two_bytes;
+
+  /*
+    True implies that an index-only read is always possible for this field.
+    False means an index-only read may be possible depending on the record and
+    field type.
+  */
+  bool m_covered;
+
+  const std::vector<uchar> *space_xfrm;
+  size_t space_xfrm_len;
+  size_t space_mb_len;
+
+  const Rdb_collation_codec *m_charset_codec;
+
+  /*
+    @return TRUE: this field makes use of unpack_info, i.e. it has a
+    make-unpack-info function set.
+  */
+  bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
+
+  /* TRUE means unpack_info stores the original field value */
+  bool m_unpack_info_stores_value;
+
+  rdb_index_field_pack_t m_pack_func;
+  rdb_make_unpack_info_t m_make_unpack_info_func;
+
+  /*
+    This function takes
+    - mem-comparable form
+    - unpack_info data
+    and restores the original value.
+  */
+  rdb_index_field_unpack_t m_unpack_func;
+
+  /*
+    This function skips over mem-comparable form.
+  */
+  rdb_index_field_skip_t m_skip_func;
+
+ private:
+  /*
+    Location of the field in the table (key number and key part number).
+
+    Note that this describes not the field, but rather a position of field in
+    the index. Consider an example:
+
+      col1 VARCHAR (100),
+      INDEX idx1 (col1),
+      INDEX idx2 (col1(10)),
+
+    Here, idx2 has a special Field object that is set to describe a 10-char
+    prefix of col1.
+
+    We must also store the keynr. It is needed for implicit "extended keys".
+    Every key in MyRocks needs to include PK columns.  Generally, SQL layer
+    includes PK columns as part of its "Extended Keys" feature, but sometimes
+    it does not (known examples are unique secondary indexes and partitioned
+    tables).
+    In that case, MyRocks's index descriptor has invisible suffix of PK
+    columns (and the point is that these columns are parts of PK, not parts
+    of the current index).
+  */
+  uint m_keynr;
+  uint m_key_part;
+
+ public:
+  bool setup(const Rdb_key_def *const key_descr, const Field *const field,
+             const uint keynr_arg, const uint key_part_arg,
+             const uint16 key_length);
+  Field *get_field_in_table(const TABLE *const tbl) const;
+  void fill_hidden_pk_val(uchar **dst, const longlong hidden_pk_id) const;
+};
+
+/*
+  Descriptor telling how to decode/encode a field to on-disk record storage
+  format. Not all information is in the structure yet, but eventually we
+  want to have as much as possible there to avoid virtual calls.
+
+  For encoding/decoding of index tuples, see Rdb_key_def.
+  */
+class Rdb_field_encoder {
+ public:
+  Rdb_field_encoder(const Rdb_field_encoder &) = delete;
+  Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
+  /*
+    How a column relates to its mem-comparable form:
+    STORE_NONE - the mem-comparable form alone is enough to decode it.
+    STORE_SOME - the mem-comparable form plus unpack_info is enough to
+    decode it.
+    STORE_ALL  - the column cannot be decoded, so its original value
+    must be stored in the PK records.
+    */
+  enum STORAGE_TYPE {
+    STORE_NONE,
+    STORE_SOME,
+    STORE_ALL,
+  };
+  STORAGE_TYPE m_storage_type;
+
+  uint m_null_offset;
+  uint16 m_field_index;
+
+  uchar m_null_mask;  // zero <=> the field cannot be null
+
+  my_core::enum_field_types m_field_type;
+
+  uint m_pack_length_in_rec;
+  /* A field may be NULL exactly when its null mask is non-zero. */
+  bool maybe_null() const { return 0 != m_null_mask; }
+  /* Only BLOB and VARCHAR field types use the variable-length encoding. */
+  bool uses_variable_len_encoding() const {
+    const bool is_blob = (m_field_type == MYSQL_TYPE_BLOB);
+    return is_blob || m_field_type == MYSQL_TYPE_VARCHAR;
+  }
+};
+
+inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
+                                                       uint part_no) const {
+  DBUG_ASSERT(get_key_parts() > part_no);  // part_no must name a key part
+  return (m_pack_info + part_no)->get_field_in_table(table);
+}
+
+inline bool Rdb_key_def::can_unpack(const uint kp) const {
+  DBUG_ASSERT(m_key_parts > kp);  // kp must index into m_pack_info
+  return nullptr != (m_pack_info + kp)->m_unpack_func;
+}
+
+inline bool Rdb_key_def::has_unpack_info(const uint kp) const {
+  DBUG_ASSERT(m_key_parts > kp);  // kp must index into m_pack_info
+  return (m_pack_info + kp)->uses_unpack_info();
+}
+
+/*
+  A table definition. This is an entry in the mapping
+
+    dbname.tablename -> {index_nr, index_nr, ... }
+
+  There is only one Rdb_tbl_def object for a given table.
+  That's why we keep auto_increment value here, too.
+*/
+
+class Rdb_tbl_def {
+ private:
+  void check_if_is_mysql_system_table();
+
+  /* Stores 'dbname.tablename' */
+  std::string m_dbname_tablename;
+
+  /* Store the db name, table name, and partition name */
+  std::string m_dbname;
+  std::string m_tablename;
+  std::string m_partition;
+
+  void set_name(const std::string &name);
+
+ public:
+  Rdb_tbl_def(const Rdb_tbl_def &) = delete;
+  Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
+
+  /*
+    Primary constructor. The two overloads below delegate to it, so the
+    member initializer list exists in exactly one place.
+  */
+  explicit Rdb_tbl_def(const std::string &name)
+      : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
+        m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
+    set_name(name);
+  }
+
+  /* The name is the `len` bytes starting at `name`. */
+  Rdb_tbl_def(const char *const name, const size_t len)
+      : Rdb_tbl_def(std::string(name, len)) {}
+
+  /* The name is the content of `slice` from byte offset `pos` onwards. */
+  explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t pos = 0)
+      : Rdb_tbl_def(std::string(slice.data() + pos, slice.size() - pos)) {}
+
+  ~Rdb_tbl_def();
+
+  void check_and_set_read_free_rpl_table();
+
+  /* Number of indexes (starts at zero instead of an indeterminate value) */
+  uint m_key_count = 0;
+
+  /* Array of index descriptors */
+  std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
+
+  std::atomic<longlong> m_hidden_pk_val;
+  std::atomic<ulonglong> m_auto_incr_val;
+
+  /* Is this a system table (false unless set otherwise) */
+  bool m_is_mysql_system_table = false;
+
+  /* Is this table read free repl enabled */
+  std::atomic_bool m_is_read_free_rpl_table{false};
+
+  bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch,
+                const rocksdb::Slice &key);
+
+  const std::string &full_tablename() const { return m_dbname_tablename; }
+  const std::string &base_dbname() const { return m_dbname; }
+  const std::string &base_tablename() const { return m_tablename; }
+  const std::string &base_partition() const { return m_partition; }
+  GL_INDEX_ID get_autoincr_gl_index_id();
+
+  time_t get_create_time();
+  std::atomic<time_t> m_update_time; // in-memory only value
+
+ private:
+  const time_t CREATE_TIME_UNKNOWN= 1;
+  // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
+  // 0 means "no data available"
+  std::atomic<time_t> m_create_time;
+};
+
+/*
+  A thread-safe sequential number generator. Its performance is not a concern
+  hence it is ok to protect it by a mutex.
+*/
+
+class Rdb_seq_generator {
+  uint m_next_number = 0;  // counter; seeded by init()
+
+  mysql_mutex_t m_mutex;  // created in init(), destroyed in cleanup()
+
+ public:
+  Rdb_seq_generator(const Rdb_seq_generator &) = delete;
+  Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
+  Rdb_seq_generator() = default;
+
+  void init(const uint initial_number) {  // seed the counter, create the mutex
+    m_next_number = initial_number;
+    mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
+  }
+
+  uint get_and_update_next_number(Rdb_dict_manager *const dict);
+
+  void cleanup() { mysql_mutex_destroy(&m_mutex); }
+};
+
+interface Rdb_tables_scanner {  // accepted by Rdb_ddl_manager::scan_for_tables()
+  virtual int add_table(Rdb_tbl_def * tdef) = 0;  // presumably called per table — verify
+  virtual ~Rdb_tables_scanner() {} /* Keep the compiler happy */
+};
+
+/*
+  This contains a mapping of
+
+     dbname.table_name -> array{Rdb_key_def}.
+
+  objects are shared among all threads.
+*/
+
+class Rdb_ddl_manager {
+  Rdb_dict_manager *m_dict = nullptr;
+
+  // Contains Rdb_tbl_def elements
+  std::unordered_map<std::string, Rdb_tbl_def *> m_ddl_map;
+
+  // Maps index id to <table_name, index number>
+  std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
+
+  // Maps index id to key definitions not yet committed to data dictionary.
+  // This is mainly used to store key definitions during ALTER TABLE.
+  std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
+      m_index_num_to_uncommitted_keydef;
+  mysql_rwlock_t m_rwlock;  // NOTE(review): presumably guards the maps above — confirm
+
+  Rdb_seq_generator m_sequence;  // backs get_and_update_next_number()
+  // A queue of table stats to write into data dictionary
+  // It is produced by event listener (ie compaction and flush threads)
+  // and consumed by the rocksdb background thread
+  std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
+
+  const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
+
+ public:
+  Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
+  Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
+  Rdb_ddl_manager() {}
+
+  /* Load the data dictionary from on-disk storage */
+  bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
+            const uint32_t validate_tables);
+
+  void cleanup();
+
+  Rdb_tbl_def *find(const std::string &table_name, const bool lock = true);
+  std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
+  void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
+  void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
+                    const std::vector<Rdb_index_stats> &deleted_data =
+                        std::vector<Rdb_index_stats>());
+  void persist_stats(const bool sync = false);
+
+  /* Modify the mapping and write it to on-disk storage */
+  int put_and_write(Rdb_tbl_def *const key_descr,
+                    rocksdb::WriteBatch *const batch);
+  void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
+              const bool lock = true);
+  bool rename(const std::string &from, const std::string &to,
+              rocksdb::WriteBatch *const batch);
+
+  uint get_and_update_next_number(Rdb_dict_manager *const dict) {  // forwards to m_sequence
+    return m_sequence.get_and_update_next_number(dict);
+  }
+
+  const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
+
+  /* Walk the data dictionary */
+  int scan_for_tables(Rdb_tables_scanner *tables_scanner);
+
+  void erase_index_num(const GL_INDEX_ID &gl_index_id);
+  void add_uncommitted_keydefs(
+      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
+  void remove_uncommitted_keydefs(
+      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
+
+ private:
+  /* Put the data into in-memory table (only) */
+  int put(Rdb_tbl_def *const key_descr, const bool lock = true);
+
+  /* Helper functions to be passed to my_core::HASH object */
+  static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
+                                   my_bool not_used MY_ATTRIBUTE((unused)));
+  static void free_hash_elem(void *const data);
+
+  bool validate_schemas();
+
+  bool validate_auto_incr();
+};
+
+/*
+  Writing binlog information into RocksDB at commit(),
+  and retrieving binlog information at crash recovery.
+  commit() and recovery are always executed by at most single client
+  at the same time, so concurrency control is not needed.
+
+  Binlog info is stored in RocksDB as the following.
+   key: BINLOG_INFO_INDEX_NUMBER
+   value: packed single row:
+     binlog_name_length (2 byte form)
+     binlog_name
+     binlog_position (4 byte form)
+     binlog_gtid_length (2 byte form)
+     binlog_gtid
+*/
+class Rdb_binlog_manager {
+ public:
+  Rdb_binlog_manager(const Rdb_binlog_manager &) = delete;
+  Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete;
+  Rdb_binlog_manager() = default;
+
+  bool init(Rdb_dict_manager *const dict);
+  void cleanup();
+  void update(const char *const binlog_name, const my_off_t binlog_pos,
+              rocksdb::WriteBatchBase *const batch);
+  bool read(char *const binlog_name, my_off_t *const binlog_pos,
+            char *const binlog_gtid) const;
+  void update_slave_gtid_info(const uint id, const char *const db,
+                              const char *const gtid,
+                              rocksdb::WriteBatchBase *const write_batch);
+
+ private:
+  Rdb_dict_manager *m_dict = nullptr;
+  Rdb_buf_writer<Rdb_key_def::INDEX_NUMBER_SIZE> m_key_writer;
+  rocksdb::Slice m_key_slice;
+
+  bool unpack_value(const uchar *const value, size_t value_size,
+                    char *const binlog_name,
+                    my_off_t *const binlog_pos, char *const binlog_gtid) const;
+  /* Starts out null, like m_dict, instead of holding an indeterminate value. */
+  std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl{nullptr};
+};
+
+/*
+   Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
+  internal data dictionary.
+   MyRocks stores data dictionary on dedicated system column family
+  named __system__. The system column family is used by MyRocks
+  internally only, and not used by applications.
+
+   Currently MyRocks has the following data dictionary data models.
+
+  1. Table Name => internal index id mappings
+  key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
+  value: version, {cf_id, index_id}*n_indexes_of_the_table
+  version is 2 bytes. cf_id and index_id are 4 bytes.
+
+  2. internal cf_id, index id => index information
+  key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
+  value: version, index_type, kv_format_version, index_flags, ttl_duration
+  index_type is 1 byte, version and kv_format_version are 2 bytes.
+  index_flags is 4 bytes.
+  ttl_duration is 8 bytes.
+
+  3. CF id => CF flags
+  key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
+  value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
+  cf_flags is 4 bytes in total.
+
+  4. Binlog entry (updated at commit)
+  key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
+  value: version, {binlog_name,binlog_pos,binlog_gtid}
+
+  5. Ongoing drop index entry
+  key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
+  value: version
+
+  6. index stats
+  key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
+  value: version, {materialized PropertiesCollector::IndexStats}
+
+  7. maximum index id
+  key: Rdb_key_def::MAX_INDEX_ID(0x7)
+  value: index_id
+  index_id is 4 bytes
+
+  8. Ongoing create index entry
+  key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
+  value: version
+
+  9. auto_increment values
+  key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
+  value: version, {max auto_increment so far}
+  max auto_increment is 8 bytes
+
+  Data dictionary operations are atomic inside RocksDB. For example,
+  when creating a table with two indexes, it is necessary to call Put
+  three times, and these calls have to be atomic. Rdb_dict_manager provides
+  the wrappers begin() and commit() to make such atomic operations easier.
+
+*/
+class Rdb_dict_manager {
+ private:
+  mysql_mutex_t m_mutex;  // taken by lock(), released by unlock()
+  rocksdb::TransactionDB *m_db = nullptr;
+  rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;  // see get_system_cf()
+  /* Utility to put INDEX_INFO and CF_DEFINITION */
+
+  uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
+  rocksdb::Slice m_key_slice_max_index_id;
+
+  static void dump_index_id(uchar *const netbuf,
+                            Rdb_key_def::DATA_DICT_TYPE dict_type,
+                            const GL_INDEX_ID &gl_index_id);
+  template <size_t T>
+  static void dump_index_id(Rdb_buf_writer<T> *buf_writer,
+                            Rdb_key_def::DATA_DICT_TYPE dict_type,
+                            const GL_INDEX_ID &gl_index_id) {
+    // Emits dict_type, cf_id and index_id, in that order, each as a uint32.
+    buf_writer->write_uint32(dict_type);
+    buf_writer->write_uint32(gl_index_id.cf_id);
+    buf_writer->write_uint32(gl_index_id.index_id);
+  }
+  void delete_with_prefix(rocksdb::WriteBatch *const batch,
+                          Rdb_key_def::DATA_DICT_TYPE dict_type,
+                          const GL_INDEX_ID &gl_index_id) const;
+  /* Functions for fast DROP TABLE/INDEX */
+  void resume_drop_indexes() const;
+  void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
+                            const uint32 n_keys,
+                            const char *const log_action) const;
+  void log_start_drop_index(GL_INDEX_ID gl_index_id,
+                            const char *log_action) const;
+
+ public:
+  Rdb_dict_manager(const Rdb_dict_manager &) = delete;
+  Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
+  Rdb_dict_manager() = default;
+
+  bool init(rocksdb::TransactionDB *const rdb_dict,
+            Rdb_cf_manager *const cf_manager);
+
+  inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
+
+  inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
+
+  inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
+
+  inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
+    return m_system_cfh;
+  }
+
+  /* Raw RocksDB operations */
+  std::unique_ptr<rocksdb::WriteBatch> begin() const;
+  int commit(rocksdb::WriteBatch *const batch, const bool sync = true) const;
+  rocksdb::Status get_value(const rocksdb::Slice &key,
+                            std::string *const value) const;
+  void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
+               const rocksdb::Slice &value) const;
+  void delete_key(rocksdb::WriteBatchBase *batch,
+                  const rocksdb::Slice &key) const;
+  rocksdb::Iterator *new_iterator() const;
+
+  /* Internal Index id => CF */
+  void add_or_update_index_cf_mapping(
+      rocksdb::WriteBatch *batch,
+      struct Rdb_index_info *const index_info) const;
+  void delete_index_info(rocksdb::WriteBatch *batch,
+                         const GL_INDEX_ID &index_id) const;
+  bool get_index_info(const GL_INDEX_ID &gl_index_id,
+                      struct Rdb_index_info *const index_info) const;
+
+  /* CF id => CF flags */
+  void add_cf_flags(rocksdb::WriteBatch *const batch, const uint cf_id,
+                    const uint cf_flags) const;
+  bool get_cf_flags(const uint cf_id, uint *const cf_flags) const;
+
+  /* Functions for fast CREATE/DROP TABLE/INDEX */
+  void get_ongoing_index_operation(
+      std::unordered_set<GL_INDEX_ID> *gl_index_ids,
+      Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+  bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
+                                  Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+  void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
+                                     const GL_INDEX_ID &gl_index_id,
+                                     Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+  void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
+                                   const GL_INDEX_ID &gl_index_id,
+                                   Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+  bool is_drop_index_empty() const;
+  void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
+                      const uint32 n_keys,
+                      rocksdb::WriteBatch *const batch) const;
+  void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+                      rocksdb::WriteBatch *const batch) const;
+  void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+                        rocksdb::WriteBatch *const batch) const;
+  void finish_indexes_operation(
+      const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+      Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+  void rollback_ongoing_index_creation() const;
+  /* Fixed-dd_type shorthands for the generic *_index_operation functions. */
+  inline void get_ongoing_drop_indexes(
+      std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
+    get_ongoing_index_operation(gl_index_ids,
+                                Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+  }
+  inline void get_ongoing_create_indexes(
+      std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
+    get_ongoing_index_operation(gl_index_ids,
+                                Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+  }
+  inline void start_drop_index(rocksdb::WriteBatch *wb,
+                               const GL_INDEX_ID &gl_index_id) const {
+    start_ongoing_index_operation(wb, gl_index_id,
+                                  Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+  }
+  inline void start_create_index(rocksdb::WriteBatch *wb,
+                                 const GL_INDEX_ID &gl_index_id) const {
+    start_ongoing_index_operation(wb, gl_index_id,
+                                  Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+  }
+  inline void finish_drop_indexes(
+      const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
+    finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+  }
+  inline void finish_create_indexes(
+      const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
+    finish_indexes_operation(gl_index_ids,
+                             Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+  }
+  inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
+    return is_index_operation_ongoing(gl_index_id,
+                                      Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+  }
+  inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
+    return is_index_operation_ongoing(gl_index_id,
+                                      Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+  }
+
+  bool get_max_index_id(uint32_t *const index_id) const;
+  bool update_max_index_id(rocksdb::WriteBatch *const batch,
+                           const uint32_t index_id) const;
+  void add_stats(rocksdb::WriteBatch *const batch,
+                 const std::vector<Rdb_index_stats> &stats) const;
+  Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
+
+  rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
+                                    const GL_INDEX_ID &gl_index_id,
+                                    ulonglong val,
+                                    bool overwrite = false) const;
+  bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
+                         ulonglong *new_val) const;
+};
+
+struct Rdb_index_info {  // NOTE(review): looks like the INDEX_INFO dictionary entry — confirm
+  GL_INDEX_ID m_gl_index_id;
+  uint16_t m_index_dict_version = 0;
+  uchar m_index_type = 0;
+  uint16_t m_kv_version = 0;
+  uint32 m_index_flags = 0;
+  uint64 m_ttl_duration = 0;
+};
+
+/*
+  @brief
+  Merge Operator for the auto_increment value in the system_cf
+
+  @detail
+  This class implements the rocksdb Merge Operator for auto_increment values
+  that are stored to the data dictionary every transaction.
+
+  The actual Merge function is triggered on compaction, memtable flushes, or
+  when get() is called on the same key.
+
+ */
+class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
+ public:
+  /*
+    Updates the new value associated with a key to be the maximum of the
+    passed in value and the existing value. Aborts the process when the key
+    or either value does not have the expected auto_increment layout.
+
+    @param[IN]  key             data dictionary key being merged
+    @param[IN]  existing_value  existing value for the key; nullptr if none
+    @param[IN]  value           incoming value to merge
+    @param[OUT] new_value       new value after Merge
+    @param[IN]  logger          not used
+  */
+  bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
+             const rocksdb::Slice &value, std::string *new_value,
+             rocksdb::Logger *logger) const override {
+    DBUG_ASSERT(new_value != nullptr);
+
+    if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
+        GetKeyType(key) != Rdb_key_def::AUTO_INC ||
+        value.size() !=
+            RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
+        GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
+      abort();
+    }
+
+    uint64_t merged_value = Deserialize(value);
+
+    if (existing_value != nullptr) {
+      if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
+                                        ROCKSDB_SIZEOF_AUTOINC_VALUE ||
+          GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
+        abort();
+      }
+
+      merged_value = std::max(merged_value, Deserialize(*existing_value));
+    }
+    Serialize(merged_value, new_value);
+    return true;
+  }
+
+  const char *Name() const override { return "Rdb_system_merge_op"; }
+
+ private:
+  /*
+    Serializes the integer data to the new_value buffer or the target buffer
+    the merge operator will update to
+   */
+  void Serialize(const uint64_t data, std::string *new_value) const {
+    uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
+                    ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
+    uchar *ptr = value_buf;
+    /* fill in the auto increment version */
+    rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
+    ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
+    /* fill in the auto increment value */
+    rdb_netbuf_store_uint64(ptr, data);
+    ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
+    new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
+  }
+
+  /*
+    Gets the value of auto_increment type in the data dictionary from the
+    value slice (the 64-bit number that follows the version field)
+
+    @Note Only to be used on data dictionary keys for the auto_increment type
+   */
+  uint64_t Deserialize(const rocksdb::Slice &s) const {
+    return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
+                                RDB_SIZEOF_AUTO_INCREMENT_VERSION);
+  }
+
+  /*
+    Gets the type of the key in the data dictionary. The type is read as a
+    32-bit number and returned at full width, so no high bits are dropped.
+    @Note Only to be used on data dictionary keys for the auto_increment type
+   */
+  uint32_t GetKeyType(const rocksdb::Slice &s) const {
+    return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
+  }
+
+  /*
+    Gets the version of the auto_increment value in the data dictionary.
+
+    @Note Only to be used on data dictionary value for the auto_increment type
+   */
+  uint16_t GetVersion(const rocksdb::Slice &s) const {
+    return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
+  }
+};
+
+bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
+
+}  // namespace myrocks
-- 
cgit v1.2.3