summaryrefslogtreecommitdiffstats
path: root/storage/rocksdb/rdb_datadic.h
diff options
context:
space:
mode:
Diffstat (limited to 'storage/rocksdb/rdb_datadic.h')
-rw-r--r--storage/rocksdb/rdb_datadic.h1639
1 files changed, 1639 insertions, 0 deletions
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
new file mode 100644
index 00000000..e9fc1da7
--- /dev/null
+++ b/storage/rocksdb/rdb_datadic.h
@@ -0,0 +1,1639 @@
+/*
+ Copyright (c) 2012,2013 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
+#pragma once
+
+/* C++ standard header files */
+#include <cstdlib>
+#include <algorithm>
+#include <atomic>
+#include <map>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include <array>
+
+/* C standard header files */
+#ifndef _WIN32
+#include <arpa/inet.h>
+#endif
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+#include "./properties_collector.h"
+#include "./rdb_buff.h"
+#include "./rdb_utils.h"
+
+namespace myrocks {
+
+class Rdb_dict_manager;
+class Rdb_key_def;
+class Rdb_field_packing;
+class Rdb_cf_manager;
+class Rdb_ddl_manager;
+
+const uint32_t GTID_BUF_LEN = 60;
+
+class Rdb_convert_to_record_key_decoder {
+ public:
+ Rdb_convert_to_record_key_decoder() = default;
+ Rdb_convert_to_record_key_decoder(
+ const Rdb_convert_to_record_key_decoder &decoder) = delete;
+ Rdb_convert_to_record_key_decoder &operator=(
+ const Rdb_convert_to_record_key_decoder &decoder) = delete;
+ static int decode(uchar *const buf, uint *offset, Rdb_field_packing *fpi,
+ TABLE *table, Field *field, bool has_unpack_info,
+ Rdb_string_reader *reader,
+ Rdb_string_reader *unpack_reader);
+ static int skip(const Rdb_field_packing *fpi, const Field *field,
+ Rdb_string_reader *reader, Rdb_string_reader *unpack_reader);
+
+ private:
+ static int decode_field(Rdb_field_packing *fpi, Field *field,
+ Rdb_string_reader *reader,
+ const uchar *const default_value,
+ Rdb_string_reader *unpack_reader);
+};
+
+/*
+ @brief
+ Field packing context.
+ The idea is to ensure that a call to rdb_index_field_pack_t function
+ is followed by a call to rdb_make_unpack_info_t.
+
+ @detail
+ For some datatypes, unpack_info is produced as a side effect of
+ rdb_index_field_pack_t function call.
+ For other datatypes, packing is just calling make_sort_key(), while
+ rdb_make_unpack_info_t is a custom function.
+ In order to accommodate both cases, we require both calls to be made and
+ unpack_info is passed as context data between the two.
+*/
+class Rdb_pack_field_context {
+ public:
+ Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
+ Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
+
+ explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
+ : writer(writer_arg) {}
+
+ // NULL means we're not producing unpack_info.
+ Rdb_string_writer *writer;
+};
+
+class Rdb_key_field_iterator {
+ private:
+ Rdb_field_packing *m_pack_info;
+ int m_iter_index;
+ int m_iter_end;
+ TABLE *m_table;
+ Rdb_string_reader *m_reader;
+ Rdb_string_reader *m_unp_reader;
+ uint m_curr_bitmap_pos;
+ const MY_BITMAP *m_covered_bitmap;
+ uchar *m_buf;
+ bool m_has_unpack_info;
+ const Rdb_key_def *m_key_def;
+ bool m_secondary_key;
+ bool m_hidden_pk_exists;
+ bool m_is_hidden_pk;
+ bool m_is_null;
+ Field *m_field;
+ uint m_offset;
+ Rdb_field_packing *m_fpi;
+
+ public:
+ Rdb_key_field_iterator(const Rdb_key_field_iterator &) = delete;
+ Rdb_key_field_iterator &operator=(const Rdb_key_field_iterator &) = delete;
+ Rdb_key_field_iterator(const Rdb_key_def *key_def,
+ Rdb_field_packing *pack_info,
+ Rdb_string_reader *reader,
+ Rdb_string_reader *unp_reader, TABLE *table,
+ bool has_unpack_info, const MY_BITMAP *covered_bitmap,
+ uchar *buf);
+
+ int next();
+ bool has_next();
+ bool get_is_null() const;
+ Field *get_field() const;
+ int get_field_index() const;
+ void *get_dst() const;
+};
+
+struct Rdb_collation_codec;
+struct Rdb_index_info;
+
+/*
+ C-style "virtual table" allowing different handling of packing logic based
+ on the field type. See Rdb_field_packing::setup() implementation.
+ */
+using rdb_make_unpack_info_t = void (*)(const Rdb_collation_codec *codec,
+ const Field *field,
+ Rdb_pack_field_context *pack_ctx);
+using rdb_index_field_unpack_t = int (*)(Rdb_field_packing *fpi, Field *field,
+ uchar *field_ptr,
+ Rdb_string_reader *reader,
+ Rdb_string_reader *unpack_reader);
+using rdb_index_field_skip_t = int (*)(const Rdb_field_packing *fpi,
+ const Field *field,
+ Rdb_string_reader *reader);
+using rdb_index_field_pack_t = void (*)(Rdb_field_packing *fpi, Field *field,
+ uchar *buf, uchar **dst,
+ Rdb_pack_field_context *pack_ctx);
+
+const uint RDB_INVALID_KEY_LEN = uint(-1);
+
+/* How much one checksum occupies when stored in the record */
+const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);
+
+/*
+ How much the checksum data occupies in record, in total.
+ It is storing two checksums plus 1 tag-byte.
+*/
+const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;
+
+/*
+ Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
+ checksums.
+*/
+const char RDB_CHECKSUM_DATA_TAG = 0x01;
+
+/*
+ Unpack data is variable length. The header is 1 tag-byte plus a two byte
+ length field. The length field includes the header as well.
+*/
+const char RDB_UNPACK_DATA_TAG = 0x02;
+const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
+const size_t RDB_UNPACK_HEADER_SIZE =
+ sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;
+
+/*
+ This header format is 1 tag-byte plus a two byte length field plus a two byte
+ covered bitmap. The length field includes the header size.
+*/
+const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
+const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
+const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
+const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
+ sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
+ RDB_COVERED_BITMAP_SIZE;
+
+/*
+ Data dictionary index info field sizes.
+*/
+const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
+const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
+const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
+const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
+const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
+
+// Possible return values for rdb_index_field_unpack_t functions.
+enum {
+ UNPACK_SUCCESS = 0,
+ UNPACK_FAILURE = 1,
+};
+
+/*
+ An object of this class represents information about an index in an SQL
+ table. It provides services to encode and decode index tuples.
+
+ Note: a table (as in, on-disk table) has a single Rdb_key_def object which
+ is shared across multiple TABLE* objects and may be used simultaneously from
+ different threads.
+
+ There are several data encodings:
+
+ === SQL LAYER ===
+ SQL layer uses two encodings:
+
+ - "Table->record format". This is the format that is used for the data in
+ the record buffers, table->record[i]
+
+ - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
+ lookup functions, like handler::index_read_map().
+
+ === Inside RocksDB ===
+ Primary Key is stored as a mapping:
+
+ index_tuple -> StoredRecord
+
+ StoredRecord is in Table->record format, except for blobs, which are stored
+ in-place. See ha_rocksdb::convert_record_to_storage_format for details.
+
+ Secondary indexes are stored as one of two variants:
+
+ index_tuple -> unpack_info
+ index_tuple -> empty_string
+
+ index_tuple here is the form of key that can be compared with memcmp(), aka
+ "mem-comparable form".
+
+ unpack_info is extra data that allows to restore the original value from its
+ mem-comparable form. It is present only if the index supports index-only
+ reads.
+*/
+
+class Rdb_key_def {
+ public:
+ /* Convert a key from KeyTupleFormat to mem-comparable form */
+ uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
+ uchar *const packed_tuple, uchar *const record_buffer,
+ const uchar *const key_tuple,
+ const key_part_map &keypart_map) const;
+
+ uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
+ uchar *tuple, uchar *const packed_tuple,
+ uchar *const pack_buffer,
+ Rdb_string_writer *const unpack_info,
+ uint *const n_null_fields) const;
+ /* Convert a key from Table->record format to mem-comparable form */
+ uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
+ const uchar *const record, uchar *const packed_tuple,
+ Rdb_string_writer *const unpack_info,
+ const bool should_store_row_debug_checksums,
+ const longlong hidden_pk_id = 0, uint n_key_parts = 0,
+ uint *const n_null_fields = nullptr,
+ const char *const ttl_bytes = nullptr) const;
+ /* Pack the hidden primary key into mem-comparable form. */
+ uint pack_hidden_pk(const longlong hidden_pk_id,
+ uchar *const packed_tuple) const;
+ int unpack_record(TABLE *const table, uchar *const buf,
+ const rocksdb::Slice *const packed_key,
+ const rocksdb::Slice *const unpack_info,
+ const bool verify_row_debug_checksums) const;
+
+ static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
+ int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
+ std::size_t *const column_index) const;
+
+ size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
+
+ /* Get the key that is the "infimum" for this index */
+ inline void get_infimum_key(uchar *const key, uint *const size) const {
+ rdb_netbuf_store_index(key, m_index_number);
+ *size = INDEX_NUMBER_SIZE;
+ }
+
+ /* Get the key that is a "supremum" for this index */
+ inline void get_supremum_key(uchar *const key, uint *const size) const {
+ rdb_netbuf_store_index(key, m_index_number + 1);
+ *size = INDEX_NUMBER_SIZE;
+ }
+
+ /*
+ Get the first key that you need to position at to start iterating.
+ Stores into *key a "supremum" or "infimum" key value for the index.
+ @parameters key OUT Big Endian, value is m_index_number or
+ m_index_number + 1
+ @parameters size OUT key size, value is INDEX_NUMBER_SIZE
+ @return Number of bytes in the key that are usable for bloom filter use.
+ */
+ inline int get_first_key(uchar *const key, uint *const size) const {
+ if (m_is_reverse_cf) {
+ get_supremum_key(key, size);
+ /* Find out how many bytes of infimum are the same as m_index_number */
+ uchar unmodified_key[INDEX_NUMBER_SIZE];
+ rdb_netbuf_store_index(unmodified_key, m_index_number);
+ int i;
+ for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
+ if (key[i] != unmodified_key[i]) {
+ break;
+ }
+ }
+ return i;
+ } else {
+ get_infimum_key(key, size);
+ // For infimum key, its value will be m_index_number
+ // Thus return its own size instead.
+ return INDEX_NUMBER_SIZE;
+ }
+ }
+
+ /*
+ The same as get_first_key, but get the key for the last entry in the index
+ @parameters key OUT Big Endian, value is m_index_number or
+ m_index_number + 1
+ @parameters size OUT key size, value is INDEX_NUMBER_SIZE
+
+ @return Number of bytes in the key that are usable for bloom filter use.
+ */
+ inline int get_last_key(uchar *const key, uint *const size) const {
+ if (m_is_reverse_cf) {
+ get_infimum_key(key, size);
+ // For infimum key, its value will be m_index_number
+ // Thus return its own size instead.
+ return INDEX_NUMBER_SIZE;
+ } else {
+ get_supremum_key(key, size);
+ /* Find out how many bytes are the same as m_index_number */
+ uchar unmodified_key[INDEX_NUMBER_SIZE];
+ rdb_netbuf_store_index(unmodified_key, m_index_number);
+ int i;
+ for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
+ if (key[i] != unmodified_key[i]) {
+ break;
+ }
+ }
+ return i;
+ }
+ }
+
+ /* Make a key that is right after the given key. */
+ static int successor(uchar *const packed_tuple, const uint len);
+
+ /* Make a key that is right before the given key. */
+ static int predecessor(uchar *const packed_tuple, const uint len);
+
+ /*
+ This can be used to compare prefixes.
+ if X is a prefix of Y, then we consider that X = Y.
+ */
+ // b describes the lookup key, which can be a prefix of a.
+ // b might be outside of the index_number range, if successor() is called.
+ int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
+ DBUG_ASSERT(covers_key(a));
+
+ return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
+ }
+
+ /* Check if given mem-comparable key belongs to this index */
+ bool covers_key(const rocksdb::Slice &slice) const {
+ if (slice.size() < INDEX_NUMBER_SIZE) return false;
+
+ if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) {
+ return false;
+ }
+
+ return true;
+ }
+
+ void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
+
+ bool covers_lookup(const rocksdb::Slice *const unpack_info,
+ const MY_BITMAP *const map) const;
+
+ inline bool use_covered_bitmap_format() const {
+ return m_index_type == INDEX_TYPE_SECONDARY &&
+ m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
+ }
+
+ /* Indicates that all key parts can be unpacked to cover a secondary lookup */
+ bool can_cover_lookup() const;
+
+ /*
+ Return true if the passed mem-comparable key
+ - is from this index, and
+ - it matches the passed key prefix (the prefix is also in mem-comparable
+ form)
+ */
+ bool value_matches_prefix(const rocksdb::Slice &value,
+ const rocksdb::Slice &prefix) const {
+ return covers_key(value) && !cmp_full_keys(value, prefix);
+ }
+
+ uint32 get_keyno() const { return m_keyno; }
+
+ uint32 get_index_number() const { return m_index_number; }
+
+ GL_INDEX_ID get_gl_index_id() const {
+ const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
+ return gl_index_id;
+ }
+
+ int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
+ const uint part_num) const;
+
+ /* Must only be called for secondary keys: */
+ uint get_primary_key_tuple(const TABLE *const tbl,
+ const Rdb_key_def &pk_descr,
+ const rocksdb::Slice *const key,
+ uchar *const pk_buffer) const;
+
+ uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
+ uchar *sk_buffer, uint *n_null_fields) const;
+
+ /* Return max length of mem-comparable form */
+ uint max_storage_fmt_length() const { return m_maxlength; }
+
+ uint get_key_parts() const { return m_key_parts; }
+
+ uint get_ttl_field_index() const { return m_ttl_field_index; }
+
+ /*
+ Get a field object for key part #part_no
+
+ @detail
+ SQL layer thinks unique secondary indexes and indexes in partitioned
+ tables are not "Extended" with Primary Key columns.
+
+ Internally, we always extend all indexes with PK columns. This function
+ uses our definition of how the index is Extended.
+ */
+ inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
+
+ const std::string &get_name() const { return m_name; }
+
+ const rocksdb::SliceTransform *get_extractor() const {
+ return m_prefix_extractor.get();
+ }
+
+ static size_t get_unpack_header_size(char tag);
+
+ Rdb_key_def &operator=(const Rdb_key_def &) = delete;
+ Rdb_key_def(const Rdb_key_def &k);
+ Rdb_key_def(uint indexnr_arg, uint keyno_arg,
+ rocksdb::ColumnFamilyHandle *cf_handle_arg,
+ uint16_t index_dict_version_arg, uchar index_type_arg,
+ uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
+ bool is_per_partition_cf, const char *name,
+ Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
+ uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
+ ~Rdb_key_def();
+
+ enum {
+ INDEX_NUMBER_SIZE = 4,
+ VERSION_SIZE = 2,
+ CF_NUMBER_SIZE = 4,
+ CF_FLAG_SIZE = 4,
+ PACKED_SIZE = 4, // one int
+ };
+
+ // bit flags for combining bools when writing to disk
+ enum {
+ REVERSE_CF_FLAG = 1,
+ AUTO_CF_FLAG = 2, // Deprecated
+ PER_PARTITION_CF_FLAG = 4,
+ };
+
+ // bit flags which denote myrocks specific fields stored in the record
+ // currently only used for TTL.
+ enum INDEX_FLAG {
+ TTL_FLAG = 1 << 0,
+
+ // MAX_FLAG marks where the actual record starts
+ // This flag always needs to be set to the last index flag enum.
+ MAX_FLAG = TTL_FLAG << 1,
+ };
+
+ // Set of flags to ignore when comparing two CF-s and determining if
+ // they're same.
+ static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
+
+ // Data dictionary types
+ enum DATA_DICT_TYPE {
+ DDL_ENTRY_INDEX_START_NUMBER = 1,
+ INDEX_INFO = 2,
+ CF_DEFINITION = 3,
+ BINLOG_INFO_INDEX_NUMBER = 4,
+ DDL_DROP_INDEX_ONGOING = 5,
+ INDEX_STATISTICS = 6,
+ MAX_INDEX_ID = 7,
+ DDL_CREATE_INDEX_ONGOING = 8,
+ AUTO_INC = 9,
+ // MariaDB: 10 through 12 are already taken in upstream
+ TABLE_VERSION = 20, // MariaDB: table version record
+ END_DICT_INDEX_ID = 255
+ };
+
+ // Data dictionary schema version. Introduce newer versions
+ // if changing schema layout
+ enum {
+ DDL_ENTRY_INDEX_VERSION = 1,
+ CF_DEFINITION_VERSION = 1,
+ BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
+ DDL_DROP_INDEX_ONGOING_VERSION = 1,
+ MAX_INDEX_ID_VERSION = 1,
+ DDL_CREATE_INDEX_ONGOING_VERSION = 1,
+ AUTO_INCREMENT_VERSION = 1,
+ // Version for index stats is stored in IndexStats struct
+ };
+
+ // Index info version. Introduce newer versions when changing the
+ // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
+ // latest version number.
+ enum {
+ INDEX_INFO_VERSION_INITIAL = 1, // Obsolete
+ INDEX_INFO_VERSION_KV_FORMAT,
+ INDEX_INFO_VERSION_GLOBAL_ID,
+ // There is no change to data format in this version, but this version
+ // verifies KV format version, whereas previous versions do not. A version
+ // bump is needed to prevent older binaries from skipping the KV version
+ // check inadvertently.
+ INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
+ // This changes the data format to include a 8 byte TTL duration for tables
+ INDEX_INFO_VERSION_TTL,
+ // This changes the data format to include a bitmap before the TTL duration
+ // which will indicate in the future whether TTL or other special fields
+ // are turned on or off.
+ INDEX_INFO_VERSION_FIELD_FLAGS,
+ // This normally point to the latest (currently it does).
+ INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
+ };
+
+ // MyRocks index types
+ enum {
+ INDEX_TYPE_PRIMARY = 1,
+ INDEX_TYPE_SECONDARY = 2,
+ INDEX_TYPE_HIDDEN_PRIMARY = 3,
+ };
+
+ // Key/Value format version for each index type
+ enum {
+ PRIMARY_FORMAT_VERSION_INITIAL = 10,
+ // This change includes:
+ // - For columns that can be unpacked with unpack_info, PK
+ // stores the unpack_info.
+ // - DECIMAL datatype is no longer stored in the row (because
+ // it can be decoded from its mem-comparable form)
+ // - VARCHAR-columns use endspace-padding.
+ PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
+ // This change includes:
+ // - Binary encoded variable length fields have a new format that avoids
+ // an inefficient where data that was a multiple of 8 bytes in length
+ // had an extra 9 bytes of encoded data.
+ PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
+ // This change includes support for TTL
+ // - This means that when TTL is specified for the table an 8-byte TTL
+ // field is prepended in front of each value.
+ PRIMARY_FORMAT_VERSION_TTL = 13,
+ PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
+
+ SECONDARY_FORMAT_VERSION_INITIAL = 10,
+ // This change the SK format to include unpack_info.
+ SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
+ // This change includes:
+ // - Binary encoded variable length fields have a new format that avoids
+ // an inefficient where data that was a multiple of 8 bytes in length
+ // had an extra 9 bytes of encoded data.
+ SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
+ // This change includes support for TTL
+ // - This means that when TTL is specified for the table an 8-byte TTL
+ // field is prepended in front of each value.
+ SECONDARY_FORMAT_VERSION_TTL = 13,
+ SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
+ // This change includes support for covering SK lookups for varchars. A
+ // 2-byte bitmap is added after the tag-byte to unpack_info only for
+ // records which have covered varchar columns. Currently waiting before
+ // enabling in prod.
+ SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
+ };
+
+ uint setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
+
+ static uint extract_ttl_duration(const TABLE *const table_arg,
+ const Rdb_tbl_def *const tbl_def_arg,
+ uint64 *ttl_duration);
+ static uint extract_ttl_col(const TABLE *const table_arg,
+ const Rdb_tbl_def *const tbl_def_arg,
+ std::string *ttl_column, uint *ttl_field_index,
+ bool skip_checks = false);
+ inline bool has_ttl() const { return m_ttl_duration > 0; }
+
+ static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
+ static uint32 calculate_index_flag_offset(uint32 index_flags,
+ enum INDEX_FLAG flag,
+ uint *const field_length = nullptr);
+ void write_index_flag_field(Rdb_string_writer *const buf,
+ const uchar *const val,
+ enum INDEX_FLAG flag) const;
+
+ static const std::string gen_qualifier_for_table(
+ const char *const qualifier, const std::string &partition_name = "");
+ static const std::string gen_cf_name_qualifier_for_partition(
+ const std::string &s);
+ static const std::string gen_ttl_duration_qualifier_for_partition(
+ const std::string &s);
+ static const std::string gen_ttl_col_qualifier_for_partition(
+ const std::string &s);
+
+ static const std::string parse_comment_for_qualifier(
+ const std::string &comment, const TABLE *const table_arg,
+ const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
+ const char *const qualifier);
+
+ rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; }
+
+ /* Check if keypart #kp can be unpacked from index tuple */
+ inline bool can_unpack(const uint kp) const;
+ /* Check if keypart #kp needs unpack info */
+ inline bool has_unpack_info(const uint kp) const;
+
+ /* Check if given table has a primary key */
+ static bool table_has_hidden_pk(const TABLE *const table);
+
+ void report_checksum_mismatch(const bool is_key, const char *const data,
+ const size_t data_size) const;
+
+ /* Check if index is at least pk_min if it is a PK,
+ or at least sk_min if SK.*/
+ bool index_format_min_check(const int pk_min, const int sk_min) const;
+
+ static void pack_with_make_sort_key(
+ Rdb_field_packing *const fpi, Field *const field,
+ uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
+ Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
+
+ static void pack_with_varchar_encoding(
+ Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
+ Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
+
+ static void pack_with_varchar_space_pad(
+ Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
+ Rdb_pack_field_context *const pack_ctx);
+
+ static int unpack_integer(Rdb_field_packing *const fpi, Field *const field,
+ uchar *const to, Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader
+ MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_double(
+ Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
+ Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+ Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_float(
+ Rdb_field_packing *const fpi,
+ Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+ Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field,
+ uchar *const to, Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader
+ MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_binary_or_utf8_varchar(
+ Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+ Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_binary_or_utf8_varchar_space_pad(
+ Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+ Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
+
+ static int unpack_newdate(
+ Rdb_field_packing *const fpi,
+ Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
+ Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field,
+ uchar *dst, Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader
+ MY_ATTRIBUTE((__unused__)));
+
+ static int unpack_unknown_varchar(Rdb_field_packing *const fpi,
+ Field *const field, uchar *dst,
+ Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader);
+
+ static int unpack_simple_varchar_space_pad(
+ Rdb_field_packing *const fpi, Field *const field, uchar *dst,
+ Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
+
+ static int unpack_simple(Rdb_field_packing *const fpi,
+ Field *const field MY_ATTRIBUTE((__unused__)),
+ uchar *const dst, Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader);
+
+ static int unpack_unknown(Rdb_field_packing *const fpi, Field *const field,
+ uchar *const dst, Rdb_string_reader *const reader,
+ Rdb_string_reader *const unp_reader);
+
+ static int unpack_floating_point(uchar *const dst,
+ Rdb_string_reader *const reader,
+ const size_t size, const int exp_digit,
+ const uchar *const zero_pattern,
+ const uchar *const zero_val,
+ void (*swap_func)(uchar *, const uchar *));
+
+ static void make_unpack_simple_varchar(
+ const Rdb_collation_codec *const codec, const Field *const field,
+ Rdb_pack_field_context *const pack_ctx);
+
+ static void make_unpack_simple(const Rdb_collation_codec *const codec,
+ const Field *const field,
+ Rdb_pack_field_context *const pack_ctx);
+
+ static void make_unpack_unknown(
+ const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
+ const Field *const field, Rdb_pack_field_context *const pack_ctx);
+
+ static void make_unpack_unknown_varchar(
+ const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
+ const Field *const field, Rdb_pack_field_context *const pack_ctx);
+
+ static void dummy_make_unpack_info(
+ const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
+ const Field *field MY_ATTRIBUTE((__unused__)),
+ Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__)));
+
+ static int skip_max_length(const Rdb_field_packing *const fpi,
+ const Field *const field
+ MY_ATTRIBUTE((__unused__)),
+ Rdb_string_reader *const reader);
+
+ static int skip_variable_length(const Rdb_field_packing *const fpi,
+ const Field *const field,
+ Rdb_string_reader *const reader);
+
+ static int skip_variable_space_pad(const Rdb_field_packing *const fpi,
+ const Field *const field,
+ Rdb_string_reader *const reader);
+
+ inline bool use_legacy_varbinary_format() const {
+ return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
+ SECONDARY_FORMAT_VERSION_UPDATE2);
+ }
+
+ static inline bool is_unpack_data_tag(char c) {
+ return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG;
+ }
+
+ private:
+#ifndef DBUG_OFF
+ inline bool is_storage_available(const int offset, const int needed) const {
+ const int storage_length = static_cast<int>(max_storage_fmt_length());
+ return (storage_length - offset) >= needed;
+ }
+#else
+ inline bool is_storage_available(const int &offset, const int &needed) const {
+ return 1;
+ }
+#endif // DBUG_OFF
+
+ /* Global number of this index (used as prefix in StorageFormat) */
+ const uint32 m_index_number;
+
+ uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
+
+ rocksdb::ColumnFamilyHandle *m_cf_handle;
+
+ static void pack_legacy_variable_format(const uchar *src, size_t src_len,
+ uchar **dst);
+
+ static void pack_variable_format(const uchar *src, size_t src_len,
+ uchar **dst);
+
+ static uint calc_unpack_legacy_variable_format(uchar flag, bool *done);
+
+ static uint calc_unpack_variable_format(uchar flag, bool *done);
+
+ public:
+ uint16_t m_index_dict_version;
+ uchar m_index_type;
+ /* KV format version for the index id */
+ uint16_t m_kv_format_version;
+ /* If true, the column family stores data in the reverse order */
+ bool m_is_reverse_cf;
+
+ /* If true, then column family is created per partition. */
+ bool m_is_per_partition_cf;
+
+ std::string m_name;
+ mutable Rdb_index_stats m_stats;
+
+ /*
+ Bitmap containing information about whether TTL or other special fields
+ are enabled for the given index.
+ */
+ uint32 m_index_flags_bitmap;
+
+ /*
+ How much space in bytes the index flag fields occupy.
+ */
+ uint32 m_total_index_flags_length;
+
+ /*
+ Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
+ */
+ uint32 m_ttl_rec_offset;
+
+ /* Default TTL duration */
+ uint64 m_ttl_duration;
+
+ /* TTL column (if defined by user, otherwise implicit TTL is used) */
+ std::string m_ttl_column;
+
+ private:
+ /* Number of key parts in the primary key*/
+ uint m_pk_key_parts;
+
+ /*
+ pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
+ primary key. Y==-1 means this column is not present in the primary key.
+ */
+ uint *m_pk_part_no;
+
+ /* Array of index-part descriptors. */
+ Rdb_field_packing *m_pack_info;
+
+ uint m_keyno; /* number of this index in the table */
+
+ /*
+ Number of key parts in the index (including "index extension"). This is how
+ many elements are in the m_pack_info array.
+ */
+ uint m_key_parts;
+
+ /*
+ If TTL column is part of the PK, offset of the column within pk.
+ Default is UINT_MAX to denote that TTL col is not part of PK.
+ */
+ uint m_ttl_pk_key_part_offset;
+
+ /*
+ Index of the TTL column in table->s->fields, if it exists.
+ Default is UINT_MAX to denote that it does not exist.
+ */
+ uint m_ttl_field_index;
+
+ /* Prefix extractor for the column family of the key definiton */
+ std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
+
+ /* Maximum length of the mem-comparable form. */
+ uint m_maxlength;
+
+ /* mutex to protect setup */
+ mysql_mutex_t m_mutex;
+};
+
+// "Simple" collations (those specified in strings/ctype-simple.c) are simple
+// because their strnxfrm function maps one byte to one byte. However, the
+// mapping is not injective, so the inverse function will take in an extra
+// index parameter containing information to disambiguate what the original
+// character was.
+//
+// The m_enc* members are for encoding. Generally, we want encoding to be:
+// src -> (dst, idx)
+//
+// Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
+// idx.
+//
+// For the inverse, we have:
+// (dst, idx) -> src
+//
+// We have m_dec_idx[idx][dst] = src to get our original character back.
+//
+struct Rdb_collation_codec {
+ const my_core::CHARSET_INFO *m_cs;
+ // The first element unpacks VARCHAR(n), the second one - CHAR(n).
+ std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
+ std::array<rdb_index_field_unpack_t, 2> m_unpack_func;
+
+ std::array<uchar, 256> m_enc_idx;
+ std::array<uchar, 256> m_enc_size;
+
+ std::array<uchar, 256> m_dec_size;
+ std::vector<std::array<uchar, 256>> m_dec_idx;
+};
+
+extern mysql_mutex_t rdb_collation_data_mutex;
+extern mysql_mutex_t rdb_mem_cmp_space_mutex;
+extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
+ rdb_collation_data;
+
+class Rdb_field_packing {
+ public:
+ Rdb_field_packing(const Rdb_field_packing &) = delete;
+ Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
+ Rdb_field_packing() = default;
+
+ /* Length of mem-comparable image of the field, in bytes */
+ int m_max_image_len;
+
+ /* Length of image in the unpack data */
+ int m_unpack_data_len;
+ int m_unpack_data_offset;
+
+ bool m_maybe_null; /* TRUE <=> NULL-byte is stored */
+
+ /*
+ Valid only for VARCHAR fields.
+ */
+ const CHARSET_INFO *m_varchar_charset;
+ bool m_use_legacy_varbinary_format;
+
+ // (Valid when Variable Length Space Padded Encoding is used):
+ uint m_segment_size; // size of segment used
+
+ // number of bytes used to store number of trimmed (or added)
+ // spaces in the upack_info
+ bool m_unpack_info_uses_two_bytes;
+
+ /*
+ True implies that an index-only read is always possible for this field.
+ False means an index-only read may be possible depending on the record and
+ field type.
+ */
+ bool m_covered;
+
+ const std::vector<uchar> *space_xfrm;
+ size_t space_xfrm_len;
+ size_t space_mb_len;
+
+ const Rdb_collation_codec *m_charset_codec;
+
+ /*
+ @return TRUE: this field makes use of unpack_info.
+ */
+ bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
+
+ /* TRUE means unpack_info stores the original field value */
+ bool m_unpack_info_stores_value;
+
+ rdb_index_field_pack_t m_pack_func;
+ rdb_make_unpack_info_t m_make_unpack_info_func;
+
+ /*
+ This function takes
+ - mem-comparable form
+ - unpack_info data
+ and restores the original value.
+ */
+ rdb_index_field_unpack_t m_unpack_func;
+
+ /*
+ This function skips over mem-comparable form.
+ */
+ rdb_index_field_skip_t m_skip_func;
+
+ private:
+ /*
+ Location of the field in the table (key number and key part number).
+
+ Note that this describes not the field, but rather a position of field in
+ the index. Consider an example:
+
+ col1 VARCHAR (100),
+ INDEX idx1 (col1)),
+ INDEX idx2 (col1(10)),
+
+ Here, idx2 has a special Field object that is set to describe a 10-char
+ prefix of col1.
+
+ We must also store the keynr. It is needed for implicit "extended keys".
+ Every key in MyRocks needs to include PK columns. Generally, SQL layer
+ includes PK columns as part of its "Extended Keys" feature, but sometimes
+ it does not (known examples are unique secondary indexes and partitioned
+ tables).
+ In that case, MyRocks's index descriptor has invisible suffix of PK
+ columns (and the point is that these columns are parts of PK, not parts
+ of the current index).
+ */
+ uint m_keynr;
+ uint m_key_part;
+
+ public:
+ bool setup(const Rdb_key_def *const key_descr, const Field *const field,
+ const uint keynr_arg, const uint key_part_arg,
+ const uint16 key_length);
+ Field *get_field_in_table(const TABLE *const tbl) const;
+ void fill_hidden_pk_val(uchar **dst, const longlong hidden_pk_id) const;
+};
+
+/*
+ Descriptor telling how to decode/encode a field to on-disk record storage
+ format. Not all information is in the structure yet, but eventually we
+ want to have as much as possible there to avoid virtual calls.
+
+ For encoding/decoding of index tuples, see Rdb_key_def.
+ */
+class Rdb_field_encoder {
+ public:
+ Rdb_field_encoder(const Rdb_field_encoder &) = delete;
+ Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
+ /*
+ STORE_NONE is set when a column can be decoded solely from their
+ mem-comparable form.
+ STORE_SOME is set when a column can be decoded from their mem-comparable
+ form plus unpack_info.
+ STORE_ALL is set when a column cannot be decoded, so its original value
+ must be stored in the PK records.
+ */
+ enum STORAGE_TYPE {
+ STORE_NONE,
+ STORE_SOME,
+ STORE_ALL,
+ };
+ STORAGE_TYPE m_storage_type;
+
+ uint m_null_offset;
+ uint16 m_field_index;
+
+ uchar m_null_mask; // 0 means the field cannot be null
+
+ my_core::enum_field_types m_field_type;
+
+ uint m_pack_length_in_rec;
+
+ bool maybe_null() const { return m_null_mask != 0; }
+
+ bool uses_variable_len_encoding() const {
+ return (m_field_type == MYSQL_TYPE_BLOB ||
+ m_field_type == MYSQL_TYPE_VARCHAR);
+ }
+};
+
+inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
+ uint part_no) const {
+ DBUG_ASSERT(part_no < get_key_parts());
+ return m_pack_info[part_no].get_field_in_table(table);
+}
+
+inline bool Rdb_key_def::can_unpack(const uint kp) const {
+ DBUG_ASSERT(kp < m_key_parts);
+ return (m_pack_info[kp].m_unpack_func != nullptr);
+}
+
+inline bool Rdb_key_def::has_unpack_info(const uint kp) const {
+ DBUG_ASSERT(kp < m_key_parts);
+ return m_pack_info[kp].uses_unpack_info();
+}
+
+/*
+ A table definition. This is an entry in the mapping
+
+ dbname.tablename -> {index_nr, index_nr, ... }
+
+ There is only one Rdb_tbl_def object for a given table.
+ That's why we keep auto_increment value here, too.
+*/
+
+class Rdb_tbl_def {
+ private:
+ void check_if_is_mysql_system_table();
+
+ /* Stores 'dbname.tablename' */
+ std::string m_dbname_tablename;
+
+ /* Store the db name, table name, and partition name */
+ std::string m_dbname;
+ std::string m_tablename;
+ std::string m_partition;
+
+ void set_name(const std::string &name);
+
+ public:
+ Rdb_tbl_def(const Rdb_tbl_def &) = delete;
+ Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
+
+ explicit Rdb_tbl_def(const std::string &name)
+ : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
+ m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
+ set_name(name);
+ }
+
+ Rdb_tbl_def(const char *const name, const size_t len)
+ : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
+ m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
+ set_name(std::string(name, len));
+ }
+
+ explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t pos = 0)
+ : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
+ m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
+ set_name(std::string(slice.data() + pos, slice.size() - pos));
+ }
+
+ ~Rdb_tbl_def();
+
+ void check_and_set_read_free_rpl_table();
+
+ /* Number of indexes */
+ uint m_key_count;
+
+ /* Array of index descriptors */
+ std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
+
+ std::atomic<longlong> m_hidden_pk_val;
+ std::atomic<ulonglong> m_auto_incr_val;
+
+ /* Is this a system table */
+ bool m_is_mysql_system_table;
+
+ /* Is this table read free repl enabled */
+ std::atomic_bool m_is_read_free_rpl_table{false};
+
+ bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch,
+ const rocksdb::Slice &key);
+
+ const std::string &full_tablename() const { return m_dbname_tablename; }
+ const std::string &base_dbname() const { return m_dbname; }
+ const std::string &base_tablename() const { return m_tablename; }
+ const std::string &base_partition() const { return m_partition; }
+ GL_INDEX_ID get_autoincr_gl_index_id();
+
+ time_t get_create_time();
+ std::atomic<time_t> m_update_time; // in-memory only value
+
+ private:
+ const time_t CREATE_TIME_UNKNOWN= 1;
+ // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
+ // 0 means "no data available"
+ std::atomic<time_t> m_create_time;
+};
+
+/*
+ A thread-safe sequential number generator. Its performance is not a concern
+ hence it is ok to protect it by a mutex.
+*/
+
+class Rdb_seq_generator {
+ uint m_next_number = 0;
+
+ mysql_mutex_t m_mutex;
+
+ public:
+ Rdb_seq_generator(const Rdb_seq_generator &) = delete;
+ Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
+ Rdb_seq_generator() = default;
+
+ void init(const uint initial_number) {
+ mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
+ m_next_number = initial_number;
+ }
+
+ uint get_and_update_next_number(Rdb_dict_manager *const dict);
+
+ void cleanup() { mysql_mutex_destroy(&m_mutex); }
+};
+
+interface Rdb_tables_scanner {
+ virtual int add_table(Rdb_tbl_def * tdef) = 0;
+ virtual ~Rdb_tables_scanner() = default; /* Keep the compiler happy */
+};
+
+/*
+ This contains a mapping of
+
+ dbname.table_name -> array{Rdb_key_def}.
+
+ objects are shared among all threads.
+*/
+
+class Rdb_ddl_manager {
+ Rdb_dict_manager *m_dict = nullptr;
+
+ // Contains Rdb_tbl_def elements
+ std::unordered_map<std::string, Rdb_tbl_def *> m_ddl_map;
+
+ // Maps index id to <table_name, index number>
+ std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
+
+ // Maps index id to key definitons not yet committed to data dictionary.
+ // This is mainly used to store key definitions during ALTER TABLE.
+ std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
+ m_index_num_to_uncommitted_keydef;
+ mysql_rwlock_t m_rwlock;
+
+ Rdb_seq_generator m_sequence;
+ // A queue of table stats to write into data dictionary
+ // It is produced by event listener (ie compaction and flush threads)
+ // and consumed by the rocksdb background thread
+ std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
+
+ const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
+
+ public:
+ Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
+ Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
+ Rdb_ddl_manager() = default;
+
+ /* Load the data dictionary from on-disk storage */
+ bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
+ const uint32_t validate_tables);
+
+ void cleanup();
+
+ Rdb_tbl_def *find(const std::string &table_name, const bool lock = true);
+ std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
+ void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
+ void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
+ const std::vector<Rdb_index_stats> &deleted_data =
+ std::vector<Rdb_index_stats>());
+ void persist_stats(const bool sync = false);
+
+ /* Modify the mapping and write it to on-disk storage */
+ int put_and_write(Rdb_tbl_def *const key_descr,
+ rocksdb::WriteBatch *const batch);
+ void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
+ const bool lock = true);
+ bool rename(const std::string &from, const std::string &to,
+ rocksdb::WriteBatch *const batch);
+
+ uint get_and_update_next_number(Rdb_dict_manager *const dict) {
+ return m_sequence.get_and_update_next_number(dict);
+ }
+
+ const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
+
+ /* Walk the data dictionary */
+ int scan_for_tables(Rdb_tables_scanner *tables_scanner);
+
+ void erase_index_num(const GL_INDEX_ID &gl_index_id);
+ void add_uncommitted_keydefs(
+ const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
+ void remove_uncommitted_keydefs(
+ const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
+
+ private:
+ /* Put the data into in-memory table (only) */
+ int put(Rdb_tbl_def *const key_descr, const bool lock = true);
+
+ /* Helper functions to be passed to my_core::HASH object */
+ static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
+ my_bool not_used MY_ATTRIBUTE((unused)));
+ static void free_hash_elem(void *const data);
+
+ bool validate_schemas();
+
+ bool validate_auto_incr();
+};
+
+/*
+ Writing binlog information into RocksDB at commit(),
+ and retrieving binlog information at crash recovery.
+ commit() and recovery are always executed by at most single client
+ at the same time, so concurrency control is not needed.
+
+ Binlog info is stored in RocksDB as the following.
+ key: BINLOG_INFO_INDEX_NUMBER
+ value: packed single row:
+ binlog_name_length (2 byte form)
+ binlog_name
+ binlog_position (4 byte form)
+ binlog_gtid_length (2 byte form)
+ binlog_gtid
+*/
+class Rdb_binlog_manager {
+ public:
+ Rdb_binlog_manager(const Rdb_binlog_manager &) = delete;
+ Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete;
+ Rdb_binlog_manager() = default;
+
+ bool init(Rdb_dict_manager *const dict);
+ void cleanup();
+ void update(const char *const binlog_name, const my_off_t binlog_pos,
+ rocksdb::WriteBatchBase *const batch);
+ bool read(char *const binlog_name, my_off_t *const binlog_pos,
+ char *const binlog_gtid) const;
+ void update_slave_gtid_info(const uint id, const char *const db,
+ const char *const gtid,
+ rocksdb::WriteBatchBase *const write_batch);
+
+ private:
+ Rdb_dict_manager *m_dict = nullptr;
+ Rdb_buf_writer<Rdb_key_def::INDEX_NUMBER_SIZE> m_key_writer;
+ rocksdb::Slice m_key_slice;
+
+ bool unpack_value(const uchar *const value, size_t value_size,
+ char *const binlog_name,
+ my_off_t *const binlog_pos, char *const binlog_gtid) const;
+
+ std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl;
+};
+
+/*
+ Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
+ internal data dictionary.
+ MyRocks stores data dictionary on dedicated system column family
+ named __system__. The system column family is used by MyRocks
+ internally only, and not used by applications.
+
+ Currently MyRocks has the following data dictionary data models.
+
+ 1. Table Name => internal index id mappings
+ key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
+ value: version, {cf_id, index_id}*n_indexes_of_the_table
+ version is 2 bytes. cf_id and index_id are 4 bytes.
+
+ 2. internal cf_id, index id => index information
+ key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
+ value: version, index_type, kv_format_version, index_flags, ttl_duration
+ index_type is 1 byte, version and kv_format_version are 2 bytes.
+ index_flags is 4 bytes.
+ ttl_duration is 8 bytes.
+
+ 3. CF id => CF flags
+ key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
+ value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
+ cf_flags is 4 bytes in total.
+
+ 4. Binlog entry (updated at commit)
+ key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
+ value: version, {binlog_name,binlog_pos,binlog_gtid}
+
+ 5. Ongoing drop index entry
+ key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
+ value: version
+
+ 6. index stats
+ key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
+ value: version, {materialized PropertiesCollector::IndexStats}
+
+ 7. maximum index id
+ key: Rdb_key_def::MAX_INDEX_ID(0x7)
+ value: index_id
+ index_id is 4 bytes
+
+ 8. Ongoing create index entry
+ key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
+ value: version
+
+ 9. auto_increment values
+ key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
+ value: version, {max auto_increment so far}
+ max auto_increment is 8 bytes
+
+ Data dictionary operations are atomic inside RocksDB. For example,
+ when creating a table with two indexes, it is necessary to call Put
+ three times. They have to be atomic. Rdb_dict_manager has a wrapper function
+ begin() and commit() to make it easier to do atomic operations.
+
+*/
+class Rdb_dict_manager {
+ private:
+ mysql_mutex_t m_mutex;
+ rocksdb::TransactionDB *m_db = nullptr;
+ rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
+ /* Utility to put INDEX_INFO and CF_DEFINITION */
+
+ uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
+ rocksdb::Slice m_key_slice_max_index_id;
+
+ static void dump_index_id(uchar *const netbuf,
+ Rdb_key_def::DATA_DICT_TYPE dict_type,
+ const GL_INDEX_ID &gl_index_id);
+ template <size_t T>
+ static void dump_index_id(Rdb_buf_writer<T> *buf_writer,
+ Rdb_key_def::DATA_DICT_TYPE dict_type,
+ const GL_INDEX_ID &gl_index_id) {
+ buf_writer->write_uint32(dict_type);
+ buf_writer->write_uint32(gl_index_id.cf_id);
+ buf_writer->write_uint32(gl_index_id.index_id);
+ }
+
+ void delete_with_prefix(rocksdb::WriteBatch *const batch,
+ Rdb_key_def::DATA_DICT_TYPE dict_type,
+ const GL_INDEX_ID &gl_index_id) const;
+ /* Functions for fast DROP TABLE/INDEX */
+ void resume_drop_indexes() const;
+ void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
+ const uint32 n_keys,
+ const char *const log_action) const;
+ void log_start_drop_index(GL_INDEX_ID gl_index_id,
+ const char *log_action) const;
+
+ public:
+ Rdb_dict_manager(const Rdb_dict_manager &) = delete;
+ Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
+ Rdb_dict_manager() = default;
+
+ bool init(rocksdb::TransactionDB *const rdb_dict,
+ Rdb_cf_manager *const cf_manager);
+
+ inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
+
+ inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
+
+ inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
+
+ inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
+ return m_system_cfh;
+ }
+
+ /* Raw RocksDB operations */
+ std::unique_ptr<rocksdb::WriteBatch> begin() const;
+ int commit(rocksdb::WriteBatch *const batch, const bool sync = true) const;
+ rocksdb::Status get_value(const rocksdb::Slice &key,
+ std::string *const value) const;
+ void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
+ const rocksdb::Slice &value) const;
+ void delete_key(rocksdb::WriteBatchBase *batch,
+ const rocksdb::Slice &key) const;
+ rocksdb::Iterator *new_iterator() const;
+
+ /* Internal Index id => CF */
+ void add_or_update_index_cf_mapping(
+ rocksdb::WriteBatch *batch,
+ struct Rdb_index_info *const index_info) const;
+ void delete_index_info(rocksdb::WriteBatch *batch,
+ const GL_INDEX_ID &index_id) const;
+ bool get_index_info(const GL_INDEX_ID &gl_index_id,
+ struct Rdb_index_info *const index_info) const;
+
+ /* CF id => CF flags */
+ void add_cf_flags(rocksdb::WriteBatch *const batch, const uint cf_id,
+ const uint cf_flags) const;
+ bool get_cf_flags(const uint cf_id, uint *const cf_flags) const;
+
+ /* Functions for fast CREATE/DROP TABLE/INDEX */
+ void get_ongoing_index_operation(
+ std::unordered_set<GL_INDEX_ID> *gl_index_ids,
+ Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+ bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
+ Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+ void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
+ const GL_INDEX_ID &gl_index_id,
+ Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+ void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
+ const GL_INDEX_ID &gl_index_id,
+ Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+ bool is_drop_index_empty() const;
+ void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
+ const uint32 n_keys,
+ rocksdb::WriteBatch *const batch) const;
+ void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+ rocksdb::WriteBatch *const batch) const;
+ void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+ rocksdb::WriteBatch *const batch) const;
+ void finish_indexes_operation(
+ const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
+ Rdb_key_def::DATA_DICT_TYPE dd_type) const;
+ void rollback_ongoing_index_creation() const;
+
+ inline void get_ongoing_drop_indexes(
+ std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
+ get_ongoing_index_operation(gl_index_ids,
+ Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+ }
+ inline void get_ongoing_create_indexes(
+ std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
+ get_ongoing_index_operation(gl_index_ids,
+ Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+ }
+ inline void start_drop_index(rocksdb::WriteBatch *wb,
+ const GL_INDEX_ID &gl_index_id) const {
+ start_ongoing_index_operation(wb, gl_index_id,
+ Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+ }
+ inline void start_create_index(rocksdb::WriteBatch *wb,
+ const GL_INDEX_ID &gl_index_id) const {
+ start_ongoing_index_operation(wb, gl_index_id,
+ Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+ }
+ inline void finish_drop_indexes(
+ const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
+ finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+ }
+ inline void finish_create_indexes(
+ const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
+ finish_indexes_operation(gl_index_ids,
+ Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+ }
+ inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
+ return is_index_operation_ongoing(gl_index_id,
+ Rdb_key_def::DDL_DROP_INDEX_ONGOING);
+ }
+ inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
+ return is_index_operation_ongoing(gl_index_id,
+ Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
+ }
+
+ bool get_max_index_id(uint32_t *const index_id) const;
+ bool update_max_index_id(rocksdb::WriteBatch *const batch,
+ const uint32_t index_id) const;
+ void add_stats(rocksdb::WriteBatch *const batch,
+ const std::vector<Rdb_index_stats> &stats) const;
+ Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
+
+ rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
+ const GL_INDEX_ID &gl_index_id,
+ ulonglong val,
+ bool overwrite = false) const;
+ bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
+ ulonglong *new_val) const;
+};
+
+struct Rdb_index_info {
+ GL_INDEX_ID m_gl_index_id;
+ uint16_t m_index_dict_version = 0;
+ uchar m_index_type = 0;
+ uint16_t m_kv_version = 0;
+ uint32 m_index_flags = 0;
+ uint64 m_ttl_duration = 0;
+};
+
+/*
+ @brief
+ Merge Operator for the auto_increment value in the system_cf
+
+ @detail
+ This class implements the rocksdb Merge Operator for auto_increment values
+ that are stored to the data dictionary every transaction.
+
+ The actual Merge function is triggered on compaction, memtable flushes, or
+ when get() is called on the same key.
+
+ */
+class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
+ public:
+ /*
+ Updates the new value associated with a key to be the maximum of the
+ passed in value and the existing value.
+
+ @param[IN] key
+ @param[IN] existing_value existing value for a key; nullptr if nonexistent
+ key
+ @param[IN] value
+ @param[OUT] new_value new value after Merge
+ @param[IN] logger
+ */
+ bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
+ const rocksdb::Slice &value, std::string *new_value,
+ rocksdb::Logger *logger) const override {
+ DBUG_ASSERT(new_value != nullptr);
+
+ if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
+ GetKeyType(key) != Rdb_key_def::AUTO_INC ||
+ value.size() !=
+ RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
+ GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
+ abort();
+ }
+
+ uint64_t merged_value = Deserialize(value);
+
+ if (existing_value != nullptr) {
+ if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
+ ROCKSDB_SIZEOF_AUTOINC_VALUE ||
+ GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
+ abort();
+ }
+
+ merged_value = std::max(merged_value, Deserialize(*existing_value));
+ }
+ Serialize(merged_value, new_value);
+ return true;
+ }
+
+ virtual const char *Name() const override { return "Rdb_system_merge_op"; }
+
+ private:
+ /*
+ Serializes the integer data to the new_value buffer or the target buffer
+ the merge operator will update to
+ */
+ void Serialize(const uint64_t data, std::string *new_value) const {
+ uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
+ ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
+ uchar *ptr = value_buf;
+ /* fill in the auto increment version */
+ rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
+ ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
+ /* fill in the auto increment value */
+ rdb_netbuf_store_uint64(ptr, data);
+ ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
+ new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
+ }
+
+ /*
+ Gets the value of auto_increment type in the data dictionary from the
+ value slice
+
+ @Note Only to be used on data dictionary keys for the auto_increment type
+ */
+ uint64_t Deserialize(const rocksdb::Slice &s) const {
+ return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
+ RDB_SIZEOF_AUTO_INCREMENT_VERSION);
+ }
+
+ /*
+ Gets the type of the key of the key in the data dictionary.
+
+ @Note Only to be used on data dictionary keys for the auto_increment type
+ */
+ uint16_t GetKeyType(const rocksdb::Slice &s) const {
+ return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
+ }
+
+ /*
+ Gets the version of the auto_increment value in the data dictionary.
+
+ @Note Only to be used on data dictionary value for the auto_increment type
+ */
+ uint16_t GetVersion(const rocksdb::Slice &s) const {
+ return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
+ }
+};
+
+bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
+
+} // namespace myrocks