Diffstat (limited to 'storage/rocksdb/ha_rocksdb.h')
-rw-r--r-- storage/rocksdb/ha_rocksdb.h | 1070
1 file changed, 1070 insertions(+), 0 deletions(-)
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h new file mode 100644 index 00000000..f847ee25 --- /dev/null +++ b/storage/rocksdb/ha_rocksdb.h @@ -0,0 +1,1070 @@ +/* + Copyright (c) 2012,2013 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ +#pragma once + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +/* C++ standard header files */ +#include <set> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +/* MySQL header files */ +#include "./handler.h" /* handler */ +#include "./my_global.h" /* ulonglong */ +#include "./sql_string.h" + +#ifdef _WIN32 +#undef pthread_key_create +#undef pthread_key_delete +#undef pthread_setspecific +#undef pthread_getspecific +#endif + + +/* RocksDB header files */ +#include "rocksdb/cache.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/statistics.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" + +/* MyRocks header files */ +#include "./rdb_buff.h" +#include "./rdb_comparator.h" +#include "./rdb_global.h" +#include "./rdb_index_merge.h" +#include "./rdb_io_watchdog.h" +#include "./rdb_perf_context.h" +#include "./rdb_sst_info.h" +#include "./rdb_utils.h" +#include "./ut0counter.h" + +#include "rdb_mariadb_server_port.h" + +class Field_blob; +class Field_varstring; +/** + @note MyRocks Coding Conventions: + MyRocks code follows the baseline MySQL coding conventions, available at + http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several + refinements (@see /storage/rocksdb/README file). +*/ + +namespace myrocks { + +class Rdb_converter; +class Rdb_key_def; +class Rdb_tbl_def; +class Rdb_transaction; +class Rdb_transaction_impl; +class Rdb_writebatch_impl; +class Rdb_field_encoder; +/* collations, used in MariaRocks */ +enum collations_used { + COLLATION_UTF8MB4_BIN = 46, + COLLATION_LATIN1_BIN = 47, + COLLATION_UTF16LE_BIN = 55, + COLLATION_UTF32_BIN = 61, + COLLATION_UTF16_BIN = 62, + COLLATION_BINARY = 63, + COLLATION_UTF8_BIN = 83 +}; + +#if 0 // MARIAROCKS_NOT_YET : read-free replication is not supported +extern char *rocksdb_read_free_rpl_tables; +#if defined(HAVE_PSI_INTERFACE) +extern PSI_rwlock_key key_rwlock_read_free_rpl_tables; +#endif +extern Regex_list_handler rdb_read_free_regex_handler; +#endif + +/** + @brief + Rdb_table_handler is a reference-counted structure storing information for + each open table. All the objects are stored in a global hash map. + + //TODO: join this with Rdb_tbl_def ? 
+*/ +struct Rdb_table_handler { + char *m_table_name; + uint m_table_name_length; + int m_ref_count; + atomic_stat<int> m_lock_wait_timeout_counter; + atomic_stat<int> m_deadlock_counter; + + my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock + + /* Stores cumulative table statistics */ + my_io_perf_atomic_t m_io_perf_read; + my_io_perf_atomic_t m_io_perf_write; + Rdb_atomic_perf_counters m_table_perf_context; + + /* Stores cached memtable estimate statistics */ + std::atomic_uint m_mtcache_lock; + uint64_t m_mtcache_count; + uint64_t m_mtcache_size; + uint64_t m_mtcache_last_update; +}; + +} // namespace myrocks + +/* Provide hash function for GL_INDEX_ID so we can include it in sets */ +namespace std { +template <> +struct hash<myrocks::GL_INDEX_ID> { + std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const { + const uint64_t val = + ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id); + return std::hash<uint64_t>()(val); + } +}; +} // namespace std + +namespace myrocks { + +/** + @brief + Class definition for ROCKSDB storage engine plugin handler +*/ + +class ha_rocksdb : public my_core::handler { + my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock + + Rdb_table_handler *m_table_handler; ///< Open table handler + + /* Iterator used for range scans and for full table/index scans */ + rocksdb::Iterator *m_scan_it; + + /* Same as handler::end_key but for start. Reverse-ordered scans need it */ + key_range m_save_start_range; + const key_range *m_start_range; + + /* Whether m_scan_it was created with skip_bloom=true */ + bool m_scan_it_skips_bloom; + + const rocksdb::Snapshot *m_scan_it_snapshot; + + /* Buffers used for upper/lower bounds for m_scan_it. */ + uchar *m_scan_it_lower_bound; + uchar *m_scan_it_upper_bound; + rocksdb::Slice m_scan_it_lower_bound_slice; + rocksdb::Slice m_scan_it_upper_bound_slice; + + Rdb_tbl_def *m_tbl_def; + + /* Primary Key encoder from KeyTupleFormat to StorageFormat */ + std::shared_ptr<Rdb_key_def> m_pk_descr; + + /* Array of index descriptors */ + std::shared_ptr<Rdb_key_def> *m_key_descr_arr; + + bool check_keyread_allowed(uint inx, uint part, bool all_parts) const; + + /* + Number of key parts in PK. This is the same as + table->key_info[table->s->primary_key].keyparts + */ + uint m_pk_key_parts; + + /* + TRUE <=> Primary Key columns can be decoded from the index + */ + mutable bool m_pk_can_be_decoded; + + uchar *m_pk_tuple; /* Buffer for storing PK in KeyTupleFormat */ + uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */ + // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey? + + /* + Temporary buffer for storing the key part of the Key/Value pair + for secondary indexes. + */ + uchar *m_sk_packed_tuple; + + /* + Temporary buffer for storing the end key part of the Key/Value pair. + This is used for range scans only. + */ + uchar *m_end_key_packed_tuple; + + Rdb_string_writer m_sk_tails; + Rdb_string_writer m_pk_unpack_info; + + /* + ha_rocksdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save the + mem-comparable form of the index lookup tuple here. + */ + uchar *m_sk_match_prefix; + uint m_sk_match_length; + + /* Buffer space for the above */ + uchar *m_sk_match_prefix_buf;
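+ /* + Illustrative sketch (not the exact code in ha_rocksdb.cc): because the + stored keys are mem-comparable, scan functions can detect that the + iterator has left the range of matching rows with a plain byte + comparison against this prefix: + + const rocksdb::Slice rowkey = m_scan_it->key(); + if (rowkey.size() < m_sk_match_length || + memcmp(rowkey.data(), m_sk_match_prefix, m_sk_match_length)) + return HA_ERR_END_OF_FILE; // no more rows for this lookup + */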
+ /* Second set of buffers, used by UPDATE. */ + uchar *m_sk_packed_tuple_old; + Rdb_string_writer m_sk_tails_old; + + /* Buffers used for duplicate checking during unique index creation */ + uchar *m_dup_sk_packed_tuple; + uchar *m_dup_sk_packed_tuple_old; + + /* + Temporary space for packing VARCHARs (we provide it to + pack_record()/pack_index_tuple() calls). + */ + uchar *m_pack_buffer; + + /* + A buffer long enough to store a table record + */ + uchar *m_record_buffer; + + /* Class to convert between MySQL format and RocksDB format */ + std::shared_ptr<Rdb_converter> m_converter; + + /* + Pointer to the original TTL timestamp value (8 bytes) during UPDATE. + */ + char *m_ttl_bytes; + /* + The TTL timestamp value can change if the explicit TTL column is + updated. If we detect this when updating the PK, we indicate it here so + we know we must always update any SKs. + */ + bool m_ttl_bytes_updated; + + /* Rowkey of the last record we've read, in StorageFormat. */ + String m_last_rowkey; + + /* + Last retrieved record, in table->record[0] data format. + + This is used only when we get the record with rocksdb's Get() call (the + other option is getting a rocksdb::Slice from an iterator). + */ + rocksdb::PinnableSlice m_retrieved_record; + + /* Type of locking to apply to rows */ + enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows; + + /* TRUE means we're doing an index-only read. FALSE means otherwise. */ + bool m_keyread_only; + + bool m_skip_scan_it_next_call; + + /* TRUE means we are accessing the first row after a snapshot was created */ + bool m_rnd_scan_is_new_snapshot; + + /* + TRUE means we should skip unique key checks for this table if the + replication lag gets too large + */ + bool m_skip_unique_check; + + /* + TRUE means INSERT ON DUPLICATE KEY UPDATE. In that case we can optimize by + remembering a failed uniqueness check in write_row, so that the following + index_read can skip the lock check and the read entirely. + */ + bool m_insert_with_update; + + /* TRUE if the last insertion failed due to a duplicated PK */ + bool m_dup_pk_found; + +#ifndef DBUG_OFF + /* Last retrieved record, for sanity checking */ + String m_dup_pk_retrieved_record; +#endif + + /** + @brief + This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may + be changed by this statement. Indexes that are not in the bitmap do not need + to be updated. + @note Valid inside UPDATE statements, iff m_update_scope_is_valid == true.
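+ The bitmap is filled in by calc_updated_indexes() (declared further + below), based on which columns the statement updates.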
+ */ + my_core::key_map m_update_scope; + bool m_update_scope_is_valid; + + /* SST information used for bulk loading the primary key */ + std::shared_ptr<Rdb_sst_info> m_sst_info; + + /* + MySQL index number for duplicate key error + */ + uint m_dupp_errkey; + + int create_key_defs(const TABLE *const table_arg, + Rdb_tbl_def *const tbl_def_arg, + const TABLE *const old_table_arg = nullptr, + const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const + MY_ATTRIBUTE((__nonnull__(2, 3), __warn_unused_result__)); + int secondary_index_read(const int keyno, uchar *const buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void setup_iterator_for_rnd_scan(); + bool is_ascending(const Rdb_key_def &keydef, + enum ha_rkey_function find_flag) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void setup_iterator_bounds(const Rdb_key_def &kd, + const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); + bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, + const rocksdb::Slice &eq_cond, + const bool use_all_keys); + bool check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, + const rocksdb::Slice &eq_cond, + const bool use_all_keys, size_t bound_len, + uchar *const lower_bound, + uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); + void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice, + const bool use_all_keys, const uint eq_cond_len) + MY_ATTRIBUTE((__nonnull__)); + void release_scan_iterator(void); + + rocksdb::Status get_for_update( + Rdb_transaction *const tx, + rocksdb::ColumnFamilyHandle *const column_family, + const rocksdb::Slice &key, rocksdb::PinnableSlice *value) const; + + int get_row_by_rowid(uchar *const buf, const char *const rowid, + const uint rowid_size, const bool skip_lookup = false, + const bool skip_ttl_check = true) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int get_row_by_rowid(uchar *const buf, const uchar *const rowid, + const uint rowid_size, const bool skip_lookup = false, + const bool skip_ttl_check = true) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)) { + return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid), + rowid_size, skip_lookup, skip_ttl_check); + } + + void load_auto_incr_value(); + ulonglong load_auto_incr_value_from_index(); + void update_auto_incr_val(ulonglong val); + void update_auto_incr_val_from_field(); + rocksdb::Status get_datadic_auto_incr(Rdb_transaction *const tx, + const GL_INDEX_ID &gl_index_id, + ulonglong *new_val) const; + longlong update_hidden_pk_val(); + int load_hidden_pk_value() MY_ATTRIBUTE((__warn_unused_result__)); + int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + bool can_use_single_delete(const uint index) const + MY_ATTRIBUTE((__warn_unused_result__)); + bool is_blind_delete_enabled(); + bool skip_unique_check() const MY_ATTRIBUTE((__warn_unused_result__)); +#ifdef MARIAROCKS_NOT_YET // MDEV-10975 + void set_force_skip_unique_check(bool skip) override; +#endif + bool commit_in_the_middle() MY_ATTRIBUTE((__warn_unused_result__)); + bool do_bulk_commit(Rdb_transaction *const tx) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + bool has_hidden_pk(const TABLE *const table) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + void update_row_stats(const operation_type &type); + + void 
set_last_rowkey(const uchar *const old_data); + + /* + For the active index, indicates which columns must be covered for the + current lookup to be covered. If the bitmap field is null, that means this + index does not cover the current lookup for any record. + */ + MY_BITMAP m_lookup_bitmap = {nullptr, nullptr, 0, 0}; + + int alloc_key_buffers(const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg, + bool alloc_alter_buffers = false) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void free_key_buffers(); + + // The buffer size should be at least 2 * Rdb_key_def::INDEX_NUMBER_SIZE + rocksdb::Range get_range(const int i, uchar buf[2 * 4]) const; + + /* + Perf timers for data reads + */ + Rdb_io_perf m_io_perf; + + /* + Update stats + */ + void update_stats(void); + + public: + /* + The following two are currently only used for getting the range bounds + from QUICK_SELECT_DESC. + We don't need to implement prepare_index_key_scan[_map] because it is + only used with HA_READ_KEY_EXACT and HA_READ_PREFIX_LAST, where one can + infer the bounds of the range being scanned anyway. + */ + int prepare_index_scan() override; + int prepare_range_scan(const key_range *start_key, + const key_range *end_key) override; + + /* + Controls whether writes include checksums. This is updated from the + session variable at the start of each query. + */ + bool m_store_row_debug_checksums; + + int m_checksums_pct; + + ha_rocksdb(my_core::handlerton *const hton, + my_core::TABLE_SHARE *const table_arg); + virtual ~ha_rocksdb() override { + int err MY_ATTRIBUTE((__unused__)); + err = finalize_bulk_load(false); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Error %d finalizing bulk load while closing " + "handler.", + err); + } + } + + /* + MariaDB: this function: + + const char *table_type() const + + is non-virtual in class handler, so there is no point in overriding it. + */ + + /* The following is only used by SHOW KEYS: */ + const char *index_type(uint inx) override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN("LSMTREE"); + } + + /* + Not present in MariaDB: + const char **bas_ext() const override; + */ + + /* + Returns the table's base name + */ + const std::string &get_table_basename() const; + + /** @brief + This is a list of flags that indicate what functionality the storage engine + implements. The current table flags are documented in handler.h + */ + ulonglong table_flags() const override; +private: + bool init_with_fields(); /* no 'override' in MariaDB */ +public: + /** @brief + This is a bitmap of flags that indicates how the storage engine + implements indexes. The current index flags are documented in + handler.h. If you do not implement indexes, just return zero here. + + @details + part is the key part to check. First key part is 0. + If all_parts is set, MySQL wants to know the flags for the combined + index, up to and including 'part'.
+ */ + ulong index_flags(uint inx, uint part, bool all_parts) const override; + + const key_map *keys_to_use_for_scanning() override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(&key_map_full); + } + + bool should_store_row_debug_checksums() const { + return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct); + } + + int rename_table(const char *const from, const char *const to) override + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int convert_record_from_storage_format(const rocksdb::Slice *const key, + const rocksdb::Slice *const value, + uchar *const buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int convert_record_from_storage_format(const rocksdb::Slice *const key, + uchar *const buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static const std::vector<std::string> parse_into_tokens(const std::string &s, + const char delim); + + static const std::string generate_cf_name( + const uint index, const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found); + + static const char *get_key_name(const uint index, + const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static const char *get_key_comment(const uint index, + const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static const std::string get_table_comment(const TABLE *const table_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static bool is_hidden_pk(const uint index, const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static uint pk_index(const TABLE *const table_arg, + const Rdb_tbl_def *const tbl_def_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + static bool is_pk(const uint index, const TABLE *table_arg, + const Rdb_tbl_def *tbl_def_arg) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + /** @brief + unireg.cc will call max_supported_record_length(), max_supported_keys(), + max_supported_key_parts(), and max_supported_key_length() to make sure + that the storage engine can handle the data it is about to send. Return + *real* limits of your storage engine here; MySQL will do + min(your_limits, MySQL_limits) automatically. + */ + uint max_supported_record_length() const override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(HA_MAX_REC_LENGTH); + } + + uint max_supported_keys() const override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(MAX_INDEXES); + } + + uint max_supported_key_parts() const override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(MAX_REF_PARTS); + } + + uint max_supported_key_part_length() const override; + + /** @brief + unireg.cc will call this to make sure that the storage engine can handle + the data it is about to send. Return *real* limits of your storage engine + here; MySQL will do min(your_limits, MySQL_limits) automatically. + + @details + There is no need to implement ..._key_... methods if your engine doesn't + support indexes. + */ + uint max_supported_key_length() const override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(16 * 1024); /* just to return something */ + } + + /** + TODO: return actual upper bound of number of records in the table. + (e.g. 
save number of records seen on full table scan and/or use file size + as upper bound) + */ + ha_rows estimate_rows_upper_bound() override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN(HA_POS_ERROR); + } + + /* At the moment, we're ok with default handler::index_init() implementation. + */ + int index_read_map(uchar *const buf, const uchar *const key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) override + MY_ATTRIBUTE((__warn_unused_result__)); + + int index_read_map_impl(uchar *const buf, const uchar *const key, + key_part_map keypart_map, + enum ha_rkey_function find_flag, + const key_range *end_key) + MY_ATTRIBUTE((__warn_unused_result__)); + + bool is_using_full_key(key_part_map keypart_map, uint actual_key_parts); + int read_range_first(const key_range *const start_key, + const key_range *const end_key, bool eq_range, + bool sorted) override + MY_ATTRIBUTE((__warn_unused_result__)); + + virtual double scan_time() override { + DBUG_ENTER_FUNC(); + + DBUG_RETURN( + static_cast<double>((stats.records + stats.deleted) / 20.0 + 10)); + } + + virtual double read_time(uint, uint, ha_rows rows) override; + virtual void print_error(int error, myf errflag) override; + + int open(const char *const name, int mode, uint test_if_locked) override + MY_ATTRIBUTE((__warn_unused_result__)); + int close(void) override MY_ATTRIBUTE((__warn_unused_result__)); + + int write_row(const uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + int update_row(const uchar *const old_data, const uchar *const new_data) override + MY_ATTRIBUTE((__warn_unused_result__)); + int delete_row(const uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx, + rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Slice &key) + MY_ATTRIBUTE((__warn_unused_result__)); + + int index_next(uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + int index_next_with_direction(uchar *const buf, bool move_forward) + MY_ATTRIBUTE((__warn_unused_result__)); + int index_prev(uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + + int index_first(uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + int index_last(uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + + class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override; + /* + Default implementation from cancel_pushed_idx_cond() suits us + */ + private: + struct key_def_cf_info { + rocksdb::ColumnFamilyHandle *cf_handle; + bool is_reverse_cf; + bool is_per_partition_cf; + }; + + struct update_row_info { + Rdb_transaction *tx; + const uchar *new_data; + const uchar *old_data; + rocksdb::Slice new_pk_slice; + rocksdb::Slice old_pk_slice; + rocksdb::Slice old_pk_rec; + + // "unpack_info" data for the new PK value + Rdb_string_writer *new_pk_unpack_info; + + longlong hidden_pk_id; + bool skip_unique_check; + }; + + /* + Used to check for duplicate entries during fast unique secondary index + creation. + */ + struct unique_sk_buf_info { + bool sk_buf_switch = false; + rocksdb::Slice sk_memcmp_key; + rocksdb::Slice sk_memcmp_key_old; + uchar *dup_sk_buf; + uchar *dup_sk_buf_old; + + /* + This method is meant to be called back to back during inplace creation + of unique indexes. It will switch between two buffers, which + will each store the memcmp form of secondary keys, which are then + converted to slices in sk_memcmp_key or sk_memcmp_key_old. 
+ + Switching buffers on each iteration allows us to retain the + sk_memcmp_key_old value for duplicate comparison. + */ + inline uchar *swap_and_get_sk_buf() { + sk_buf_switch = !sk_buf_switch; + return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old; + } + }; + + int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, + std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) + const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int create_key_def(const TABLE *const table_arg, const uint i, + const Rdb_tbl_def *const tbl_def_arg, + std::shared_ptr<Rdb_key_def> *const new_key_def, + const struct key_def_cf_info &cf_info, uint64 ttl_duration, + const std::string &ttl_column) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int create_inplace_key_defs( + const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg, + const TABLE *const old_table_arg, + const Rdb_tbl_def *const old_tbl_def_arg, + const std::array<key_def_cf_info, MAX_INDEXES + 1> &cf, + uint64 ttl_duration, const std::string &ttl_column) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + std::unordered_map<std::string, uint> get_old_key_positions( + const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg, + const TABLE *old_table_arg, const Rdb_tbl_def *old_tbl_def_arg) const + MY_ATTRIBUTE((__nonnull__)); + + using handler::compare_key_parts; + int compare_key_parts(const KEY *const old_key, + const KEY *const new_key) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int compare_keys(const KEY *const old_key, const KEY *const new_key) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + bool should_hide_ttl_rec(const Rdb_key_def &kd, + const rocksdb::Slice &ttl_rec_val, + const int64_t curr_ts) + MY_ATTRIBUTE((__warn_unused_result__)); + int rocksdb_skip_expired_records(const Rdb_key_def &kd, + rocksdb::Iterator *const iter, + bool seek_backward); + + int index_first_intern(uchar *buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int index_last_intern(uchar *buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int find_icp_matching_index_rec(const bool move_forward, uchar *const buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + void calc_updated_indexes(); + int update_write_row(const uchar *const old_data, const uchar *const new_data, + const bool skip_unique_check) + MY_ATTRIBUTE((__warn_unused_result__)); + int get_pk_for_update(struct update_row_info *const row_info); + int check_and_lock_unique_pk(const uint key_id, + const struct update_row_info &row_info, + bool *const found) + MY_ATTRIBUTE((__warn_unused_result__)); + int check_and_lock_sk(const uint key_id, + const struct update_row_info &row_info, + bool *const found) + MY_ATTRIBUTE((__warn_unused_result__)); + int check_uniqueness_and_lock(const struct update_row_info &row_info, + bool pk_changed) + MY_ATTRIBUTE((__warn_unused_result__)); + bool over_bulk_load_threshold(int *err) + MY_ATTRIBUTE((__warn_unused_result__)); + int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &key_def, + const rocksdb::Slice *key, + struct unique_sk_buf_info *sk_info) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd, + const rocksdb::Slice &key, const rocksdb::Slice &value, + bool sort) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void update_bytes_written(ulonglong bytes_written); + int update_write_pk(const Rdb_key_def &kd, + const struct update_row_info 
&row_info, + const bool pk_changed) + MY_ATTRIBUTE((__warn_unused_result__)); + int update_write_sk(const TABLE *const table_arg, const Rdb_key_def &kd, + const struct update_row_info &row_info, + const bool bulk_load_sk) + MY_ATTRIBUTE((__warn_unused_result__)); + int update_write_indexes(const struct update_row_info &row_info, + const bool pk_changed) + MY_ATTRIBUTE((__warn_unused_result__)); + + int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter, + const bool using_full_key, const rocksdb::Slice &key_slice, + const int64_t ttl_filter_ts) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int read_before_key(const Rdb_key_def &kd, const bool using_full_key, + const rocksdb::Slice &key_slice, + const int64_t ttl_filter_ts) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice, + const int64_t ttl_filter_ts) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int position_to_correct_key(const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const bool full_key_match, const uchar *const key, + const key_part_map &keypart_map, + const rocksdb::Slice &key_slice, + bool *const move_forward, + const int64_t ttl_filter_ts) + MY_ATTRIBUTE((__warn_unused_result__)); + + int read_row_from_primary_key(uchar *const buf) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd, + bool move_forward) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int calc_eq_cond_len(const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const rocksdb::Slice &slice, + const int bytes_changed_by_succ, + const key_range *const end_key, + uint *const end_key_packed_size) + MY_ATTRIBUTE((__warn_unused_result__)); + + Rdb_tbl_def *get_table_if_exists(const char *const tablename) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void read_thd_vars(THD *const thd) MY_ATTRIBUTE((__nonnull__)); + + bool contains_foreign_key(THD *const thd) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int inplace_populate_sk( + TABLE *const table_arg, + const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + + int finalize_bulk_load(bool print_client_error = true) + MY_ATTRIBUTE((__warn_unused_result__)); + + int calculate_stats_for_table() MY_ATTRIBUTE((__warn_unused_result__)); + + bool should_skip_invalidated_record(const int rc); + bool should_recreate_snapshot(const int rc, const bool is_new_snapshot); + bool can_assume_tracked(THD *thd); + + public: + int index_init(uint idx, bool sorted) override + MY_ATTRIBUTE((__warn_unused_result__)); + int index_end() override MY_ATTRIBUTE((__warn_unused_result__)); + + void unlock_row() override; + + /** @brief + Unlike index_init(), rnd_init() can be called twice in a row without + rnd_end() in between (this only makes sense if scan=1). In that case, + the second call should prepare for the new table scan (e.g. if + rnd_init() allocates the cursor, the second call should position the + cursor to the start of the table); there is no need to deallocate and + allocate it again. This is a required method.
+ */ + int rnd_init(bool scan) override MY_ATTRIBUTE((__warn_unused_result__)); + int rnd_end() override MY_ATTRIBUTE((__warn_unused_result__)); + + int rnd_next(uchar *const buf) override + MY_ATTRIBUTE((__warn_unused_result__)); + int rnd_next_with_direction(uchar *const buf, bool move_forward) + MY_ATTRIBUTE((__warn_unused_result__)); + + int rnd_pos(uchar *const buf, uchar *const pos) override + MY_ATTRIBUTE((__warn_unused_result__)); + void position(const uchar *const record) override; + int info(uint) override; + + /* This function will always return success, therefore no annotation related + * to checking the return value. Can't change the signature because it's + * required by the interface. */ + int extra(enum ha_extra_function operation) override; + + int start_stmt(THD *const thd, thr_lock_type lock_type) override + MY_ATTRIBUTE((__warn_unused_result__)); + int external_lock(THD *const thd, int lock_type) override + MY_ATTRIBUTE((__warn_unused_result__)); + int truncate() override MY_ATTRIBUTE((__warn_unused_result__)); + + int reset() override { + DBUG_ENTER_FUNC(); + + /* Free blob data */ + m_retrieved_record.Reset(); + + DBUG_RETURN(HA_EXIT_SUCCESS); + } + + int check(THD *const thd, HA_CHECK_OPT *const check_opt) override + MY_ATTRIBUTE((__warn_unused_result__)); + int remove_rows(Rdb_tbl_def *const tbl); + ha_rows records_in_range(uint inx, + const key_range *const min_key, + const key_range *const max_key, + page_range *pages) override + MY_ATTRIBUTE((__warn_unused_result__)); + + int delete_table(Rdb_tbl_def *const tbl); + int delete_table(const char *const from) override + MY_ATTRIBUTE((__warn_unused_result__)); + int create(const char *const name, TABLE *const form, + HA_CREATE_INFO *const create_info) override + MY_ATTRIBUTE((__warn_unused_result__)); + int create_table(const std::string &table_name, const TABLE *table_arg, + ulonglong auto_increment_value); + bool check_if_incompatible_data(HA_CREATE_INFO *const info, + uint table_changes) override + MY_ATTRIBUTE((__warn_unused_result__)); + + THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to, + enum thr_lock_type lock_type) override + MY_ATTRIBUTE((__warn_unused_result__)); + + my_bool register_query_cache_table(THD *const thd, const char *table_key, + uint key_length, + qc_engine_callback *const engine_callback, + ulonglong *const engine_data) override { + DBUG_ENTER_FUNC(); + + /* Currently, we don't support query cache */ + DBUG_RETURN(FALSE); + } + + bool get_error_message(const int error, String *const buf) override + MY_ATTRIBUTE((__nonnull__)); + + static int rdb_error_to_mysql(const rocksdb::Status &s, + const char *msg = nullptr) + MY_ATTRIBUTE((__warn_unused_result__)); + + void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *const first_value, + ulonglong *const nb_reserved_values) override; + void update_create_info(HA_CREATE_INFO *const create_info) override; + int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override + MY_ATTRIBUTE((__warn_unused_result__)); + int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override + MY_ATTRIBUTE((__warn_unused_result__)); + + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE *altered_table, + my_core::Alter_inplace_info *const ha_alter_info) override; + + bool prepare_inplace_alter_table( + TABLE *const altered_table, + my_core::Alter_inplace_info *const ha_alter_info) override; + + bool inplace_alter_table( + TABLE *const altered_table, + 
my_core::Alter_inplace_info *const ha_alter_info) override; + + bool commit_inplace_alter_table( + TABLE *const altered_table, + my_core::Alter_inplace_info *const ha_alter_info, bool commit) override; + + void set_skip_unique_check_tables(const char *const whitelist); + + virtual ulonglong table_version() const override; + +#ifdef MARIAROCKS_NOT_YET // MDEV-10976 + bool is_read_free_rpl_table() const; +#endif + +#ifdef MARIAROCKS_NOT_YET // MDEV-10976 + public: + virtual void rpl_before_delete_rows() override; + virtual void rpl_after_delete_rows() override; + virtual void rpl_before_update_rows() override; + virtual void rpl_after_update_rows() override; + virtual bool use_read_free_rpl() const override; +#endif // MARIAROCKS_NOT_YET + + private: + /* Flags tracking whether we are inside different replication operations */ + bool m_in_rpl_delete_rows; + bool m_in_rpl_update_rows; + + bool m_force_skip_unique_check; +}; + +/* + Helper class for in-place alter, for storing handler context between + in-place alter calls +*/ +struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx { + /* The new table definition */ + Rdb_tbl_def *const m_new_tdef; + + /* Stores the original key definitions */ + std::shared_ptr<Rdb_key_def> *const m_old_key_descr; + + /* Stores the new key definitions */ + std::shared_ptr<Rdb_key_def> *m_new_key_descr; + + /* Stores the old number of key definitions */ + const uint m_old_n_keys; + + /* Stores the new number of key definitions */ + const uint m_new_n_keys; + + /* Stores the added key definitions */ + const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes; + + /* Stores the dropped keys' GL_INDEX_IDs */ + const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids; + + /* Stores the number of keys to add */ + const uint m_n_added_keys; + + /* Stores the number of keys to drop */ + const uint m_n_dropped_keys; + + /* Stores the largest current auto increment value in the index */ + const ulonglong m_max_auto_incr; + + Rdb_inplace_alter_ctx( + Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr, + std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys, + uint new_n_keys, + std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes, + std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys, + uint n_dropped_keys, ulonglong max_auto_incr) + : my_core::inplace_alter_handler_ctx(), + m_new_tdef(new_tdef), + m_old_key_descr(old_key_descr), + m_new_key_descr(new_key_descr), + m_old_n_keys(old_n_keys), + m_new_n_keys(new_n_keys), + m_added_indexes(added_indexes), + m_dropped_index_ids(dropped_index_ids), + m_n_added_keys(n_added_keys), + m_n_dropped_keys(n_dropped_keys), + m_max_auto_incr(max_auto_incr) {} + + ~Rdb_inplace_alter_ctx() = default; + + private: + /* Disable copying */ + Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &); + Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &); +}; + +// File name indicating RocksDB data corruption +std::string rdb_corruption_marker_file_name(); + +const int MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL= MariaDB_PLUGIN_MATURITY_STABLE; + +extern uint32_t rocksdb_ignore_datadic_errors; + +void sql_print_verbose_info(const char *format, ...); + +} // namespace myrocks
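
The std::hash<myrocks::GL_INDEX_ID> specialization near the top of the header is what allows GL_INDEX_ID values to serve as keys of std::unordered_set (e.g. m_dropped_index_ids in Rdb_inplace_alter_ctx above). A minimal standalone sketch of the same packing trick; the simplified GL_INDEX_ID stand-in here is an assumption for illustration (the real type is defined in rdb_global.h):

    #include <cstdint>
    #include <functional>
    #include <unordered_set>

    // Stand-in for myrocks::GL_INDEX_ID (assumed shape for this sketch):
    // a column family id plus an index id, globally unique together.
    struct GL_INDEX_ID {
      uint32_t cf_id;
      uint32_t index_id;
      bool operator==(const GL_INDEX_ID &other) const {
        return cf_id == other.cf_id && index_id == other.index_id;
      }
    };

    namespace std {
    template <>
    struct hash<GL_INDEX_ID> {
      size_t operator()(const GL_INDEX_ID &id) const {
        // Same trick as the header: pack both 32-bit ids into one 64-bit
        // value, then reuse the standard hash for uint64_t.
        const uint64_t val =
            ((uint64_t)id.cf_id << 32) | (uint64_t)id.index_id;
        return hash<uint64_t>()(val);
      }
    };
    }  // namespace std

    int main() {
      // With the specialization in place, GL_INDEX_ID works as a set key,
      // the way Rdb_inplace_alter_ctx keeps m_dropped_index_ids.
      std::unordered_set<GL_INDEX_ID> dropped_index_ids;
      dropped_index_ids.insert({4 /* cf_id */, 260 /* index_id */});
      return dropped_index_ids.count({4, 260}) == 1 ? 0 : 1;
    }

Packing (cf_id, index_id) into a single uint64_t and delegating to std::hash<uint64_t> gives a collision-free mapping from id pairs to 64-bit values before hashing, which is why the header can get away with such a short specialization.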