From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/rocksdb/include/rocksdb/advanced_options.h | 1098 ++++++++ .../include/rocksdb/block_cache_trace_writer.h | 149 ++ src/rocksdb/include/rocksdb/c.h | 2793 ++++++++++++++++++++ src/rocksdb/include/rocksdb/cache.h | 775 ++++++ src/rocksdb/include/rocksdb/cache_bench_tool.h | 14 + src/rocksdb/include/rocksdb/cleanable.h | 128 + src/rocksdb/include/rocksdb/compaction_filter.h | 256 ++ src/rocksdb/include/rocksdb/compaction_job_stats.h | 109 + src/rocksdb/include/rocksdb/comparator.h | 164 ++ src/rocksdb/include/rocksdb/compression_type.h | 40 + .../include/rocksdb/concurrent_task_limiter.h | 51 + src/rocksdb/include/rocksdb/configurable.h | 400 +++ src/rocksdb/include/rocksdb/convenience.h | 525 ++++ src/rocksdb/include/rocksdb/customizable.h | 233 ++ src/rocksdb/include/rocksdb/data_structure.h | 51 + src/rocksdb/include/rocksdb/db.h | 1859 +++++++++++++ src/rocksdb/include/rocksdb/db_bench_tool.h | 11 + src/rocksdb/include/rocksdb/db_dump_tool.h | 45 + src/rocksdb/include/rocksdb/db_stress_tool.h | 11 + src/rocksdb/include/rocksdb/env.h | 1893 +++++++++++++ src/rocksdb/include/rocksdb/env_encryption.h | 465 ++++ src/rocksdb/include/rocksdb/experimental.h | 56 + src/rocksdb/include/rocksdb/file_checksum.h | 146 + src/rocksdb/include/rocksdb/file_system.h | 1849 +++++++++++++ src/rocksdb/include/rocksdb/filter_policy.h | 206 ++ src/rocksdb/include/rocksdb/flush_block_policy.h | 75 + src/rocksdb/include/rocksdb/functor_wrapper.h | 56 + src/rocksdb/include/rocksdb/io_status.h | 244 ++ src/rocksdb/include/rocksdb/iostats_context.h | 98 + src/rocksdb/include/rocksdb/iterator.h | 144 + src/rocksdb/include/rocksdb/ldb_tool.h | 44 + src/rocksdb/include/rocksdb/listener.h | 847 ++++++ src/rocksdb/include/rocksdb/memory_allocator.h | 81 + src/rocksdb/include/rocksdb/memtablerep.h | 423 +++ src/rocksdb/include/rocksdb/merge_operator.h | 265 ++ src/rocksdb/include/rocksdb/metadata.h | 245 ++ src/rocksdb/include/rocksdb/options.h | 2113 +++++++++++++++ src/rocksdb/include/rocksdb/perf_context.h | 274 ++ src/rocksdb/include/rocksdb/perf_level.h | 36 + src/rocksdb/include/rocksdb/persistent_cache.h | 74 + src/rocksdb/include/rocksdb/rate_limiter.h | 159 ++ src/rocksdb/include/rocksdb/rocksdb_namespace.h | 16 + src/rocksdb/include/rocksdb/secondary_cache.h | 133 + src/rocksdb/include/rocksdb/slice.h | 264 ++ src/rocksdb/include/rocksdb/slice_transform.h | 135 + src/rocksdb/include/rocksdb/snapshot.h | 53 + src/rocksdb/include/rocksdb/sst_dump_tool.h | 19 + src/rocksdb/include/rocksdb/sst_file_manager.h | 136 + src/rocksdb/include/rocksdb/sst_file_reader.h | 47 + src/rocksdb/include/rocksdb/sst_file_writer.h | 174 ++ src/rocksdb/include/rocksdb/sst_partitioner.h | 142 + src/rocksdb/include/rocksdb/statistics.h | 707 +++++ src/rocksdb/include/rocksdb/stats_history.h | 70 + src/rocksdb/include/rocksdb/status.h | 570 ++++ src/rocksdb/include/rocksdb/system_clock.h | 116 + src/rocksdb/include/rocksdb/table.h | 940 +++++++ src/rocksdb/include/rocksdb/table_properties.h | 327 +++ src/rocksdb/include/rocksdb/table_reader_caller.h | 41 + src/rocksdb/include/rocksdb/thread_status.h | 189 ++ src/rocksdb/include/rocksdb/threadpool.h | 67 + src/rocksdb/include/rocksdb/trace_reader_writer.h | 52 + src/rocksdb/include/rocksdb/trace_record.h | 248 ++ src/rocksdb/include/rocksdb/trace_record_result.h | 187 ++ src/rocksdb/include/rocksdb/transaction_log.h | 122 + src/rocksdb/include/rocksdb/types.h | 66 + src/rocksdb/include/rocksdb/unique_id.h | 55 + src/rocksdb/include/rocksdb/universal_compaction.h | 96 + src/rocksdb/include/rocksdb/utilities/agg_merge.h | 138 + .../include/rocksdb/utilities/backup_engine.h | 631 +++++ .../include/rocksdb/utilities/cache_dump_load.h | 142 + src/rocksdb/include/rocksdb/utilities/checkpoint.h | 66 + .../include/rocksdb/utilities/convenience.h | 10 + .../include/rocksdb/utilities/customizable_util.h | 377 +++ src/rocksdb/include/rocksdb/utilities/db_ttl.h | 72 + src/rocksdb/include/rocksdb/utilities/debug.h | 48 + src/rocksdb/include/rocksdb/utilities/env_mirror.h | 181 ++ .../include/rocksdb/utilities/info_log_finder.h | 19 + src/rocksdb/include/rocksdb/utilities/ldb_cmd.h | 318 +++ .../rocksdb/utilities/ldb_cmd_execute_result.h | 75 + .../include/rocksdb/utilities/leveldb_options.h | 145 + .../utilities/lua/rocks_lua_custom_library.h | 43 + .../include/rocksdb/utilities/lua/rocks_lua_util.h | 55 + .../include/rocksdb/utilities/memory_util.h | 50 + .../include/rocksdb/utilities/object_registry.h | 585 ++++ .../rocksdb/utilities/optimistic_transaction_db.h | 100 + .../rocksdb/utilities/option_change_migration.h | 24 + .../include/rocksdb/utilities/options_type.h | 1221 +++++++++ .../include/rocksdb/utilities/options_util.h | 128 + src/rocksdb/include/rocksdb/utilities/replayer.h | 87 + src/rocksdb/include/rocksdb/utilities/sim_cache.h | 96 + .../include/rocksdb/utilities/stackable_db.h | 566 ++++ .../utilities/table_properties_collectors.h | 90 + .../include/rocksdb/utilities/transaction.h | 686 +++++ .../include/rocksdb/utilities/transaction_db.h | 508 ++++ .../rocksdb/utilities/transaction_db_mutex.h | 91 + .../rocksdb/utilities/write_batch_with_index.h | 309 +++ src/rocksdb/include/rocksdb/version.h | 43 + src/rocksdb/include/rocksdb/wal_filter.h | 111 + src/rocksdb/include/rocksdb/wide_columns.h | 171 ++ src/rocksdb/include/rocksdb/write_batch.h | 494 ++++ src/rocksdb/include/rocksdb/write_batch_base.h | 144 + src/rocksdb/include/rocksdb/write_buffer_manager.h | 176 ++ 102 files changed, 31477 insertions(+) create mode 100644 src/rocksdb/include/rocksdb/advanced_options.h create mode 100644 src/rocksdb/include/rocksdb/block_cache_trace_writer.h create mode 100644 src/rocksdb/include/rocksdb/c.h create mode 100644 src/rocksdb/include/rocksdb/cache.h create mode 100644 src/rocksdb/include/rocksdb/cache_bench_tool.h create mode 100644 src/rocksdb/include/rocksdb/cleanable.h create mode 100644 src/rocksdb/include/rocksdb/compaction_filter.h create mode 100644 src/rocksdb/include/rocksdb/compaction_job_stats.h create mode 100644 src/rocksdb/include/rocksdb/comparator.h create mode 100644 src/rocksdb/include/rocksdb/compression_type.h create mode 100644 src/rocksdb/include/rocksdb/concurrent_task_limiter.h create mode 100644 src/rocksdb/include/rocksdb/configurable.h create mode 100644 src/rocksdb/include/rocksdb/convenience.h create mode 100644 src/rocksdb/include/rocksdb/customizable.h create mode 100644 src/rocksdb/include/rocksdb/data_structure.h create mode 100644 src/rocksdb/include/rocksdb/db.h create mode 100644 src/rocksdb/include/rocksdb/db_bench_tool.h create mode 100644 src/rocksdb/include/rocksdb/db_dump_tool.h create mode 100644 src/rocksdb/include/rocksdb/db_stress_tool.h create mode 100644 src/rocksdb/include/rocksdb/env.h create mode 100644 src/rocksdb/include/rocksdb/env_encryption.h create mode 100644 src/rocksdb/include/rocksdb/experimental.h create mode 100644 src/rocksdb/include/rocksdb/file_checksum.h create mode 100644 src/rocksdb/include/rocksdb/file_system.h create mode 100644 src/rocksdb/include/rocksdb/filter_policy.h create mode 100644 src/rocksdb/include/rocksdb/flush_block_policy.h create mode 100644 src/rocksdb/include/rocksdb/functor_wrapper.h create mode 100644 src/rocksdb/include/rocksdb/io_status.h create mode 100644 src/rocksdb/include/rocksdb/iostats_context.h create mode 100644 src/rocksdb/include/rocksdb/iterator.h create mode 100644 src/rocksdb/include/rocksdb/ldb_tool.h create mode 100644 src/rocksdb/include/rocksdb/listener.h create mode 100644 src/rocksdb/include/rocksdb/memory_allocator.h create mode 100644 src/rocksdb/include/rocksdb/memtablerep.h create mode 100644 src/rocksdb/include/rocksdb/merge_operator.h create mode 100644 src/rocksdb/include/rocksdb/metadata.h create mode 100644 src/rocksdb/include/rocksdb/options.h create mode 100644 src/rocksdb/include/rocksdb/perf_context.h create mode 100644 src/rocksdb/include/rocksdb/perf_level.h create mode 100644 src/rocksdb/include/rocksdb/persistent_cache.h create mode 100644 src/rocksdb/include/rocksdb/rate_limiter.h create mode 100644 src/rocksdb/include/rocksdb/rocksdb_namespace.h create mode 100644 src/rocksdb/include/rocksdb/secondary_cache.h create mode 100644 src/rocksdb/include/rocksdb/slice.h create mode 100644 src/rocksdb/include/rocksdb/slice_transform.h create mode 100644 src/rocksdb/include/rocksdb/snapshot.h create mode 100644 src/rocksdb/include/rocksdb/sst_dump_tool.h create mode 100644 src/rocksdb/include/rocksdb/sst_file_manager.h create mode 100644 src/rocksdb/include/rocksdb/sst_file_reader.h create mode 100644 src/rocksdb/include/rocksdb/sst_file_writer.h create mode 100644 src/rocksdb/include/rocksdb/sst_partitioner.h create mode 100644 src/rocksdb/include/rocksdb/statistics.h create mode 100644 src/rocksdb/include/rocksdb/stats_history.h create mode 100644 src/rocksdb/include/rocksdb/status.h create mode 100644 src/rocksdb/include/rocksdb/system_clock.h create mode 100644 src/rocksdb/include/rocksdb/table.h create mode 100644 src/rocksdb/include/rocksdb/table_properties.h create mode 100644 src/rocksdb/include/rocksdb/table_reader_caller.h create mode 100644 src/rocksdb/include/rocksdb/thread_status.h create mode 100644 src/rocksdb/include/rocksdb/threadpool.h create mode 100644 src/rocksdb/include/rocksdb/trace_reader_writer.h create mode 100644 src/rocksdb/include/rocksdb/trace_record.h create mode 100644 src/rocksdb/include/rocksdb/trace_record_result.h create mode 100644 src/rocksdb/include/rocksdb/transaction_log.h create mode 100644 src/rocksdb/include/rocksdb/types.h create mode 100644 src/rocksdb/include/rocksdb/unique_id.h create mode 100644 src/rocksdb/include/rocksdb/universal_compaction.h create mode 100644 src/rocksdb/include/rocksdb/utilities/agg_merge.h create mode 100644 src/rocksdb/include/rocksdb/utilities/backup_engine.h create mode 100644 src/rocksdb/include/rocksdb/utilities/cache_dump_load.h create mode 100644 src/rocksdb/include/rocksdb/utilities/checkpoint.h create mode 100644 src/rocksdb/include/rocksdb/utilities/convenience.h create mode 100644 src/rocksdb/include/rocksdb/utilities/customizable_util.h create mode 100644 src/rocksdb/include/rocksdb/utilities/db_ttl.h create mode 100644 src/rocksdb/include/rocksdb/utilities/debug.h create mode 100644 src/rocksdb/include/rocksdb/utilities/env_mirror.h create mode 100644 src/rocksdb/include/rocksdb/utilities/info_log_finder.h create mode 100644 src/rocksdb/include/rocksdb/utilities/ldb_cmd.h create mode 100644 src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h create mode 100644 src/rocksdb/include/rocksdb/utilities/leveldb_options.h create mode 100644 src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h create mode 100644 src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h create mode 100644 src/rocksdb/include/rocksdb/utilities/memory_util.h create mode 100644 src/rocksdb/include/rocksdb/utilities/object_registry.h create mode 100644 src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h create mode 100644 src/rocksdb/include/rocksdb/utilities/option_change_migration.h create mode 100644 src/rocksdb/include/rocksdb/utilities/options_type.h create mode 100644 src/rocksdb/include/rocksdb/utilities/options_util.h create mode 100644 src/rocksdb/include/rocksdb/utilities/replayer.h create mode 100644 src/rocksdb/include/rocksdb/utilities/sim_cache.h create mode 100644 src/rocksdb/include/rocksdb/utilities/stackable_db.h create mode 100644 src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h create mode 100644 src/rocksdb/include/rocksdb/utilities/transaction.h create mode 100644 src/rocksdb/include/rocksdb/utilities/transaction_db.h create mode 100644 src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h create mode 100644 src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h create mode 100644 src/rocksdb/include/rocksdb/version.h create mode 100644 src/rocksdb/include/rocksdb/wal_filter.h create mode 100644 src/rocksdb/include/rocksdb/wide_columns.h create mode 100644 src/rocksdb/include/rocksdb/write_batch.h create mode 100644 src/rocksdb/include/rocksdb/write_batch_base.h create mode 100644 src/rocksdb/include/rocksdb/write_buffer_manager.h (limited to 'src/rocksdb/include') diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h new file mode 100644 index 000000000..258cf82a1 --- /dev/null +++ b/src/rocksdb/include/rocksdb/advanced_options.h @@ -0,0 +1,1098 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; +class TablePropertiesCollectorFactory; +class TableFactory; +struct Options; + +enum CompactionStyle : char { + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + // Not supported in ROCKSDB_LITE. + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + // Not supported in ROCKSDB_LITE + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + // Not supported in ROCKSDB_LITE + kCompactionStyleNone = 0x3, +}; + +// In Level-based compaction, it Determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. +enum CompactionPri : char { + // Slightly prioritize larger files by size compensated by #deletes + kByCompensatedSize = 0x0, + // First compact files whose data's latest update time is oldest. + // Try this if you only update some hot keys in small ranges. + kOldestLargestSeqFirst = 0x1, + // First compact files whose range hasn't been compacted to the next level + // for the longest. If your updates are random across the key space, + // write amplification is slightly better with this option. + kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. + kMinOverlappingRatio = 0x3, + // Keeps a cursor(s) of the successor of the file (key range) was/were + // compacted before, and always picks the next files (key range) in that + // level. The file picking process will cycle through all the files in a + // round-robin manner. + kRoundRobin = 0x4, +}; + +struct CompactionOptionsFIFO { + // once the total sum of table files reaches this, we will delete the oldest + // table file + // Default: 1GB + uint64_t max_table_files_size; + + // If true, try to do compaction to compact smaller files into larger ones. + // Minimum files to compact follows options.level0_file_num_compaction_trigger + // and compaction won't trigger if average compact bytes per del file is + // larger than options.write_buffer_size. This is to protect large files + // from being compacted again. + // Default: false; + bool allow_compaction = false; + + // When not 0, if the data in the file is older than this threshold, RocksDB + // will soon move the file to warm temperature. + uint64_t age_for_warm = 0; + + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} + CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) + : max_table_files_size(_max_table_files_size), + allow_compaction(_allow_compaction) {} +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + // RocksDB's generic default compression level. Internally it'll be translated + // to the default compression level specific to the library being used (see + // comment above `ColumnFamilyOptions::compression`). + // + // The default value is the max 16-bit int as it'll be written out in OPTIONS + // file, which should be portable. + const static int kDefaultCompressionLevel = 32767; + + int window_bits; + int level; + int strategy; + + // Maximum size of dictionaries used to prime the compression library. + // Enabling dictionary can improve compression ratios when there are + // repetitions across data blocks. + // + // The dictionary is created by sampling the SST file data. If + // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's + // dictionary generator (see comments for option `use_zstd_dict_trainer` for + // detail on dictionary generator). If `zstd_max_train_bytes` is zero, the + // random samples are used directly as the dictionary. + // + // When compression dictionary is disabled, we compress and write each block + // before buffering data for the next one. When compression dictionary is + // enabled, we buffer SST file data in-memory so we can sample it, as data + // can only be compressed and written after the dictionary has been finalized. + // + // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This + // buffered memory is charged to the block cache when there is a block cache. + // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is + // full), we finalize the dictionary with whatever data we have and then stop + // buffering. + // + // Default: 0. + uint32_t max_dict_bytes; + + // Maximum size of training data passed to zstd's dictionary trainer. Using + // zstd's dictionary trainer can achieve even better compression ratio + // improvements than using `max_dict_bytes` alone. + // + // The training data will be used to generate a dictionary of max_dict_bytes. + // + // Default: 0. + uint32_t zstd_max_train_bytes; + + // Number of threads for parallel compression. + // Parallel compression is enabled only if threads > 1. + // THE FEATURE IS STILL EXPERIMENTAL + // + // This option is valid only when BlockBasedTable is used. + // + // When parallel compression is enabled, SST size file sizes might be + // more inflated compared to the target size, because more data of unknown + // compressed size is in flight when compression is parallelized. To be + // reasonably accurate, this inflation is also estimated by using historical + // compression ratio and current bytes inflight. + // + // Default: 1. + uint32_t parallel_threads; + + // When the compression options are set by the user, it will be set to "true". + // For bottommost_compression_opts, to enable it, user must set enabled=true. + // Otherwise, bottommost compression will use compression_opts as default + // compression options. + // + // For compression_opts, if compression_opts.enabled=false, it is still + // used as compression options for compression process. + // + // Default: false. + bool enabled; + + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + // + // Default: 0 (unlimited) + uint64_t max_dict_buffer_bytes; + + // Use zstd trainer to generate dictionaries. When this option is set to true, + // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes + // buffered data will be passed to zstd dictionary trainer to generate a + // dictionary of size max_dict_bytes. + // + // When this option is false, zstd's API ZDICT_finalizeDictionary() will be + // called to generate dictionaries. zstd_max_train_bytes of training sampled + // data will be passed to this API. Using this API should save CPU time on + // dictionary training, but the compression ratio may not be as good as using + // a dictionary trainer. + // + // Default: true + bool use_zstd_dict_trainer; + + CompressionOptions() + : window_bits(-14), + level(kDefaultCompressionLevel), + strategy(0), + max_dict_bytes(0), + zstd_max_train_bytes(0), + parallel_threads(1), + enabled(false), + max_dict_buffer_bytes(0), + use_zstd_dict_trainer(true) {} + CompressionOptions(int wbits, int _lev, int _strategy, + uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes, + uint32_t _parallel_threads, bool _enabled, + uint64_t _max_dict_buffer_bytes, + bool _use_zstd_dict_trainer) + : window_bits(wbits), + level(_lev), + strategy(_strategy), + max_dict_bytes(_max_dict_bytes), + zstd_max_train_bytes(_zstd_max_train_bytes), + parallel_threads(_parallel_threads), + enabled(_enabled), + max_dict_buffer_bytes(_max_dict_buffer_bytes), + use_zstd_dict_trainer(_use_zstd_dict_trainer) {} +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, + kLastTemperature, +}; + +// The control option of how the cache tiers will be used. Currently rocksdb +// support block cache (volatile tier), secondary cache (non-volatile tier). +// In the future, we may add more caching layers. +enum class CacheTier : uint8_t { + kVolatileTier = 0, + kNonVolatileBlockTier = 0x01, +}; + +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + +enum class PrepopulateBlobCache : uint8_t { + kDisable = 0x0, // Disable prepopulate blob cache + kFlushOnly = 0x1, // Prepopulate blobs during flush only +}; + +struct AdvancedColumnFamilyOptions { + // The maximum number of write buffers that are built up in memory. + // The default and the minimum number is 2, so that when 1 write buffer + // is being flushed to storage, new writes can continue to the other + // write buffer. + // If max_write_buffer_number > 3, writing will be slowed down to + // options.delayed_write_rate if we are writing to the last write buffer + // allowed. + // + // Default: 2 + // + // Dynamically changeable through SetOptions() API + int max_write_buffer_number = 2; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are flushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing lesser + // data to storage if there are duplicate records in each of these + // individual write buffers. + // If atomic flush is enabled (options.atomic_flush == true), then this + // option will be sanitized to 1. + // Default: 1 + int min_write_buffer_number_to_merge = 1; + + // DEPRECATED + // The total maximum number of write buffers to maintain in memory including + // copies of buffers that have already been flushed. Unlike + // max_write_buffer_number, this parameter does not affect flushing. + // This parameter is being replaced by max_write_buffer_size_to_maintain. + // If both parameters are set to non-zero values, this parameter will be + // ignored. + int max_write_buffer_number_to_maintain = 0; + + // The target number of write history bytes to hold in memory. Write history + // comprises the latest write buffers (memtables). To reach the target, write + // buffers that were most recently flushed to SST files may be retained in + // memory. + // + // This controls the target amount of write history that will be available + // in memory for conflict checking when Transactions are used. + // + // This target may be undershot when the CF first opens and has not recovered + // or received enough writes to reach the target. After reaching the target + // once, it is guaranteed to never undershoot again. That guarantee is + // implemented by retaining flushed write buffers in-memory until the oldest + // one can be trimmed without dropping below the target. + // + // Examples with `max_write_buffer_size_to_maintain` set to 32MB: + // + // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB, + // and zero flushed immutable memtables. Nothing trimmable exists. + // - One mutable memtable of 16MB, zero unflushed immutable memtables, and + // one flushed immutable memtable of 64MB. Trimming is disallowed because + // dropping the earliest (only) flushed immutable memtable would result in + // write history of 16MB < 32MB. + // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB, + // and one flushed immutable memtable of 16MB. The earliest (only) flushed + // immutable memtable is trimmed because without it we still have + // 16MB + 24MB = 40MB > 32MB of write history. + // + // When using an OptimisticTransactionDB: + // If this value is too low, some transactions may fail at commit time due + // to not being able to determine whether there were any write conflicts. + // + // When using a TransactionDB: + // If Transaction::SetSnapshot is used, TransactionDB will read either + // in-memory write buffers or SST files to do write-conflict checking. + // Increasing this value can reduce the number of reads to SST files + // done for conflict detection. + // + // Setting this value to 0 will cause write buffers to be freed immediately + // after they are flushed. If this value is set to -1, + // 'max_write_buffer_number * write_buffer_size' will be used. + // + // Default: + // If using a TransactionDB/OptimisticTransactionDB, the default value will + // be set to the value of 'max_write_buffer_number * write_buffer_size' + // if it is not explicitly set by the user. Otherwise, the default is 0. + int64_t max_write_buffer_size_to_maintain = 0; + + // Allows thread-safe inplace updates. If this is true, there is no way to + // achieve point-in-time consistency using snapshot or iterator (assuming + // concurrent updates). Hence iterator and multi-get will return results + // which are not consistent as of any point-in-time. + // Backward iteration on memtables will not work either. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. + // Default: false. + bool inplace_update_support = false; + + // Number of locks used for inplace update + // Default: 10000, if inplace_update_support = true, else 0. + // + // Dynamically changeable through SetOptions() API + size_t inplace_update_num_locks = 10000; + + // [experimental] + // Used to activate or deactive the Mempurge feature (memtable garbage + // collection). (deactivated by default). At every flush, the total useful + // payload (total entries minus garbage entries) is estimated as a ratio + // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then + // compared to this `threshold` value: + // - if ratio1.0 : aggressive mempurge. + // 0 < threshold < 1.0: mempurge triggered only for very low useful payload + // ratios. + // [experimental] + double experimental_mempurge_threshold = 0.0; + + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + // + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + // + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). + // + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + // + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. + // + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + // + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value) = nullptr; + + // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic + // Bloom filter in memtable to optimize many queries that must go beyond + // the memtable. The size in bytes of the filter is + // write_buffer_size * memtable_prefix_bloom_size_ratio. + // * If prefix_extractor is set, the filter includes prefixes. + // * If memtable_whole_key_filtering, the filter includes whole keys. + // * If both, the filter includes both. + // * If neither, the feature is disabled. + // + // If this value is larger than 0.25, it is sanitized to 0.25. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + double memtable_prefix_bloom_size_ratio = 0.0; + + // Enable whole key bloom filter in memtable. Note this will only take effect + // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering + // can potentially reduce CPU usage for point-look-ups. + // + // Default: false (disabled) + // + // Dynamically changeable through SetOptions() API + bool memtable_whole_key_filtering = false; + + // Page size for huge page for the arena used by the memtable. If <=0, it + // won't allocate from huge page but from malloc. + // Users are responsible to reserve huge pages for it to be allocated. For + // example: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + // If there isn't enough free huge page available, it will fall back to + // malloc. + // + // Dynamically changeable through SetOptions() API + size_t memtable_huge_page_size = 0; + + // If non-nullptr, memtable will use the specified function to extract + // prefixes for keys, and for each prefix maintain a hint of insert location + // to reduce CPU usage for inserting keys with the prefix. Keys out of + // domain of the prefix extractor will be insert without using hints. + // + // Currently only the default skiplist based memtable implements the feature. + // All other memtable implementation will ignore the option. It incurs ~250 + // additional bytes of memory overhead to store a hint for each prefix. + // Also concurrent writes (when allow_concurrent_memtable_write is true) will + // ignore the option. + // + // The option is best suited for workloads where keys will likely to insert + // to a location close the last inserted key with the same prefix. + // One example could be inserting keys of the form (prefix + timestamp), + // and keys of the same prefix always comes in with time order. Another + // example would be updating the same key over and over again, in which case + // the prefix can be the key itself. + // + // Default: nullptr (disabled) + std::shared_ptr + memtable_insert_with_hint_prefix_extractor = nullptr; + + // Control locality of bloom filter probes to improve CPU cache hit rate. + // This option now only applies to plaintable prefix bloom. This + // optimization is turned off when set to 0, and positive number to turn + // it on. + // Default: 0 + uint32_t bloom_locality = 0; + + // size of one block in arena memory allocation. + // If <= 0, a proper value is automatically calculated (usually 1/8 of + // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is + // smaller). + // + // There are two additional restriction of the specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // + // Default: 0 + // + // Dynamically changeable through SetOptions() API + size_t arena_block_size = 0; + + // Different levels can have different compression policies. There + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use + // compression algorithms that have better compression but could + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. + // + // NOTICE if level_compaction_dynamic_level_bytes=true, + // compression_per_level[0] still determines L0, but other elements + // of the array are based on base level (the level L0 files are merged + // to), and may not match the level users see from info log for metadata. + // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] + // determines compaction type for level n+i-1. + // For example, if we have three 5 levels, and we determine to merge L0 + // data to L4 (which means L1..L3 will be empty), then the new files go to + // L4 uses compression type compression_per_level[1]. + // If now L0 is merged to L2. Data goes to L2 will be compressed + // according to compression_per_level[1], L3 using compression_per_level[2] + // and L4 using compression_per_level[3]. Compaction for each level can + // change when data grows. + // + // NOTE: if the vector size is smaller than the level number, the undefined + // lower level uses the last option in the vector, for example, for 3 level + // LSM tree the following settings are the same: + // {kNoCompression, kSnappyCompression} + // {kNoCompression, kSnappyCompression, kSnappyCompression} + // + // Dynamically changeable through SetOptions() API + std::vector compression_per_level; + + // Number of levels for this database + int num_levels = 7; + + // Soft limit on number of level-0 files. We start slowing down writes at this + // point. A value <0 means that no writing slow down will be triggered by + // number of files in level-0. + // + // Default: 20 + // + // Dynamically changeable through SetOptions() API + int level0_slowdown_writes_trigger = 20; + + // Maximum number of level-0 files. We stop writes at this point. + // + // Default: 36 + // + // Dynamically changeable through SetOptions() API + int level0_stop_writes_trigger = 36; + + // Target file size for compaction. + // target_file_size_base is per-file size for level-1. + // Target file size for level L can be calculated by + // target_file_size_base * (target_file_size_multiplier ^ (L-1)) + // For example, if target_file_size_base is 2MB and + // target_file_size_multiplier is 10, then each file on level-1 will + // be 2MB, and each file on level 2 will be 20MB, + // and each file on level-3 will be 200MB. + // + // Default: 64MB. + // + // Dynamically changeable through SetOptions() API + uint64_t target_file_size_base = 64 * 1048576; + + // By default target_file_size_multiplier is 1, which means + // by default files in different levels will have similar size. + // + // Dynamically changeable through SetOptions() API + int target_file_size_multiplier = 1; + + // If true, RocksDB will pick target size of each level dynamically. + // We will pick a base level b >= 1. L0 will be directly merged into level b, + // instead of always into level 1. Level 1 to b-1 need to be empty. + // We try to pick b and its target size so that + // 1. target size is in the range of + // (max_bytes_for_level_base / max_bytes_for_level_multiplier, + // max_bytes_for_level_base] + // 2. target size of the last level (level num_levels-1) equals to extra size + // of the level. + // At the same time max_bytes_for_level_multiplier and + // max_bytes_for_level_multiplier_additional are still satisfied. + // (When L0 is too large, we make some adjustment. See below.) + // + // With this option on, from an empty DB, we make last level the base level, + // which means merging L0 data into the last level, until it exceeds + // max_bytes_for_level_base. And then we make the second last level to be + // base level, to start to merge L0 data to second last level, with its + // target size to be 1/max_bytes_for_level_multiplier of the last level's + // extra size. After the data accumulates more so that we need to move the + // base level to the third last one, and so on. + // + // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, + // and max_bytes_for_level_base=10MB. + // Target sizes of level 1 to 5 starts with: + // [- - - - 10MB] + // with base level is level. Target sizes of level 1 to 4 are not applicable + // because they will not be used. + // Until the size of Level 5 grows to more than 10MB, say 11MB, we make + // base target to level 4 and now the targets looks like: + // [- - - 1.1MB 11MB] + // While data are accumulated, size targets are tuned based on actual data + // of level 5. When level 5 has 50MB of data, the target is like: + // [- - - 5MB 50MB] + // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep + // level 4 to be the base level, its target size needs to be 10.1MB, which + // doesn't satisfy the target size range. So now we make level 3 the target + // size and the target sizes of the levels look like: + // [- - 1.01MB 10.1MB 101MB] + // In the same way, while level 5 further grows, all levels' targets grow, + // like + // [- - 5MB 50MB 500MB] + // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the + // base level and make levels' target sizes like this: + // [- 1.001MB 10.01MB 100.1MB 1001MB] + // and go on... + // + // By doing it, we give max_bytes_for_level_multiplier a priority against + // max_bytes_for_level_base, for a more predictable LSM tree shape. It is + // useful to limit worse case space amplification. + // + // + // If the compaction from L0 is lagged behind, a special mode will be turned + // on to prioritize write amplification against max_bytes_for_level_multiplier + // or max_bytes_for_level_base. The L0 compaction is lagged behind by looking + // at number of L0 files and total L0 size. If number of L0 files is at least + // the double of level0_file_num_compaction_trigger, or the total size is + // at least max_bytes_for_level_base, this mode is on. The target of L1 grows + // to the actual data size in L0, and then determine the target for each level + // so that each level will have the same level multiplier. + // + // For example, when L0 size is 100MB, the size of last level is 1600MB, + // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. + // Since L0 size is larger than max_bytes_for_level_base, this is a L0 + // compaction backlogged mode. So that the L1 size is determined to be 100MB. + // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will + // be needed. The level multiplier will be calculated to be 4 and the three + // levels' target to be [100MB, 400MB, 1600MB]. + // + // In this mode, The number of levels will be no more than the normal mode, + // and the level multiplier will be lower. The write amplification will + // likely to be reduced. + // + // + // max_bytes_for_level_multiplier_additional is ignored with this flag on. + // + // Turning this feature on or off for an existing DB can cause unexpected + // LSM tree structure so it's not recommended. + // + // Default: false + bool level_compaction_dynamic_level_bytes = false; + + // Allows RocksDB to generate files that are not exactly the target_file_size + // only for the non-bottommost files. Which can reduce the write-amplification + // from compaction. The file size could be from 0 to 2x target_file_size. + // Once enabled, non-bottommost compaction will try to cut the files align + // with the next level file boundaries (grandparent level). + // + // Default: true + bool level_compaction_dynamic_file_size = true; + + // Default: 10. + // + // Dynamically changeable through SetOptions() API + double max_bytes_for_level_multiplier = 10; + + // Different max-size multipliers for different levels. + // These are multiplied by max_bytes_for_level_multiplier to arrive + // at the max-size of each level. + // + // Default: 1 + // + // Dynamically changeable through SetOptions() API + std::vector max_bytes_for_level_multiplier_additional = + std::vector(num_levels, 1); + + // We try to limit number of bytes in one compaction to be lower than this + // threshold. But it's not guaranteed. + // Value 0 will be sanitized. + // + // Default: target_file_size_base * 25 + // + // Dynamically changeable through SetOptions() API + uint64_t max_compaction_bytes = 0; + + // When setting up compaction input files, we ignore the + // `max_compaction_bytes` limit when pulling in input files that are entirely + // within output key range. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + // We could remove this knob and always ignore the limit once it is proven + // safe. + bool ignore_max_compaction_bytes_for_input = true; + + // All writes will be slowed down to at least delayed_write_rate if estimated + // bytes needed to be compaction exceed this threshold. + // + // Default: 64GB + // + // Dynamically changeable through SetOptions() API + uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; + + // All writes are stopped if estimated bytes needed to be compaction exceed + // this threshold. + // + // Default: 256GB + // + // Dynamically changeable through SetOptions() API + uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; + + // The compaction style. Default: kCompactionStyleLevel + CompactionStyle compaction_style = kCompactionStyleLevel; + + // If level compaction_style = kCompactionStyleLevel, for each level, + // which files are prioritized to be picked to compact. + // Default: kMinOverlappingRatio + CompactionPri compaction_pri = kMinOverlappingRatio; + + // The options needed to support Universal Style compactions + // + // Dynamically changeable through SetOptions() API + // Dynamic change example: + // SetOptions("compaction_options_universal", "{size_ratio=2;}") + CompactionOptionsUniversal compaction_options_universal; + + // The options for FIFO compaction style + // + // Dynamically changeable through SetOptions() API + // Dynamic change example: + // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}") + CompactionOptionsFIFO compaction_options_fifo; + + // An iteration->Next() sequentially skips over keys with the same + // user-key unless this option is set. This number specifies the number + // of keys (with the same userkey) that will be sequentially + // skipped before a reseek is issued. + // + // Default: 8 + // + // Dynamically changeable through SetOptions() API + uint64_t max_sequential_skip_in_iterations = 8; + + // This is a factory that provides MemTableRep objects. + // Default: a factory that provides a skip-list-based implementation of + // MemTableRep. + std::shared_ptr memtable_factory = + std::shared_ptr(new SkipListFactory); + + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + + // This option allows user to collect their own interested statistics of + // the tables. + // Default: empty vector -- no user-defined statistics collection will be + // performed. + using TablePropertiesCollectorFactories = + std::vector>; + TablePropertiesCollectorFactories table_properties_collector_factories; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + size_t max_successive_merges = 0; + + // This flag specifies that the implementation should optimize the filters + // mainly for cases where keys are found rather than also optimize for keys + // missed. This would be used in cases where the application knows that + // there are very few misses or the performance in the case of misses is not + // important. + // + // For now, this flag allows us to not store filters for the last level i.e + // the largest level which contains data of the LSM store. For keys which + // are hits, the filters in this level are not useful because we will search + // for the data anyway. NOTE: the filters in other levels are still useful + // even for key hit because they tell us whether to look in that level or go + // to the higher level. + // + // Default: false + bool optimize_filters_for_hits = false; + + // During flush or compaction, check whether keys inserted to output files + // are in order. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + bool check_flush_compaction_key_order = true; + + // After writing every SST file, reopen it and read all the keys. + // Checks the hash of all of the keys and values written versus the + // keys in the file and signals a corruption if they do not match + // + // Default: false + // + // Dynamically changeable through SetOptions() API + bool paranoid_file_checks = false; + + // In debug mode, RocksDB runs consistency checks on the LSM every time the + // LSM changes (Flush, Compaction, AddFile). When this option is true, these + // checks are also enabled in release mode. These checks were historically + // disabled in release mode, but are now enabled by default for proactive + // corruption detection. The CPU overhead is negligible for normal mixed + // operations but can slow down saturated writing. See + // Options::DisableExtraChecks(). + // Default: true + bool force_consistency_checks = true; + + // Measure IO stats in compactions and flushes, if true. + // + // Default: false + // + // Dynamically changeable through SetOptions() API + bool report_bg_io_stats = false; + + // Files containing updates older than TTL will go through the compaction + // process. This usually happens in a cascading way so that those entries + // will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. + // Pre-req: This needs max_open_files to be set to -1. + // In Level: Non-bottom-level files older than TTL will go through the + // compaction process. + // In FIFO: Files older than TTL will be deleted. + // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 + // In FIFO, this option will have the same meaning as + // periodic_compaction_seconds. Whichever stricter will be used. + // 0 means disabling. + // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to + // pick default. + // + // Default: 30 days for leveled compaction + block based table. disable + // otherwise. + // + // Dynamically changeable through SetOptions() API + uint64_t ttl = 0xfffffffffffffffe; + + // Files older than this value will be picked up for compaction, and + // re-written to the same level as they were before. + // One main use of the feature is to make sure a file goes through compaction + // filters periodically. Users can also use the feature to clear up SST + // files using old format. + // + // A file's age is computed by looking at file_creation_time or creation_time + // table properties in order, if they have valid non-zero values; if not, the + // age is based on the file's last modified time (given by the underlying + // Env). + // + // Supported in Level and FIFO compaction. + // In FIFO compaction, this option has the same meaning as TTL and whichever + // stricter will be used. + // Pre-req: max_open_file == -1. + // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 + // + // Values: + // 0: Turn off Periodic compactions. + // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + // as needed. For now, RocksDB will change this value to 30 days + // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + // process at least once every 30 days if not compacted sooner. + // In FIFO compaction, since the option has the same meaning as ttl, + // when this value is left default, and ttl is left to 0, 30 days will be + // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + // + // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) + // + // Dynamically changeable through SetOptions() API + uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; + + // If this option is set then 1 in N blocks are compressed + // using a fast (lz4) and slow (zstd) compression algorithm. + // The compressibility is reported as stats and the stored + // data is left uncompressed (unless compression is also requested). + uint64_t sample_for_compression = 0; + + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when creating the last level files, pass this + // temperature to FileSystem used. Should be no-op for default FileSystem + // and users need to plug in their own FileSystem to take advantage of it. + // + // Note: the feature is changed from `bottommost_temperature` to + // `last_level_temperature` which now only apply for the last level files. + // The option name `bottommost_temperature` is kept only for migration, the + // behavior is the same as `last_level_temperature`. Please stop using + // `bottommost_temperature` and will be removed in next release. + // + // Dynamically changeable through the SetOptions() API + Temperature bottommost_temperature = Temperature::kUnknown; + Temperature last_level_temperature = Temperature::kUnknown; + + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when data insert time is within this time range, it + // will be precluded from the last level. + // 0 means no key will be precluded from the last level. + // + // Note: when enabled, universal size amplification (controlled by option + // `compaction_options_universal.max_size_amplification_percent`) calculation + // will exclude the last level. As the feature is designed for tiered storage + // and a typical setting is the last level is cold tier which is likely not + // size constrained, the size amp is going to be only for non-last levels. + // + // Default: 0 (disable the feature) + // + // Not dynamically changeable, change it requires db restart. + uint64_t preclude_last_level_data_seconds = 0; + + // EXPERIMENTAL + // If this option is set, it will preserve the internal time information about + // the data until it's older than the specified time here. + // Internally the time information is a map between sequence number and time, + // which is the same as `preclude_last_level_data_seconds`. But it won't + // preclude the data from the last level and the data in the last level won't + // have the sequence number zeroed out. + // Internally, rocksdb would sample the sequence number to time pair and store + // that in SST property "rocksdb.seqno.time.map". The information is currently + // only used for tiered storage compaction (option + // `preclude_last_level_data_seconds`). + // + // Note: if both `preclude_last_level_data_seconds` and this option is set, it + // will preserve the max time of the 2 options and compaction still preclude + // the data based on `preclude_last_level_data_seconds`. + // The higher the preserve_time is, the less the sampling frequency will be ( + // which means less accuracy of the time estimation). + // + // Default: 0 (disable the feature) + // + // Not dynamically changeable, change it requires db restart. + uint64_t preserve_internal_time_seconds = 0; + + // When set, large values (blobs) are written to separate blob files, and + // only pointers to them are stored in SST files. This can reduce write + // amplification for large-value use cases at the cost of introducing a level + // of indirection for reads. See also the options min_blob_size, + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // blob_garbage_collection_age_cutoff, + // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size + // below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_files = false; + + // The size of the smallest value to be stored separately in a blob file. + // Values which have an uncompressed size smaller than this threshold are + // stored alongside the keys in SST files in the usual fashion. A value of + // zero for this option means that all values are stored in blob files. Note + // that enable_blob_files has to be set in order for this option to have any + // effect. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t min_blob_size = 0; + + // The size limit for blob files. When writing blob files, a new file is + // opened once this limit is reached. Note that enable_blob_files has to be + // set in order for this option to have any effect. + // + // Default: 256 MB + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_file_size = 1ULL << 28; + + // The compression algorithm to use for large values stored in blob files. + // Note that enable_blob_files has to be set in order for this option to have + // any effect. + // + // Default: no compression + // + // Dynamically changeable through the SetOptions() API + CompressionType blob_compression_type = kNoCompression; + + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and + // blob_garbage_collection_force_threshold below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 1.0 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_force_threshold = 1.0; + + // Compaction readahead for blob files. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_compaction_readahead_size = 0; + + // Enable blob files starting from a certain LSM tree level. + // + // For certain use cases that have a mix of short-lived and long-lived values, + // it might make sense to support extracting large values only during + // compactions whose output level is greater than or equal to a specified LSM + // tree level (e.g. compactions into L1/L2/... or above). This could reduce + // the space amplification caused by large values that are turned into garbage + // shortly after being written at the price of some write amplification + // incurred by long-lived values whose extraction to blob files is delayed. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + int blob_file_starting_level = 0; + + // The Cache object to use for blobs. Using a dedicated object for blobs and + // using the same object for the block and blob caches are both supported. In + // the latter case, note that blobs are less valuable from a caching + // perspective than SST blocks, and some cache implementations have + // configuration options that can be used to prioritize items accordingly (see + // Cache::Priority and LRUCacheOptions::{high,low}_pri_pool_ratio). + // + // Default: nullptr (disabled) + std::shared_ptr blob_cache = nullptr; + + // Enable/disable prepopulating the blob cache. When set to kFlushOnly, BlobDB + // will insert newly written blobs into the blob cache during flush. This can + // improve performance when reading back these blobs would otherwise be + // expensive (e.g. when using direct I/O or remote storage), or when the + // workload has a high temporal locality. + // + // Default: disabled + // + // Dynamically changeable through the SetOptions() API + PrepopulateBlobCache prepopulate_blob_cache = PrepopulateBlobCache::kDisable; + + // Enable memtable per key-value checksum protection. + // + // Each entry in memtable will be suffixed by a per key-value checksum. + // This options determines the size of such checksums. + // + // It is suggested to turn on write batch per key-value + // checksum protection together with this option, so that the checksum + // computation is done outside of writer threads (memtable kv checksum can be + // computed from write batch checksum) See + // WriteOptions::protection_bytes_per_key for more detail. + // + // Default: 0 (no protection) + // Supported values: 0, 1, 2, 4, 8. + uint32_t memtable_protection_bytes_per_key = 0; + + // Create ColumnFamilyOptions with default values for all fields + AdvancedColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit AdvancedColumnFamilyOptions(const Options& options); + + // ---------------- OPTIONS NOT SUPPORTED ANYMORE ---------------- +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/block_cache_trace_writer.h b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h new file mode 100644 index 000000000..18d28685b --- /dev/null +++ b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h @@ -0,0 +1,149 @@ +// Copyright (c) 2022, Meta Platforms, Inc. and affiliates. All rights +// reserved. This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table_reader_caller.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { +// A record for block cache lookups/inserts. This is passed by the table +// reader to the BlockCacheTraceWriter for every block cache op. +struct BlockCacheTraceRecord { + // Required fields for all accesses. + uint64_t access_timestamp = 0; + + // Info related to the block being looked up or inserted + // + // 1. The cache key for the block + std::string block_key; + + // 2. The type of block + TraceType block_type = TraceType::kTraceMax; + + // 3. Size of the block + uint64_t block_size = 0; + + // Info about the SST file the block is in + // + // 1. Column family ID + uint64_t cf_id = 0; + + // 2. Column family name + std::string cf_name; + + // 3. LSM level of the file + uint32_t level = 0; + + // 4. SST file number + uint64_t sst_fd_number = 0; + + // Info about the calling context + // + // 1. The higher level request triggering the block cache request + TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; + + // 2. Cache lookup hit/miss. Not relevant for inserts + bool is_cache_hit = false; + + // 3. Whether this request is a lookup + bool no_insert = false; + + // Get/MultiGet specific info + // + // 1. A unique ID for Get/MultiGet + uint64_t get_id = kReservedGetId; + + // 2. Whether the Get/MultiGet is from a user-specified snapshot + bool get_from_user_specified_snapshot = false; + + // 3. The target user key in the block + std::string referenced_key; + + // Required fields for data block and user Get/Multi-Get only. + // + // 1. Size of te useful data in the block + uint64_t referenced_data_size = 0; + + // 2. Only for MultiGet, number of keys from the batch found in the block + uint64_t num_keys_in_block = 0; + + // 3. Whether the key was found in the block or not (false positive) + bool referenced_key_exist_in_block = false; + + static const uint64_t kReservedGetId; + + BlockCacheTraceRecord() {} + + BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, + TraceType _block_type, uint64_t _block_size, + uint64_t _cf_id, std::string _cf_name, uint32_t _level, + uint64_t _sst_fd_number, TableReaderCaller _caller, + bool _is_cache_hit, bool _no_insert, uint64_t _get_id, + bool _get_from_user_specified_snapshot = false, + std::string _referenced_key = "", + uint64_t _referenced_data_size = 0, + uint64_t _num_keys_in_block = 0, + bool _referenced_key_exist_in_block = false) + : access_timestamp(_access_timestamp), + block_key(_block_key), + block_type(_block_type), + block_size(_block_size), + cf_id(_cf_id), + cf_name(_cf_name), + level(_level), + sst_fd_number(_sst_fd_number), + caller(_caller), + is_cache_hit(_is_cache_hit), + no_insert(_no_insert), + get_id(_get_id), + get_from_user_specified_snapshot(_get_from_user_specified_snapshot), + referenced_key(_referenced_key), + referenced_data_size(_referenced_data_size), + num_keys_in_block(_num_keys_in_block), + referenced_key_exist_in_block(_referenced_key_exist_in_block) {} +}; + +// Options for tracing block cache accesses +struct BlockCacheTraceOptions { + // Specify trace sampling option, i.e. capture one per how many requests. + // Default to 1 (capture every request). + uint64_t sampling_frequency = 1; +}; + +// Options for the built-in implementation of BlockCacheTraceWriter +struct BlockCacheTraceWriterOptions { + uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024; +}; + +// BlockCacheTraceWriter is an abstract class that captures all RocksDB block +// cache accesses. Every RocksDB operation is passed to WriteBlockAccess() +// with a BlockCacheTraceRecord. +class BlockCacheTraceWriter { + public: + virtual ~BlockCacheTraceWriter() {} + + // Pass Slice references to avoid copy. + virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key) = 0; + + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number and RocksDB version. + virtual Status WriteHeader() = 0; +}; + +// Allocate an instance of the built-in BlockCacheTraceWriter implementation, +// that traces all block cache accesses to a user-provided TraceWriter. Each +// access is traced to a file with a timestamp and type, followed by the +// payload. +std::unique_ptr NewBlockCacheTraceWriter( + SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options, + std::unique_ptr&& trace_writer); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h new file mode 100644 index 000000000..1639f3cd3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/c.h @@ -0,0 +1,2793 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. + + C bindings for rocksdb. May be useful as a stable ABI that can be + used by programs that keep rocksdb in a shared library, or for + a JNI api. + + Does not support: + . getters for the option types + . custom comparators that implement key shortening + . capturing post-write-snapshot + . custom iter, db, env, cache implementations using just the C bindings + + Some conventions: + + (1) We expose just opaque struct pointers and functions to clients. + This allows us to change internal representations without having to + recompile clients. + + (2) For simplicity, there is no equivalent to the Slice type. Instead, + the caller has to pass the pointer and length as separate + arguments. + + (3) Errors are represented by a null-terminated c string. NULL + means no error. All operations that can raise an error are passed + a "char** errptr" as the last argument. One of the following must + be true on entry: + *errptr == NULL + *errptr points to a malloc()ed null-terminated error message + On success, a leveldb routine leaves *errptr unchanged. + On failure, leveldb frees the old value of *errptr and + set *errptr to a malloc()ed error message. + + (4) Bools have the type unsigned char (0 == false; rest == true) + + (5) All of the pointer arguments must be non-NULL. +*/ + +#pragma once + +#ifdef _WIN32 +#ifdef ROCKSDB_DLL +#ifdef ROCKSDB_LIBRARY_EXPORTS +#define ROCKSDB_LIBRARY_API __declspec(dllexport) +#else +#define ROCKSDB_LIBRARY_API __declspec(dllimport) +#endif +#else +#define ROCKSDB_LIBRARY_API +#endif +#else +#define ROCKSDB_LIBRARY_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/* Exported types */ + +typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; +typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t; +typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; +typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; +typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; +typedef struct rocksdb_compactionfiltercontext_t + rocksdb_compactionfiltercontext_t; +typedef struct rocksdb_compactionfilterfactory_t + rocksdb_compactionfilterfactory_t; +typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_dbpath_t rocksdb_dbpath_t; +typedef struct rocksdb_env_t rocksdb_env_t; +typedef struct rocksdb_fifo_compaction_options_t + rocksdb_fifo_compaction_options_t; +typedef struct rocksdb_filelock_t rocksdb_filelock_t; +typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t; +typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t; +typedef struct rocksdb_iterator_t rocksdb_iterator_t; +typedef struct rocksdb_logger_t rocksdb_logger_t; +typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; +typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t; +typedef struct rocksdb_block_based_table_options_t + rocksdb_block_based_table_options_t; +typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t + rocksdb_universal_compaction_options_t; +typedef struct rocksdb_livefiles_t rocksdb_livefiles_t; +typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t; +typedef struct rocksdb_column_family_metadata_t + rocksdb_column_family_metadata_t; +typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t; +typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t; +typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; +typedef struct rocksdb_ingestexternalfileoptions_t + rocksdb_ingestexternalfileoptions_t; +typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t; +typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t; +typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t; +typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; +typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; +typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; +typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t; +typedef struct rocksdb_optimistictransactiondb_t + rocksdb_optimistictransactiondb_t; +typedef struct rocksdb_optimistictransaction_options_t + rocksdb_optimistictransaction_options_t; +typedef struct rocksdb_transaction_t rocksdb_transaction_t; +typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; +typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; +typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; + +/* DB operations */ + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, const char* name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( + const rocksdb_options_t* options, const char* name, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const rocksdb_backup_engine_options_t* options, + rocksdb_env_t* env, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, + unsigned char flush_before_backup, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( + rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* +rocksdb_restore_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( + rocksdb_restore_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup( + rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* +rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( + rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_increase_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* ts_low, size_t ts_lowlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + size_t* ts_lowlen, char** errptr); + +/* BackupEngineOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_options_t* +rocksdb_backup_engine_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_backup_dir( + rocksdb_backup_engine_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_env( + rocksdb_backup_engine_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_share_table_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_share_table_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_sync( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backup_engine_options_get_sync( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_destroy_old_data( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_destroy_old_data( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_log_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_backup_log_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_backup_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_restore_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_restore_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_background_operations( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_background_operations( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_destroy( + rocksdb_backup_engine_options_t*); + +/* Checkpoint */ + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create( + rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir, + uint64_t log_size_for_flush, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy( + rocksdb_checkpoint_t* checkpoint); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_and_trim_history( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char* trim_ts, + size_t trim_tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, const int* ttls, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_open_for_read_only_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( + const rocksdb_options_t* options, const char* name, size_t* lencf, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy( + char** list, size_t len); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family(rocksdb_t* db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy( + rocksdb_column_family_handle_t*); + +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_column_family_handle_get_id(rocksdb_column_family_handle_t* handle); + +extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_handle_get_name( + rocksdb_column_family_handle_t* handle, size_t* name_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_put( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* start_key, + size_t start_key_len, const char* end_key, size_t end_key_len, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_merge( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_write( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); + +/* Returns NULL if not found. A malloc()ed array otherwise. + Stores the length of the array in *vallen. */ +extern ROCKSDB_LIBRARY_API char* rocksdb_get( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_with_ts( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf_with_ts( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr); + +// if values_list[i] == NULL and errs[i] == NULL, +// then we got status.IsNotFound(), which we will not return. +// all errors except status status.ok() and status.IsNotFound() are returned. +// +// errs, values_list and values_list_sizes must be num_keys in length, +// allocated by the caller. +// errs is a list of strings as opposed to the conventional one error, +// where errs[i] is the status for retrieval of keys_list[i]. +// each non-NULL errs entry is a malloc()ed, null terminated string. +// each non-NULL values_list entry is a malloc()ed array, with +// the length for each stored in values_list_sizes[i]. +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get( + rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys, + const char* const* keys_list, const size_t* keys_list_sizes, + char** values_list, size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_with_ts( + rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys, + const char* const* keys_list, const size_t* keys_list_sizes, + char** values_list, size_t* values_list_sizes, char** timestamp_list, + size_t* timestamp_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf_with_ts( + rocksdb_t* db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** timestamps_list, + size_t* timestamps_list_sizes, char** errs); + +// The MultiGet API that improves performance by batching operations +// in the read path for greater efficiency. Currently, only the block based +// table format with full filters are supported. Other table formats such +// as plain table, block based table with block based filters and +// partitioned indexes will still work, but will not get any performance +// benefits. +// +// Note that all the keys passed to this API are restricted to a single +// column family. +// +// Parameters - +// db - the RocksDB instance. +// options - ReadOptions +// column_family - ColumnFamilyHandle* that the keys belong to. All the keys +// passed to the API are restricted to a single column family +// num_keys - Number of keys to lookup +// keys_list - Pointer to C style array of keys with num_keys elements +// keys_list_sizes - Pointer to C style array of the size of corresponding key +// in key_list with num_keys elements. +// values - Pointer to C style array of PinnableSlices with num_keys elements +// statuses - Pointer to C style array of Status with num_keys elements +// sorted_input - If true, it means the input keys are already sorted by key +// order, so the MultiGet() API doesn't have to sort them +// again. If false, the keys will be copied and sorted +// internally by the API - the input array will not be +// modified +extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, size_t num_keys, + const char* const* keys_list, const size_t* keys_list_sizes, + rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input); + +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators( + rocksdb_t* db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( + rocksdb_t* db, const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, + const char* propname); +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int(rocksdb_t* db, const char* propname, + uint64_t* out_val); + +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val); + +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, + const char* start_key, + size_t start_key_len, + const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt( + rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key, + size_t start_key_len, const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db, + const char* name); + +extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush( + rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, + unsigned char sync, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions( + rocksdb_t* db, unsigned char force, char** errptr); + +/* Management operations */ + +extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +/* Iterator */ + +extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid( + const rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*, + const char* k, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*, + const char* k, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key( + const rocksdb_iterator_t*, size_t* klen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value( + const rocksdb_iterator_t*, size_t* vlen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp( + const rocksdb_iterator_t*, size_t* tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( + const rocksdb_iterator_t*, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next( + rocksdb_wal_iterator_t* iter); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid( + const rocksdb_wal_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status( + const rocksdb_wal_iterator_t* iter, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch( + const rocksdb_wal_iterator_t* iter, uint64_t* seq); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_get_latest_sequence_number(rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy( + const rocksdb_wal_iterator_t* iter); + +/* Write batch */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( + const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, + const char* key, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete( + rocksdb_writebatch_t* b, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf_with_ts( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range( + rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev( + rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data( + rocksdb_writebatch_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data( + rocksdb_writebatch_t*, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point( + rocksdb_writebatch_t*, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point( + rocksdb_writebatch_t*, char** errptr); + +/* Write batch with index */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create(size_t reserved_bytes, + unsigned char overwrite_keys); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create_from(const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count( + rocksdb_writebatch_wi_t* b); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range( + rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev( + rocksdb_writebatch_wi_t* b, int num_keys, + const char* const* start_keys_list, const size_t* start_keys_list_sizes, + const char* const* end_keys_list, const size_t* end_keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data( + rocksdb_writebatch_wi_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate( + rocksdb_writebatch_wi_t* b, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data( + rocksdb_writebatch_wi_t* b, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point( + rocksdb_writebatch_wi_t*, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch( + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + const char* key, size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf( + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_wi_t* wbwi, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* cf); + +/* Options utils */ + +// Load the latest rocksdb options from the specified db_path. +// +// On success, num_column_families will be updated with a non-zero +// number indicating the number of column families. +// The returned db_options, column_family_names, and column_family_options +// should be released via rocksdb_load_latest_options_destroy(). +// +// On error, a non-null errptr that includes the error message will be +// returned. db_options, column_family_names, and column_family_options +// will be set to NULL. +extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options( + const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options, + rocksdb_cache_t* cache, rocksdb_options_t** db_options, + size_t* num_column_families, char*** column_family_names, + rocksdb_options_t*** column_family_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy( + rocksdb_options_t* db_options, char** list_column_family_names, + rocksdb_options_t** list_column_family_options, size_t len); + +/* Block based table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* +rocksdb_block_based_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_checksum( + rocksdb_block_based_table_options_t*, char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, + int index_block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, + unsigned char partition_filters); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, + unsigned char use_delta_encoding); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, unsigned char no_block_cache); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version( + rocksdb_block_based_table_options_t*, int); +enum { + rocksdb_block_based_table_index_type_binary_search = 0, + rocksdb_block_based_table_index_type_hash_search = 1, + rocksdb_block_based_table_index_type_two_level_index_search = 2, +}; +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +enum { + rocksdb_block_based_table_data_block_index_type_binary_search = 0, + rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1, +}; +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_data_block_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_data_block_hash_ratio( + rocksdb_block_based_table_options_t* options, double v); +// rocksdb_block_based_options_set_hash_index_allow_collision() +// is removed since BlockBasedTableOptions.hash_index_allow_collision() +// is removed +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); + +/* Cuckoo table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void +rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options); + +/* Options */ +extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count, + const char* const keys[], + const char* const values[], + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, + const char* const keys[], const char* const values[], char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( + rocksdb_options_t* opt, int total_threads); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( + rocksdb_options_t* opt, uint64_t block_cache_size_mb); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_optimize_universal_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( + rocksdb_options_t*, rocksdb_compactionfilter_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( + rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( + rocksdb_options_t*, rocksdb_comparator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( + rocksdb_options_t*, rocksdb_mergeoperator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, const int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_create_missing_column_families(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths( + rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, + rocksdb_env_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, + rocksdb_logger_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_open_files( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( + rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, + int, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( + rocksdb_options_t*, rocksdb_slicetransform_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_target_file_size_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt); + +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache( + rocksdb_options_t* opt, rocksdb_cache_t* blob_cache); + +enum { + rocksdb_prepopulate_blob_disable = 0, + rocksdb_prepopulate_blob_flush_only = 1 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prepopulate_blob_cache( + rocksdb_options_t* opt, int val); + +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_prepopulate_blob_cache( + rocksdb_options_t* opt); + +/* returns a pointer to a malloc()-ed, null terminated string */ +extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_max_subcompactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_log_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_keep_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_arena_block_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( + rocksdb_options_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, + const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*, + double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t*, size_t, int32_t, int32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory( + rocksdb_options_t*, uint32_t, int, double, size_t); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress( + rocksdb_options_t* opt, int level); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_successive_merges(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_bloom_locality(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_inplace_update_support(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_avoid_unnecessary_blocking_io(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_experimental_mempurge_threshold(rocksdb_options_t*); + +enum { + rocksdb_tolerate_corrupted_tail_records_recovery = 0, + rocksdb_absolute_consistency_recovery = 1, + rocksdb_point_in_time_recovery = 2, + rocksdb_skip_any_corrupted_records_recovery = 3 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode( + rocksdb_options_t*); + +enum { + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3, + rocksdb_lz4_compression = 4, + rocksdb_lz4hc_compression = 5, + rocksdb_xpress_compression = 6, + rocksdb_zstd_compression = 7 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression( + rocksdb_options_t*); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1, + rocksdb_fifo_compaction = 2 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_universal_compaction_options( + rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter( + rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( + rocksdb_options_t* opt, rocksdb_cache_t* cache); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_compression( + rocksdb_options_t* opt, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression( + rocksdb_options_t* opt); + +/* RateLimiter */ +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( + rocksdb_ratelimiter_t*); + +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_number_async_seek, + rocksdb_blob_cache_hit_count, + rocksdb_blob_read_count, + rocksdb_blob_read_byte, + rocksdb_blob_read_time, + rocksdb_blob_checksum_time, + rocksdb_blob_decompress_time, + rocksdb_internal_range_del_reseek_count, + rocksdb_total_metric_count = 78 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy( + rocksdb_perfcontext_t* context); + +/* Compaction Filter */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* +rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy( + rocksdb_compactionfilter_t*); + +/* Compaction Filter Context */ + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context); + +/* Compaction Filter Factory */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t* +rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t*); + +/* Comparator */ + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy( + rocksdb_comparator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* +rocksdb_comparator_with_ts_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen), + int (*compare_without_ts)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, + size_t blen, unsigned char b_has_ts), + const char* (*name)(void*), size_t timestamp_size); + +/* Filter policy */ + +extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy( + rocksdb_filterpolicy_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom(double bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(double bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key, + int bloom_before_level); + +/* Merge Operator */ + +extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t* +rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( + rocksdb_mergeoperator_t*); + +/* Read options */ + +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( + rocksdb_readoptions_t*); +// The functionality that this option controlled has been removed. +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp( + rocksdb_readoptions_t*, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts( + rocksdb_readoptions_t*, const char* ts, size_t tslen); + +/* Write options */ + +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( + rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t*); + +/* Compact range options */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* +rocksdb_compactoptions_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( + rocksdb_compactoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_full_history_ts_low( + rocksdb_compactoptions_t*, char* ts, size_t tslen); + +/* Flush options */ + +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( + rocksdb_flushoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( + rocksdb_flushoptions_t*); + +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); + +/* Cache */ + +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits( + rocksdb_lru_cache_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( + size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* +rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( + rocksdb_cache_t* cache, size_t capacity); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_capacity(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_usage(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); + +/* DBPath */ + +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create( + const char* path, uint64_t target_size); +extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); + +/* Env */ + +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( + rocksdb_envoptions_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_create_dir_if_missing( + rocksdb_env_t* env, const char* path, char** errptr); + +/* SstFile */ + +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env, + const rocksdb_options_t* io_options); +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* comparator); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open( + rocksdb_sstfilewriter_t* writer, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_range( + rocksdb_sstfilewriter_t* writer, const char* begin_key, size_t begin_keylen, + const char* end_key, size_t end_keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish( + rocksdb_sstfilewriter_t* writer, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size( + rocksdb_sstfilewriter_t* writer, uint64_t* file_size); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( + rocksdb_sstfilewriter_t* writer); +extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind); +extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + +/* SliceTransform */ + +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create( + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* + rocksdb_slicetransform_create_fixed_prefix(size_t); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create_noop(void); +extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( + rocksdb_slicetransform_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* +rocksdb_fifo_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts); + +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( + const rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t*); + +/* Utility Helpers */ + +extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string( + const rocksdb_options_t* base_options, const char* opts_str, + rocksdb_options_t* new_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +/* MetaData */ + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t* +rocksdb_get_column_family_metadata(rocksdb_t* db); + +/** + * Returns the rocksdb_column_family_metadata_t of the specified + * column family. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_column_family_metadata_destroy. + */ +extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t* +rocksdb_get_column_family_metadata_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_column_family_metadata_destroy( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_column_family_metadata_get_size( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API size_t rocksdb_column_family_metadata_get_file_count( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_metadata_get_name( + rocksdb_column_family_metadata_t* cf_meta); + +extern ROCKSDB_LIBRARY_API size_t +rocksdb_column_family_metadata_get_level_count( + rocksdb_column_family_metadata_t* cf_meta); + +/** + * Returns the rocksdb_level_metadata_t of the ith level from the specified + * column family metadata. + * + * If the specified i is greater than or equal to the number of levels + * in the specified column family, then NULL will be returned. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_level_metadata_destroy before releasing its parent + * rocksdb_column_family_metadata_t. + */ +extern ROCKSDB_LIBRARY_API rocksdb_level_metadata_t* +rocksdb_column_family_metadata_get_level_metadata( + rocksdb_column_family_metadata_t* cf_meta, size_t i); + +/** + * Releases the specified rocksdb_level_metadata_t. + * + * Note that the specified rocksdb_level_metadata_t must be released + * before the release of its parent rocksdb_column_family_metadata_t. + */ +extern ROCKSDB_LIBRARY_API void rocksdb_level_metadata_destroy( + rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API int rocksdb_level_metadata_get_level( + rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta); + +extern ROCKSDB_LIBRARY_API size_t +rocksdb_level_metadata_get_file_count(rocksdb_level_metadata_t* level_meta); + +/** + * Returns the sst_file_metadata_t of the ith file from the specified level + * metadata. + * + * If the specified i is greater than or equal to the number of files + * in the specified level, then NULL will be returned. + * + * Note that the caller is responsible to release the returned memory + * using rocksdb_sst_file_metadata_destroy before releasing its + * parent rocksdb_level_metadata_t. + */ +extern ROCKSDB_LIBRARY_API rocksdb_sst_file_metadata_t* +rocksdb_level_metadata_get_sst_file_metadata( + rocksdb_level_metadata_t* level_meta, size_t i); + +/** + * Releases the specified rocksdb_sst_file_metadata_t. + * + * Note that the specified rocksdb_sst_file_metadata_t must be released + * before the release of its parent rocksdb_level_metadata_t. + */ +extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_metadata_destroy( + rocksdb_sst_file_metadata_t* file_meta); + +extern ROCKSDB_LIBRARY_API char* +rocksdb_sst_file_metadata_get_relative_filename( + rocksdb_sst_file_metadata_t* file_meta); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta); + +/** + * Returns the smallest key of the specified sst file. + * The caller is responsible for releasing the returned memory. + * + * @param file_meta the metadata of an SST file to obtain its smallest key. + * @param len the out value which will contain the length of the returned key + * after the function call. + */ +extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_smallestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* len); + +/** + * Returns the smallest key of the specified sst file. + * The caller is responsible for releasing the returned memory. + * + * @param file_meta the metadata of an SST file to obtain its smallest key. + * @param len the out value which will contain the length of the returned key + * after the function call. + */ +extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_largestkey( + rocksdb_sst_file_metadata_t* file_meta, size_t* len); + +/* Transactions */ + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* +rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot( + rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value( + rocksdb_transactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int( + rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( + rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_transaction_options_t* txn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t** +rocksdb_transactiondb_get_prepared_transactions(rocksdb_transactiondb_t* txn_db, + size_t* cnt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_name( + rocksdb_transaction_t* txn, const char* name, size_t name_len, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_name( + rocksdb_transaction_t* txn, size_t* name_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_prepare( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_transaction_get_writebatch_wi(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch( + rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch, + char** errptr); + +// This rocksdb_writebatch_wi_t should be freed with rocksdb_free +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch_wi( + rocksdb_transaction_t* txn, rocksdb_writebatch_wi_t* wi, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_commit_timestamp( + rocksdb_transaction_t* txn, uint64_t commit_timestamp); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_set_read_timestamp_for_validation( + rocksdb_transaction_t* txn, uint64_t read_timestamp); + +// This snapshot should be freed using rocksdb_free +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( + rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( + rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal( + rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, + const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db( + rocksdb_t* base_db); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* +rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr); + +/* Transaction Options */ + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* +rocksdb_transactiondb_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* +rocksdb_transaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( + rocksdb_transaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_skip_prepare( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val); + +// referring to convention (3), this should be used by client +// to free memory that was malloc()ed +extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( + rocksdb_pinnableslice_t* v); +extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( + const rocksdb_pinnableslice_t* t, size_t* vlen); + +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* +rocksdb_memory_consumers_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats( + rocksdb_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*, + unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work( + rocksdb_t* db, unsigned char wait); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h new file mode 100644 index 000000000..575d276b5 --- /dev/null +++ b/src/rocksdb/include/rocksdb/cache.h @@ -0,0 +1,775 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/compression_type.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ConfigOptions; +class Logger; +class SecondaryCache; + +// Classifications of block cache entries. +// +// Developer notes: Adding a new enum to this class requires corresponding +// updates to `kCacheEntryRoleToCamelString` and +// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since +// `kNumCacheEntryRoles` assumes `kMisc` comes last. +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // OBSOLETE / DEPRECATED: old/removed block-based filter + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager's charge to account for its memtable usage + kWriteBuffer, + // Compression dictionary building buffer's charge to account for + // its memory usage + kCompressionDictionaryBuildingBuffer, + // Filter's charge to account for + // (new) bloom and ribbon filter construction's memory usage + kFilterConstruction, + // BlockBasedTableReader's charge to account for its memory usage + kBlockBasedTableReader, + // FileMetadata's charge to account for its memory usage + kFileMetadata, + // Blob value (when using the same cache as block cache and blob cache) + kBlobValue, + // Blob cache's charge to account for its memory usage (when using a + // separate block cache and blob cache) + kBlobCache, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast(CacheEntryRole::kMisc) + 1; + +// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`. +const std::string& GetCacheEntryRoleName(CacheEntryRole); + +// For use with `GetMapProperty()` for property +// `DB::Properties::kBlockCacheEntryStats`. On success, the map will +// be populated with all keys that can be obtained from these functions. +struct BlockCacheEntryStatsMapKeys { + static const std::string& CacheId(); + static const std::string& CacheCapacityBytes(); + static const std::string& LastCollectionDurationSeconds(); + static const std::string& LastCollectionAgeSeconds(); + + static std::string EntryCount(CacheEntryRole); + static std::string UsedBytes(CacheEntryRole); + static std::string UsedPercent(CacheEntryRole); +}; + +extern const bool kDefaultToAdaptiveMutex; + +enum CacheMetadataChargePolicy { + // Only the `charge` of each entry inserted into a Cache counts against + // the `capacity` + kDontChargeCacheMetadata, + // In addition to the `charge`, the approximate space overheads in the + // Cache (in bytes) also count against `capacity`. These space overheads + // are for supporting fast Lookup and managing the lifetime of entries. + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + +// Options shared betweeen various cache implementations that +// divide the key space into shards using hashing. +struct ShardedCacheOptions { + // Capacity of the cache, in the same units as the `charge` of each entry. + // This is typically measured in bytes, but can be a different unit if using + // kDontChargeCacheMetadata. + size_t capacity = 0; + + // Cache is sharded into 2^num_shard_bits shards, by hash of key. + // If < 0, a good default is chosen based on the capacity and the + // implementation. (Mutex-based implementations are much more reliant + // on many shards for parallel scalability.) + int num_shard_bits = -1; + + // If strict_capacity_limit is set, Insert() will fail if there is not + // enough capacity for the new entry along with all the existing referenced + // (pinned) cache entries. (Unreferenced cache entries are evicted as + // needed, sometimes immediately.) If strict_capacity_limit == false + // (default), Insert() never fails. + bool strict_capacity_limit = false; + + // If non-nullptr, RocksDB will use this allocator instead of system + // allocator when allocating memory for cache blocks. + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). + std::shared_ptr memory_allocator; + + // See CacheMetadataChargePolicy + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + ShardedCacheOptions() {} + ShardedCacheOptions( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + std::shared_ptr _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : capacity(_capacity), + num_shard_bits(_num_shard_bits), + strict_capacity_limit(_strict_capacity_limit), + memory_allocator(std::move(_memory_allocator)), + metadata_charge_policy(_metadata_charge_policy) {} +}; + +struct LRUCacheOptions : public ShardedCacheOptions { + // Ratio of cache reserved for high-priority and low-priority entries, + // respectively. (See Cache::Priority below more information on the levels.) + // Valid values are between 0 and 1 (inclusive), and the sum of the two + // values cannot exceed 1. + // + // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU + // list is maintained by the cache. Similarly, if low_pri_pool_ratio is + // greater than zero, a dedicated low-priority LRU list is maintained. + // There is also a bottom-priority LRU list, which is always enabled and not + // explicitly configurable. Entries are spilled over to the next available + // lower-priority pool if a certain pool's capacity is exceeded. + // + // Entries with cache hits are inserted into the highest priority LRU list + // available regardless of the entry's priority. Entries without hits + // are inserted into highest priority LRU list available whose priority + // does not exceed the entry's priority. (For example, high-priority items + // with no hits are placed in the high-priority pool if available; + // otherwise, they are placed in the low-priority pool if available; + // otherwise, they are placed in the bottom-priority pool.) This results + // in lower-priority entries without hits getting evicted from the cache + // sooner. + // + // Default values: high_pri_pool_ratio = 0.5 (which is referred to as + // "midpoint insertion"), low_pri_pool_ratio = 0 + double high_pri_pool_ratio = 0.5; + double low_pri_pool_ratio = 0.0; + + // Whether to use adaptive mutexes for cache shards. Note that adaptive + // mutexes need to be supported by the platform in order for this to have any + // effect. The default value is true if RocksDB is compiled with + // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. + bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + + // A SecondaryCache instance to use a the non-volatile tier. + std::shared_ptr secondary_cache; + + LRUCacheOptions() {} + LRUCacheOptions(size_t _capacity, int _num_shard_bits, + bool _strict_capacity_limit, double _high_pri_pool_ratio, + std::shared_ptr _memory_allocator = nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + double _low_pri_pool_ratio = 0.0) + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), + high_pri_pool_ratio(_high_pri_pool_ratio), + low_pri_pool_ratio(_low_pri_pool_ratio), + use_adaptive_mutex(_use_adaptive_mutex) {} +}; + +// Create a new cache with a fixed size capacity. The cache is sharded +// to 2^num_shard_bits shards, by hash of the key. The total capacity +// is divided and evenly assigned to each shard. If strict_capacity_limit +// is set, insert to the cache will fail when cache is full. User can also +// set percentage of the cache reserves for high priority entries via +// high_pri_pool_pct. +// num_shard_bits = -1 means it is automatically determined: every shard +// will be at least 512KB and number of shard bits will not exceed 6. +extern std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, + std::shared_ptr memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + double low_pri_pool_ratio = 0.0); + +extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); + +// EXPERIMENTAL +// Options structure for configuring a SecondaryCache instance based on +// LRUCache. The LRUCacheOptions.secondary_cache is not used and +// should not be set. +struct CompressedSecondaryCacheOptions : LRUCacheOptions { + // The compression method (if any) that is used to compress data. + CompressionType compression_type = CompressionType::kLZ4Compression; + + // compress_format_version can have two values: + // compress_format_version == 1 -- decompressed size is not included in the + // block header. + // compress_format_version == 2 -- decompressed size is included in the block + // header in varint32 format. + uint32_t compress_format_version = 2; + + // Enable the custom split and merge feature, which split the compressed value + // into chunks so that they may better fit jemalloc bins. + bool enable_custom_split_merge = false; + + CompressedSecondaryCacheOptions() {} + CompressedSecondaryCacheOptions( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + double _high_pri_pool_ratio, double _low_pri_pool_ratio = 0.0, + std::shared_ptr _memory_allocator = nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + CompressionType _compression_type = CompressionType::kLZ4Compression, + uint32_t _compress_format_version = 2, + bool _enable_custom_split_merge = false) + : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + _high_pri_pool_ratio, std::move(_memory_allocator), + _use_adaptive_mutex, _metadata_charge_policy, + _low_pri_pool_ratio), + compression_type(_compression_type), + compress_format_version(_compress_format_version), + enable_custom_split_merge(_enable_custom_split_merge) {} +}; + +// EXPERIMENTAL +// Create a new Secondary Cache that is implemented on top of LRUCache. +extern std::shared_ptr NewCompressedSecondaryCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, + double low_pri_pool_ratio = 0.0, + std::shared_ptr memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + CompressionType compression_type = CompressionType::kLZ4Compression, + uint32_t compress_format_version = 2, + bool enable_custom_split_merge = false); + +extern std::shared_ptr NewCompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts); + +// HyperClockCache - A lock-free Cache alternative for RocksDB block cache +// that offers much improved CPU efficiency vs. LRUCache under high parallel +// load or high contention, with some caveats: +// * Not a general Cache implementation: can only be used for +// BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is +// compatible with HyperClockCache. +// * Requires an extra tuning parameter: see estimated_entry_charge below. +// Similarly, substantially changing the capacity with SetCapacity could +// harm efficiency. +// * SecondaryCache is not yet supported. +// * Cache priorities are less aggressively enforced, which could cause +// cache dilution from long range scans (unless they use fill_cache=false). +// * Can be worse for small caches, because if almost all of a cache shard is +// pinned (more likely with non-partitioned filters), then CLOCK eviction +// becomes very CPU intensive. +// +// See internal cache/clock_cache.h for full description. +struct HyperClockCacheOptions : public ShardedCacheOptions { + // The estimated average `charge` associated with cache entries. This is a + // critical configuration parameter for good performance from the hyper + // cache, because having a table size that is fixed at creation time greatly + // reduces the required synchronization between threads. + // * If the estimate is substantially too low (e.g. less than half the true + // average) then metadata space overhead with be substantially higher (e.g. + // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this + // can slightly reduce cache hit rates, and slightly reduce access times due + // to the larger working memory size. + // * If the estimate is substantially too high (e.g. 25% higher than the true + // average) then there might not be sufficient slots in the hash table for + // both efficient operation and capacity utilization (hit rate). The hyper + // cache will evict entries to prevent load factors that could dramatically + // affect lookup times, instead letting the hit rate suffer by not utilizing + // the full capacity. + // + // A reasonable choice is the larger of block_size and metadata_block_size. + // When WriteBufferManager (and similar) charge memory usage to the block + // cache, this can lead to the same effect as estimate being too low, which + // is better than the opposite. Therefore, the general recommendation is to + // assume that other memory charged to block cache could be negligible, and + // ignore it in making the estimate. + // + // The best parameter choice based on a cache in use is given by + // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as + // with kDontChargeCacheMetadata. More precisely with + // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) / + // GetOccupancyCount(). However, when the average value size might vary + // (e.g. balance between metadata and data blocks in cache), it is better + // to estimate toward the lower side than the higher side. + size_t estimated_entry_charge; + + HyperClockCacheOptions( + size_t _capacity, size_t _estimated_entry_charge, + int _num_shard_bits = -1, bool _strict_capacity_limit = false, + std::shared_ptr _memory_allocator = nullptr, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, + std::move(_memory_allocator), + _metadata_charge_policy), + estimated_entry_charge(_estimated_entry_charge) {} + + // Construct an instance of HyperClockCache using these options + std::shared_ptr MakeSharedCache() const; +}; + +// DEPRECATED - The old Clock Cache implementation had an unresolved bug and +// has been removed. The new HyperClockCache requires an additional +// configuration parameter that is not provided by this API. This function +// simply returns a new LRUCache for functional compatibility. +extern std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); + +class Cache { + public: // opaque types + // Opaque handle to an entry stored in the cache. + struct Handle {}; + + public: // type defs + // Depending on implementation, cache entries with higher priority levels + // could be less likely to get evicted than entries with lower priority + // levels. The "high" priority level applies to certain SST metablocks (e.g. + // index and filter blocks) if the option + // cache_index_and_filter_blocks_with_high_priority is set. The "low" priority + // level is used for other kinds of SST blocks (most importantly, data + // blocks), as well as the above metablocks in case + // cache_index_and_filter_blocks_with_high_priority is + // not set. The "bottom" priority level is for BlobDB's blob values. + enum class Priority { HIGH, LOW, BOTTOM }; + + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. + // + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + using CreateCallback = std::function; + + public: // ctor/dtor/create + Cache(std::shared_ptr allocator = nullptr) + : memory_allocator_(std::move(allocator)) {} + // No copying allowed + Cache(const Cache&) = delete; + Cache& operator=(const Cache&) = delete; + + // Destroys all remaining entries by calling the associated "deleter" + virtual ~Cache() {} + + // Creates a new Cache based on the input value string and returns the result. + // Currently, this method can be used to create LRUCaches only + // @param config_options + // @param value The value might be: + // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102( + // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; + // For the LRUCache, the values are defined in LRUCacheOptions. + // @param result The new Cache object + // @return OK if the cache was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + public: // functions + // The type of the Cache + virtual const char* Name() const = 0; + + // EXPERIMENTAL SecondaryCache support: + // Some APIs here are experimental and might change in the future. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation. They rely on a + // per object CacheItemHelper to do the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->value into the volatile cache only + // and assign it with the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::MemoryLimit. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter" which must delete the value. + // (The Cache is responsible for copying and reclaiming space for + // the key.) + virtual Status Insert(const Slice& key, void* value, size_t charge, + DeleterFn deleter, Handle** handle = nullptr, + Priority priority = Priority::LOW) = 0; + + // EXPERIMENTAL + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. If + // strict_capacity_limit is true and cache reaches its full capacity, + // return Status::MemoryLimit. `value` must be non-nullptr for this + // Insert() because Value() == nullptr is reserved for indicating failure + // with secondary-cache-compatible mappings. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or promoted to the secondary cache. It, + // therefore, must outlive the cache. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) { + if (!helper) { + return Status::InvalidArgument(); + } + return Insert(key, value, charge, helper->del_cb, handle, priority); + } + + // If the cache has no mapping for "key", returns nullptr. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // If stats is not nullptr, relative tickers could be used inside the + // function. + virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0; + + // EXPERIMENTAL + // Lookup the key in the primary and secondary caches (if one is configured). + // The create_cb callback function object will be used to contruct the + // cached object. + // If none of the caches have the mapping for the key, returns nullptr. + // Else, returns a handle that corresponds to the mapping. + // + // This call may promote the object from the secondary cache (if one is + // configured, and has the given key) to the primary cache. + // + // The helper argument should be provided if the caller wants the lookup + // to include the secondary cache (if one is configured) and the object, + // if it exists, to be promoted to the primary cache. The helper may be + // saved and used later when the object is evicted. Therefore, it must + // outlive the cache. + // + // ======================== Async Lookup (wait=false) ====================== + // When wait=false, the handle returned might be in any of three states: + // * Present - If Value() != nullptr, then the result is present and + // the handle can be used just as if wait=true. + // * Pending, not ready (IsReady() == false) - secondary cache is still + // working to retrieve the value. Might become ready any time. + // * Pending, ready (IsReady() == true) - secondary cache has the value + // but it has not been loaded into primary cache. Call to Wait()/WaitAll() + // will not block. + // + // IMPORTANT: Pending handles are not thread-safe, and only these functions + // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release() + // can only come after Wait() or WaitAll() even though a reference is held. + // + // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is + // safe and has no effect on other handle states.) After waiting on a Handle, + // it is in one of two states: + // * Present - if Value() != nullptr + // * Failed - if Value() == nullptr, such as if the secondary cache + // initially thought it had the value but actually did not. + // + // Note that given an arbitrary Handle, the only way to distinguish the + // Pending+ready state from the Failed state is to Wait() on it. A cache + // entry not compatible with secondary cache can also have Value()==nullptr + // like the Failed state, but this is not generally a concern. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Increments the reference count for the handle if it refers to an entry in + // the cache. Returns true if refcount was incremented; otherwise, returns + // false. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Ref(Handle* handle) = 0; + + /** + * Release a mapping returned by a previous Lookup(). A released entry might + * still remain in cache in case it is later looked up by others. If + * erase_if_last_ref is set then it also erases it from the cache if there is + * no other reference to it. Erasing it should call the deleter function that + * was provided when the entry was inserted. + * + * Returns true if the entry was also erased. + */ + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains the entry for the key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + // Return a new numeric id. May be used by multiple clients who are + // sharding the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // sets the maximum configured capacity of the cache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage + virtual void SetCapacity(size_t capacity) = 0; + + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Get the flag whether to return error on insertion when cache reaches its + // full capacity. + virtual bool HasStrictCapacityLimit() const = 0; + + // Returns the maximum configured capacity of the cache + virtual size_t GetCapacity() const = 0; + + // Returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). + virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + + // Returns the memory size for a specific entry in the cache. + virtual size_t GetUsage(Handle* handle) const = 0; + + // Returns the memory size for the entries in use by the system + virtual size_t GetPinnedUsage() const = 0; + + // Returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData() { + // default implementation is noop + } + + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } + + // Remove all entries. + // Prerequisite: no entry is referenced. + virtual void EraseUnRefEntries() = 0; + + virtual std::string GetPrintableOptions() const { return ""; } + + // Check for any warnings or errors in the operation of the cache and + // report them to the logger. This is intended only to be called + // periodically so does not need to be very efficient. (Obscure calling + // conventions for Logger inherited from env.h) + virtual void ReportProblems( + const std::shared_ptr& /*info_log*/) const {} + + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + + // EXPERIMENTAL + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. As noted elsewhere, + // "pending" handles require Wait()/WaitAll() before Release(). + virtual bool Release(Handle* handle, bool /*useful*/, + bool erase_if_last_ref) { + return Release(handle, erase_if_last_ref); + } + + // EXPERIMENTAL + // Determines if the handle returned by Lookup() can give a value without + // blocking, though Wait()/WaitAll() might be required to publish it to + // Value(). See secondary cache compatible Lookup() above for details. + // This call is not thread safe on "pending" handles. + virtual bool IsReady(Handle* /*handle*/) { return true; } + + // EXPERIMENTAL + // Convert a "pending" handle into a full thread-shareable handle by + // * If necessary, wait until secondary cache finishes loading the value. + // * Construct the value for primary cache and set it in the handle. + // Even after Wait() on a pending handle, the caller must check for + // Value() == nullptr in case of failure. This call is not thread-safe + // on pending handles. This call has no effect on non-pending handles. + // See secondary cache compatible Lookup() above for details. + virtual void Wait(Handle* /*handle*/) {} + + // EXPERIMENTAL + // Wait for a vector of handles to become ready. As with Wait(), the user + // should check the Value() of each handle for nullptr. This call is not + // thread-safe on pending handles. + virtual void WaitAll(std::vector& /*handles*/) {} + + private: + std::shared_ptr memory_allocator_; +}; + + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/cache_bench_tool.h b/src/rocksdb/include/rocksdb/cache_bench_tool.h new file mode 100644 index 000000000..413ce1593 --- /dev/null +++ b/src/rocksdb/include/rocksdb/cache_bench_tool.h @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h new file mode 100644 index 000000000..afc736673 --- /dev/null +++ b/src/rocksdb/include/rocksdb/cleanable.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cleanable { + public: + Cleanable(); + // No copy constructor and copy assignment allowed. + Cleanable(Cleanable&) = delete; + Cleanable& operator=(Cleanable&) = delete; + + // Executes all the registered cleanups + ~Cleanable(); + + // Move constructor and move assignment is allowed. + Cleanable(Cleanable&&) noexcept; + Cleanable& operator=(Cleanable&&) noexcept; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + using CleanupFunction = void (*)(void* arg1, void* arg2); + + // Add another Cleanup to the list + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + // Move the cleanups owned by this Cleanable to another Cleanable, adding to + // any existing cleanups it has + void DelegateCleanupsTo(Cleanable* other); + + // DoCleanup and also resets the pointers for reuse + inline void Reset() { + DoCleanup(); + cleanup_.function = nullptr; + cleanup_.next = nullptr; + } + + inline bool HasCleanups() { return cleanup_.function != nullptr; } + + protected: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + // It also becomes the owner of c + void RegisterCleanup(Cleanup* c); + + private: + // Performs all the cleanups. It does not reset the pointers. Making it + // private + // to prevent misuse + inline void DoCleanup() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr;) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } + } +}; + +// A copyable, reference-counted pointer to a simple Cleanable that only +// performs registered cleanups after all copies are destroy. This is like +// shared_ptr but works more efficiently with wrapping the pointer +// in an outer Cleanable (see RegisterCopyWith() and MoveAsCleanupTo()). +// WARNING: if you create a reference cycle, for example: +// SharedCleanablePtr scp; +// scp.Allocate(); +// scp.RegisterCopyWith(&*scp); +// It will prevent cleanups from ever happening! +class SharedCleanablePtr { + public: + // Empy/null pointer + SharedCleanablePtr() {} + // Copy and move constructors and assignment + SharedCleanablePtr(const SharedCleanablePtr& from); + SharedCleanablePtr(SharedCleanablePtr&& from) noexcept; + SharedCleanablePtr& operator=(const SharedCleanablePtr& from); + SharedCleanablePtr& operator=(SharedCleanablePtr&& from) noexcept; + // Destructor (decrement refcount if non-null) + ~SharedCleanablePtr(); + // Create a new simple Cleanable and make this assign this pointer to it. + // (Reset()s first if necessary.) + void Allocate(); + // Reset to empty/null (decrement refcount if previously non-null) + void Reset(); + // Dereference to pointed-to Cleanable + Cleanable& operator*(); + Cleanable* operator->(); + // Get as raw pointer to Cleanable + Cleanable* get(); + + // Creates a (virtual) copy of this SharedCleanablePtr and registers its + // destruction with target, so that the cleanups registered with the + // Cleanable pointed to by this can only happen after the cleanups in the + // target Cleanable are run. + // No-op if this is empty (nullptr). + void RegisterCopyWith(Cleanable* target); + + // Moves (virtually) this shared pointer to a new cleanup in the target. + // This is essentilly a move semantics version of RegisterCopyWith(), for + // performance optimization. No-op if this is empty (nullptr). + void MoveAsCleanupTo(Cleanable* target); + + private: + struct Impl; + Impl* ptr_ = nullptr; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h new file mode 100644 index 000000000..9c6a9c30d --- /dev/null +++ b/src/rocksdb/include/rocksdb/compaction_filter.h @@ -0,0 +1,256 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; + +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilter : public Customizable { + public: + enum ValueType { + kValue, + kMergeOperand, + kBlobIndex, // used internally by BlobDB. + }; + + enum class Decision { + kKeep, + kRemove, + kChangeValue, + kRemoveAndSkipUntil, + kChangeBlobIndex, // used internally by BlobDB. + kIOError, // used internally by BlobDB. + kPurge, // used for keys that can only be SingleDelete'ed + kUndetermined, + }; + + enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; + + // Context information for a table file creation. + struct Context { + // Whether this table file is created as part of a compaction including all + // table files. + bool is_full_compaction; + // Whether this table file is created as part of a compaction requested by + // the client. + bool is_manual_compaction; + // The column family that will contain the created table file. + uint32_t column_family_id; + // Reason this table file is being created. + TableFileCreationReason reason; + }; + + virtual ~CompactionFilter() {} + static const char* Type() { return "CompactionFilter"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& name, + const CompactionFilter** result); + + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. + // + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. + // + // When the value is to be preserved, the application has the option + // to modify the existing_value and pass it back through new_value. + // value_changed needs to be set to true in this case. + // + // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a + // DB* object) will not guarantee to preserve the state of the DB with + // CompactionFilter. Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. + // + // If multithreaded compaction is being used *and* a single CompactionFilter + // instance was supplied via Options::compaction_filter, this method may be + // called from different threads concurrently. The application must ensure + // that the call is thread-safe. + // + // If the CompactionFilter was created by a factory, then it will only ever + // be used by a single thread that is doing the table file creation, and this + // call does not need to be thread-safe. However, multiple filters may be + // in existence and operating concurrently. + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const { + return false; + } + + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. + // + // Note: If you are using a TransactionDB, it is not recommended to implement + // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB + // may not realize there is a write conflict and may allow a Transaction to + // Commit that should have failed. Instead, it is better to implement any + // Merge filtering inside the MergeOperator. + virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const { + return false; + } + + // An extended API. Called for both values and merge operands. + // Allows changing value and skipping ranges of keys. + // The default implementation uses Filter() and FilterMergeOperand(). + // If you're overriding this method, no need to override the other two. + // `value_type` indicates whether this key-value corresponds to a normal + // value (e.g. written with Put()) or a merge operand (written with Merge()). + // + // Possible return values: + // * kKeep - keep the key-value pair. + // * kRemove - remove the key-value pair or merge operand. + // * kChangeValue - keep the key and change the value/operand to *new_value. + // * kRemoveAndSkipUntil - remove this key-value pair, and also remove + // all key-value pairs with key in [key, *skip_until). This range + // of keys will be skipped without reading, potentially saving some + // IO operations compared to removing the keys one by one. + // + // *skip_until <= key is treated the same as Decision::kKeep + // (since the range [key, *skip_until) is empty). + // + // Caveats: + // - The keys are skipped even if there are snapshots containing them, + // i.e. values removed by kRemoveAndSkipUntil can disappear from a + // snapshot - beware if you're using TransactionDB or + // DB::GetSnapshot(). + // - If value for a key was overwritten or merged into (multiple Put()s + // or Merge()s), and `CompactionFilter` skips this key with + // kRemoveAndSkipUntil, it's possible that it will remove only + // the new value, exposing the old value that was supposed to be + // overwritten. + // - Doesn't work with PlainTableFactory in prefix mode. + // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. + // + // Should never return kUndetermined. + // Note: If you are using a TransactionDB, it is not recommended to filter + // out or modify merge operands (ValueType::kMergeOperand). + // If a merge operation is filtered out, TransactionDB may not realize there + // is a write conflict and may allow a Transaction to Commit that should have + // failed. Instead, it is better to implement any Merge filtering inside the + // MergeOperator. + // key includes timestamp if user-defined timestamp is enabled. + virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + switch (value_type) { + case ValueType::kValue: { + bool value_changed = false; + bool rv = Filter(level, key, existing_value, new_value, &value_changed); + if (rv) { + return Decision::kRemove; + } + return value_changed ? Decision::kChangeValue : Decision::kKeep; + } + case ValueType::kMergeOperand: { + bool rv = FilterMergeOperand(level, key, existing_value); + return rv ? Decision::kRemove : Decision::kKeep; + } + case ValueType::kBlobIndex: + return Decision::kKeep; + } + assert(false); + return Decision::kKeep; + } + + // Internal (BlobDB) use only. Do not override in application code. + virtual BlobDecision PrepareBlobOutput(const Slice& /* key */, + const Slice& /* existing_value */, + std::string* /* new_value */) const { + return BlobDecision::kKeep; + } + + // This function is deprecated. Snapshots will always be ignored for + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. + virtual bool IgnoreSnapshots() const { return true; } + + // Returns a name that identifies this `CompactionFilter`. + // The name will be printed to LOG file on start up for diagnosis. + const char* Name() const override = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value. Keys whose value_type is + // kBlobIndex will be checked by this method. + // Returning kUndetermined will cause FilterV2() to be called to make a + // decision as usual. + virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } +}; + +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilterFactory : public Customizable { + public: + virtual ~CompactionFilterFactory() {} + static const char* Type() { return "CompactionFilterFactory"; } + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& name, + std::shared_ptr* result); + + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this `CompactionFilter` factory. + virtual const char* Name() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h new file mode 100644 index 000000000..5ff8eccc8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +struct CompactionJobStats { + CompactionJobStats() { Reset(); } + void Reset(); + // Aggregate the CompactionJobStats from another instance with this one + void Add(const CompactionJobStats& stats); + + // the elapsed time of this compaction in microseconds. + uint64_t elapsed_micros; + + // the elapsed CPU time of this compaction in microseconds. + uint64_t cpu_micros; + + // the number of compaction input records. + uint64_t num_input_records; + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) + size_t num_input_files; + // the number of compaction input files at the output level (table files) + size_t num_input_files_at_output_level; + + // the number of compaction output records. + uint64_t num_output_records; + // the number of compaction output files (table files) + size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; + + // true if the compaction is a full compaction (all live SST files input) + bool is_full_compaction; + // true if the compaction is a manual compaction + bool is_manual_compaction; + + // the total size of table files in the compaction input + uint64_t total_input_bytes; + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output + uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; + + // number of records being replaced by newer record associated with same key. + // this could be a new value or a deletion entry for that key so this field + // sums up all updated and deleted keys + uint64_t num_records_replaced; + + // the sum of the uncompressed input keys in bytes. + uint64_t total_input_raw_key_bytes; + // the sum of the uncompressed input values in bytes. + uint64_t total_input_raw_value_bytes; + + // the number of deletion entries before compaction. Deletion entries + // can disappear after compaction because they expired + uint64_t num_input_deletion_records; + // number of deletion records that were found obsolete and discarded + // because it is not possible to delete any more keys with this entry + // (i.e. all possible deletions resulting from it have been completed) + uint64_t num_expired_deletion_records; + + // number of corrupt keys (ParseInternalKey returned false when applied to + // the key) encountered and written out. + uint64_t num_corrupt_keys; + + // Following counters are only populated if + // options.report_bg_io_stats = true; + + // Time spent on file's Append() call. + uint64_t file_write_nanos; + + // Time spent on sync file range. + uint64_t file_range_sync_nanos; + + // Time spent on file fsync. + uint64_t file_fsync_nanos; + + // Time spent on preparing file write (fallocate, etc) + uint64_t file_prepare_write_nanos; + + // 0-terminated strings storing the first 8 bytes of the smallest and + // largest key in the output. + static const size_t kMaxPrefixLength = 8; + + std::string smallest_output_key_prefix; + std::string largest_output_key_prefix; + + // number of single-deletes which do not meet a put + uint64_t num_single_del_fallthru; + + // number of single-deletes which meet something other than a put + uint64_t num_single_del_mismatch; + + // TODO: Add output_to_penultimate_level output information +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h new file mode 100644 index 000000000..ad1e71a11 --- /dev/null +++ b/src/rocksdb/include/rocksdb/comparator.h @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// The general interface for comparing two Slices are defined for both of +// Comparator and some internal data structures. +class CompareInterface { + public: + virtual ~CompareInterface() {} + + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + // Note that Compare(a, b) also compares timestamp if timestamp size is + // non-zero. For the same user key with different timestamps, larger (newer) + // timestamp comes first. + virtual int Compare(const Slice& a, const Slice& b) const = 0; +}; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since rocksdb may invoke its methods concurrently +// from multiple threads. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Comparator : public Customizable, public CompareInterface { + public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + + ~Comparator() override {} + + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + const Comparator** comp); + static const char* Type() { return "Comparator"; } + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + const char* Name() const override = 0; + + // Compares two slices for equality. The following invariant should always + // hold (and is the default implementation): + // Equal(a, b) iff Compare(a, b) == 0 + // Overwrite only if equality comparisons can be done more efficiently than + // three-way comparisons. + virtual bool Equal(const Slice& a, const Slice& b) const { + return Compare(a, b) == 0; + } + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; + + // given two keys, determine if t is the successor of s + // BUG: only return true if no other keys starting with `t` are ordered + // before `t`. Otherwise, the auto_prefix_mode can omit entries within + // iterator bounds that have same prefix as upper bound but different + // prefix from seek key. + virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, + const Slice& /*t*/) const { + return false; + } + + // return true if two keys with different byte sequences can be regarded + // as equal by this comparator. + // The major use case is to determine if DataBlockHashIndex is compatible + // with the customized comparator. + virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + // if it is a wrapped comparator, may return the root one. + // return itself it is not wrapped. + virtual const Comparator* GetRootComparator() const { return this; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + // For two events e1 and e2 whose timestamps are t1 and t2 respectively, + // Returns value: + // < 0 iff t1 < t2 + // == 0 iff t1 == t2 + // > 0 iff t1 > t2 + // Note that an all-zero byte array will be the smallest (oldest) timestamp + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, + const Slice& b, bool /*b_has_ts*/) const { + return Compare(a, b); + } + + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + private: + size_t timestamp_size_; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compression_type.h b/src/rocksdb/include/rocksdb/compression_type.h new file mode 100644 index 000000000..bfeb00bde --- /dev/null +++ b/src/rocksdb/include/rocksdb/compression_type.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. + +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. + kDisableCompressionOption = 0xff, +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h new file mode 100644 index 000000000..9ad741f98 --- /dev/null +++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h @@ -0,0 +1,51 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is NOT an extensible interface but a public interface for result of +// NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal. +class ConcurrentTaskLimiter { + public: + virtual ~ConcurrentTaskLimiter() {} + + // Returns a name that identifies this concurrent task limiter. + virtual const std::string& GetName() const = 0; + + // Set max concurrent tasks. + // limit = 0 means no new task allowed. + // limit < 0 means no limitation. + virtual void SetMaxOutstandingTask(int32_t limit) = 0; + + // Reset to unlimited max concurrent task. + virtual void ResetMaxOutstandingTask() = 0; + + // Returns current outstanding task count. + virtual int32_t GetOutstandingTask() const = 0; +}; + +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs +// across RocksDB instances to control concurrent tasks. +// +// @param name: Name of the limiter. +// @param limit: max concurrent tasks. +// limit = 0 means no new task allowed. +// limit < 0 means no limitation. +extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/configurable.h b/src/rocksdb/include/rocksdb/configurable.h new file mode 100644 index 000000000..60ae89f97 --- /dev/null +++ b/src/rocksdb/include/rocksdb/configurable.h @@ -0,0 +1,400 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class ObjectRegistry; +class OptionTypeInfo; +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; + +// Configurable is a base class used by the rocksdb that describes a +// standard way of configuring objects. A Configurable object can: +// -> Populate itself given: +// - One or more "name/value" pair strings +// - A string representing the set of name=value properties +// - A map of name/value properties. +// -> Convert itself into its string representation +// -> Dump itself to a Logger +// -> Compare itself to another Configurable object to see if the two objects +// have equivalent options settings +// +// If a derived class calls RegisterOptions to register (by name) how its +// options objects are to be processed, this functionality can typically be +// handled by this class without additional overrides. Otherwise, the derived +// class will need to implement the methods for handling the corresponding +// functionality. +class Configurable { + protected: + friend class ConfigurableHelper; + struct RegisteredOptions { + // The name of the options being registered + std::string name; + // Pointer to the object being registered + void* opt_ptr; +#ifndef ROCKSDB_LITE + // The map of options being registered + const std::unordered_map* type_map; +#endif + }; + + public: + virtual ~Configurable() {} + + // Returns the raw pointer of the named options that is used by this + // object, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the object owns the + // pointer and the caller should not delete the pointer. + // + // Note that changing the underlying options while the object + // is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + template + const T* GetOptions() const { + return GetOptions(T::kName()); + } + template + T* GetOptions() { + return GetOptions(T::kName()); + } + template + const T* GetOptions(const std::string& name) const { + return reinterpret_cast(GetOptionsPtr(name)); + } + template + T* GetOptions(const std::string& name) { + return reinterpret_cast(const_cast(GetOptionsPtr(name))); + } + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. + // If this method fails, an attempt is made to revert the object to original + // state. Note that the revert may not be the original state but may be an + // equivalent. For example, if the object contains an option that is a + // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not + // the Cache object that was passed in, but a Cache object of the same size). + // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + // @see ConfigOptions for a description of the controls. + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map); + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Updates the named option to the input value, returning OK if successful. + // Note that ConfigureOption does not cause PrepareOptions to be invoked. + // @param config_options Controls how the name/value is processed. + // @param name The name of the option to update + // @param value The value to set for the named option + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @return NotSupported If the name is valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If the value cannot be successfully parsed. + Status ConfigureOption(const ConfigOptions& config_options, + const std::string& name, const std::string& value); +#endif // ROCKSDB_LITE + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. If this method fails, an attempt is made to revert the + // object to original state. Note that the revert may not be the original + // state but may be an equivalent. + // @see ConfigureFromMap for more details + // @param config_options Controls how the arguments are processed. + // @param opt_str string containing the values to update. + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all specified values were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names were not valid for this object. + // If unused is specified, it will contain the collection of NotFound + // names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + Status ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts); + + // Fills in result with the serialized options for this object. + // This is the inverse of ConfigureFromString. + // @param config_options Controls how serialization happens. + // @param result The string representation of this object. + // @return OK If the options for this object were successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. + Status GetOptionString(const ConfigOptions& config_options, + std::string* result) const; +#ifndef ROCKSDB_LITE + // Returns the serialized options for this object. + // This method is similar to GetOptionString with no errors. + // @param config_options Controls how serialization happens. + // @param prefix A string to prepend to every option. + // @return The serialized representation of the options for this object + std::string ToString(const ConfigOptions& config_options) const { + return ToString(config_options, ""); + } + std::string ToString(const ConfigOptions& config_options, + const std::string& prefix) const; + + // Returns the list of option names associated with this configurable + // @param config_options Controls how the names are returned + // @param result The set of option names for this object. Note that + // options that are deprecated or aliases are not returned. + // @return OK on success. + Status GetOptionNames(const ConfigOptions& config_options, + std::unordered_set* result) const; + + // Returns the value of the option associated with the input name + // This method is the functional inverse of ConfigureOption + // @param config_options Controls how the value is returned + // @param name The name of the option to return a value for. + // @param value The returned value associated with the named option. + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @param InvalidArgument If the name is valid for this object but + // its value cannot be serialized. + virtual Status GetOption(const ConfigOptions& config_options, + const std::string& name, std::string* value) const; +#endif // ROCKSDB_LITE + + // Checks to see if this Configurable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + virtual bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* name) const; + + // Returns a pretty-printed, human-readable version of the options. + // This method is typically used to dump the options to a log file. + // Classes should override this method + virtual std::string GetPrintableOptions() const { return ""; } + + // Validates that the settings are valid/consistent and performs any object + // initialization required by this object. This method may be called as part + // of Configure (if invoke_prepare_options is set), or may be invoked + // separately. + // + // Once an object has been prepared, non-mutable options can no longer be + // updated. + // + // Classes must override this method to provide any implementation-specific + // initialization, such as opening log files or setting up cache parameters. + // Implementations should be idempotent (e.g. don't re-open the log file or + // reconfigure the cache), as there is the potential this method can be called + // more than once. + // + // By default, this method will also prepare all nested (Inner and + // OptionType::kConfigurable) objects. + // + // @param config_options Controls how the object is prepared. Also contains + // a Logger and Env that can be used to initialize this object. + // @return OK If the object was successfully initialized. + // @return InvalidArgument If this object could not be successfully + // initialized. + virtual Status PrepareOptions(const ConfigOptions& config_options); + + // Checks to see if the settings are valid for this object. + // This method checks to see if the input DBOptions and ColumnFamilyOptions + // are valid for the settings of this object. For example, an Env might not + // support certain mmap modes or a TableFactory might require certain + // settings. + // + // By default, this method will also validate all nested (Inner and + // OptionType::kConfigurable) objects. + // + // @param db_opts The DBOptions to validate + // @param cf_opts The ColumnFamilyOptions to validate + // @return OK if the options are valid + // @return InvalidArgument If the arguments are not valid for the options + // of the current object. + virtual Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const; + + // Splits the input opt_value into the ID field and the remaining options. + // The input opt_value can be in the form of "name" or "name=value + // [;name=value]". The first form uses the "name" as an id with no options The + // latter form converts the input into a map of name=value pairs and sets "id" + // to the "id" value from the map. + // @param opt_value The value to split into id and options + // @param id The id field from the opt_value + // @param options The remaining name/value pairs from the opt_value + // @param default_id If specified and there is no id field in the map, this + // value is returned as the ID + // @return OK if the value was converted to a map successfully and an ID was + // found. + // @return InvalidArgument if the value could not be converted to a map or + // there was or there is no id property in the map. + static Status GetOptionsMap( + const std::string& opt_value, const std::string& default_id, + std::string* id, std::unordered_map* options); + + protected: + // Returns the raw pointer for the associated named option. + // The name is typically the name of an option registered via the + // Classes may override this method to provide further specialization (such as + // returning a sub-option) + // + // The default implementation looks at the registered options. If the + // input name matches that of a registered option, the pointer registered + // with that name is returned. + // e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns + // "my_ptr" + virtual const void* GetOptionsPtr(const std::string& name) const; + + // Method for allowing options to be configured outside of the normal + // registered options framework. Classes may override this method if they + // wish to support non-standard options implementations (such as configuring + // themselves from constant or simple ":"-separated strings. + // + // The default implementation does nothing and returns OK + virtual Status ParseStringOptions(const ConfigOptions& config_options, + const std::string& opts_str); + + // Internal method to configure an object from a map of name-value options. + // This method uses the input config_options to drive the configuration of + // the options in opt_map. Any option name that cannot be found from the + // input set will be returned in "unused". + // + // Classes may override this method to extend the functionality if required. + // @param config_options Controls how the options are configured and errors + // handled. + // @param opts_map The set of options to configure + // @param unused Any options from opt_map that were not configured. + // @returns a Status based on the rules outlined in ConfigureFromMap + virtual Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Method that configures a the specific opt_name from opt_value. + // By default, this method calls opt_info.ParseOption with the + // input parameters. + // Classes may override this method to extend the functionality, or + // change the returned Status. + virtual Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr); + + // Internal method to see if the single option name/info matches for this and + // that Classes may override this value to change its behavior. + // @param config_options Controls how the options are being matched + // @param opt_info The OptionTypeInfo registered for this option name + // that controls what field is matched (offset) and how (type). + // @param name The name associated with this opt_info. + // @param this_ptr The base pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param that_ptr The other pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param bad_name If the match fails, the name of the option that failed to + // match. + virtual bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& name, + const void* const this_ptr, + const void* const that_ptr, + std::string* bad_name) const; +#endif +#ifndef ROCKSDB_LITE + // Internal method to serialize options (ToString) + // Classes may override this value to change its behavior. + virtual std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const; +#endif // ROCKSDB_LITE + + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + virtual std::string GetOptionName(const std::string& long_name) const; + + // Registers the input name with the options and associated map. + // When classes register their options in this manner, most of the + // functionality (excluding unknown options and validate/prepare) is + // implemented by the base class. + // + // This method should be called in the class constructor to register the + // option set for this object. For example, to register the options + // associated with the BlockBasedTableFactory, the constructor calls this + // method passing in: + // - the name of the options ("BlockBasedTableOptions"); + // - the options object (the BlockBasedTableOptions object for this object; + // - the options type map for the BlockBasedTableOptions. + // This registration allows the Configurable class to process the option + // values associated with the BlockBasedTableOptions without further code in + // the derived class. + // + // @param name The name of this set of options (@see GetOptionsPtr) + // @param opt_ptr Pointer to the options to associate with this name + // @param opt_map Options map that controls how this option is configured. + template + void RegisterOptions( + T* opt_ptr, + const std::unordered_map* opt_map) { + RegisterOptions(T::kName(), opt_ptr, opt_map); + } + void RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* opt_map); + + // Returns true if there are registered options for this Configurable object + inline bool HasRegisteredOptions() const { return !options_.empty(); } + + private: + // Contains the collection of options (name, opt_ptr, opt_map) associated with + // this object. This collection is typically set in the constructor of the + // Configurable option via + std::vector options_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h new file mode 100644 index 000000000..921ec221b --- /dev/null +++ b/src/rocksdb/include/rocksdb/convenience.h @@ -0,0 +1,525 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/compression_type.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +class Env; +class Logger; +class ObjectRegistry; + +struct ColumnFamilyOptions; +struct DBOptions; +struct Options; + +// ConfigOptions containing the parameters/controls for +// comparing objects and converting to/from strings. +// These settings control how the methods +// treat errors (e.g. ignore_unknown_objects), the format +// of the serialization (e.g. delimiter), and how to compare +// options (sanity_level). +struct ConfigOptions { + // Constructs a new ConfigOptions with a new object registry. + // This method should only be used when a DBOptions is not available, + // else registry settings may be lost + ConfigOptions(); + + // Constructs a new ConfigOptions using the settings from + // the input DBOptions. Currently constructs a new object registry. + explicit ConfigOptions(const DBOptions&); + + // This enum defines the RocksDB options sanity level. + enum SanityLevel : unsigned char { + kSanityLevelNone = 0x01, // Performs no sanity check at all. + // Performs minimum check to ensure the RocksDB instance can be + // opened without corrupting / mis-interpreting the data. + kSanityLevelLooselyCompatible = 0x02, + // Perform exact match sanity check. + kSanityLevelExactMatch = 0xFF, + }; + + enum Depth { + kDepthDefault, // Traverse nested options that are not flagged as "shallow" + kDepthShallow, // Do not traverse into any nested options + kDepthDetailed, // Traverse nested options, overriding the options shallow + // setting + }; + + // When true, any unused options will be ignored and OK will be returned + bool ignore_unknown_options = false; + + // When true, any unsupported options will be ignored and OK will be returned + bool ignore_unsupported_options = true; + + // If the strings are escaped (old-style?) + bool input_strings_escaped = true; + + // Whether or not to invoke PrepareOptions after configure is called. + bool invoke_prepare_options = true; + + // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not. + // When "mutable_options_only=false", all options are evaluated. + // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). + bool mutable_options_only = false; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // Controls how to traverse options during print/match stages + Depth depth = Depth::kDepthDefault; + + // Controls how options are serialized + // Controls how pedantic the comparison must be for equivalency + SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch; + // `file_readahead_size` is used for readahead for the option file. + size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); + +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr registry; +#endif + + bool IsShallow() const { return depth == Depth::kDepthShallow; } + bool IsDetailed() const { return depth == Depth::kDepthDetailed; } + + bool IsCheckDisabled() const { + return sanity_level == SanityLevel::kSanityLevelNone; + } + + bool IsCheckEnabled(SanityLevel level) const { + return (level > SanityLevel::kSanityLevelNone && level <= sanity_level); + } +}; + +#ifndef ROCKSDB_LITE + +// The following set of functions provide a way to construct RocksDB Options +// from a string or a string-to-string map. Here is the general rule of +// setting option values from strings by type. Some RocksDB types are also +// supported in these APIs. Please refer to the comment of the function itself +// to find more information about how to config those RocksDB types. +// +// * Strings: +// Strings will be used as values directly without any truncating or +// trimming. +// +// * Booleans: +// - "true" or "1" => true +// - "false" or "0" => false. +// [Example]: +// - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or +// - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString. +// +// * Integers: +// Integers are converted directly from string, in addition to the following +// units that we support: +// - 'k' or 'K' => 2^10 +// - 'm' or 'M' => 2^20 +// - 'g' or 'G' => 2^30 +// - 't' or 'T' => 2^40 // only for unsigned int with sufficient bits. +// [Example]: +// - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or +// - "arena_block_size=19G" in GetColumnFamilyOptionsFromString. +// +// * Doubles / Floating Points: +// Doubles / Floating Points are converted directly from string. Note that +// currently we do not support units. +// [Example]: +// - {"memtable_prefix_bloom_size_ratio", "0.1"} in +// GetColumnFamilyOptionsFromMap, or +// - "memtable_prefix_bloom_size_ratio=0.1" in +// GetColumnFamilyOptionsFromString. +// * Array / Vectors: +// An array is specified by a list of values, where ':' is used as +// the delimiter to separate each value. +// [Example]: +// - {"compression_per_level", "kNoCompression:kSnappyCompression"} +// in GetColumnFamilyOptionsFromMap, or +// - "compression_per_level=kNoCompression:kSnappyCompression" in +// GetColumnFamilyOptionsFromMapString +// * Enums: +// The valid values of each enum are identical to the names of its constants. +// [Example]: +// - CompressionType: valid values are "kNoCompression", +// "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ... +// - CompactionStyle: valid values are "kCompactionStyleLevel", +// "kCompactionStyleUniversal", "kCompactionStyleFIFO", and +// "kCompactionStyleNone". +// + +// Take a default ColumnFamilyOptions "base_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// ColumnFamilyOptions "new_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in ColumnFamilyOptions: +// +// * table_factory: +// table_factory can be configured using our custom nested-option syntax. +// +// {option_a=value_a; option_b=value_b; option_c=value_c; ... } +// +// A nested option is enclosed by two curly braces, within which there are +// multiple option assignments. Each assignment is of the form +// "variable_name=value;". +// +// Currently we support the following types of TableFactory: +// - BlockBasedTableFactory: +// Use name "block_based_table_factory" to initialize table_factory with +// BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be +// configured using the nested-option syntax. +// [Example]: +// * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"} +// is equivalent to assigning table_factory with a BlockBasedTableFactory +// that has 1M LRU block-cache with block size equals to 4k: +// ColumnFamilyOptions cf_opt; +// BlockBasedTableOptions blk_opt; +// blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024); +// blk_opt.block_size = 4 * 1024; +// cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt)); +// - PlainTableFactory: +// Use name "plain_table_factory" to initialize table_factory with +// PlainTableFactory. Its PlainTableFactoryOptions can be configured using +// the nested-option syntax. +// [Example]: +// * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"} +// +// * memtable_factory: +// Use "memtable" to config memtable_factory. Here are the supported +// memtable factories: +// - SkipList: +// Pass "skip_list:" to config memtable to use SkipList, +// or simply "skip_list" to use the default SkipList. +// [Example]: +// * {"memtable", "skip_list:5"} is equivalent to setting +// memtable to SkipListFactory(5). +// - PrefixHash: +// Pass "prefix_hash:" to config memtable +// to use PrefixHash, or simply "prefix_hash" to use the default +// PrefixHash. +// [Example]: +// * {"memtable", "prefix_hash:1000"} is equivalent to setting +// memtable to NewHashSkipListRepFactory(hash_bucket_count). +// - HashLinkedList: +// Pass "hash_linkedlist:" to config memtable +// to use HashLinkedList, or simply "hash_linkedlist" to use the default +// HashLinkedList. +// [Example]: +// * {"memtable", "hash_linkedlist:1000"} is equivalent to +// setting memtable to NewHashLinkListRepFactory(1000). +// - VectorRepFactory: +// Pass "vector:" to config memtable to use VectorRepFactory, +// or simply "vector" to use the default Vector memtable. +// [Example]: +// * {"memtable", "vector:1024"} is equivalent to setting memtable +// to VectorRepFactory(1024). +// +// * compression_opts: +// Use "compression_opts" to config compression_opts. The value format +// is of the form ":::". +// [Example]: +// * {"compression_opts", "4:5:6:7"} is equivalent to setting: +// ColumnFamilyOptions cf_opt; +// cf_opt.compression_opts.window_bits = 4; +// cf_opt.compression_opts.level = 5; +// cf_opt.compression_opts.strategy = 6; +// cf_opt.compression_opts.max_dict_bytes = 7; +// +// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. +// @param base_options the default options of the output "new_options". +// @param opts_map an option name to value map for specifying how "new_options" +// should be set. +// @param new_options the resulting options based on "base_options" with the +// change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options); +Status GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a default DBOptions "base_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// DBOptions "new_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in DBOptions: +// +// * rate_limiter_bytes_per_sec: +// RateLimiter can be configured directly by specifying its bytes_per_sec. +// [Example]: +// - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to +// passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec. +// +// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. +// @param base_options the default options of the output "new_options". +// @param opts_map an option name to value map for specifying how "new_options" +// should be set. +// @param new_options the resulting options based on "base_options" with the +// change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetDBOptionsFromMap( + const ConfigOptions& cfg_options, const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options); +Status GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options); +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options, + bool input_strings_escaped = false, bool ignore_unknown_options = false); + +// Take a default PlainTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// PlainTableOptions "new_table_options". +// +// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a string representation of option names and values, apply them into the +// base_options, and return the new options as a result. The string has the +// following format: +// "write_buffer_size=1024;max_write_buffer_number=2" +// Nested options config is also possible. For example, you can define +// BlockBasedTableOptions as part of the string for block-based table factory: +// "write_buffer_size=1024;block_based_table_factory={block_size=4k};" +// "max_write_buffer_num=2" +// +// +// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); +Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); + +Status GetDBOptionsFromString(const ConfigOptions& config_options, + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + +Status GetDBOptionsFromString(const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opts_str); + +Status GetStringFromDBOptions(std::string* opts_str, + const DBOptions& db_options, + const std::string& delimiter = "; "); + +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opts_str); +Status GetStringFromColumnFamilyOptions(std::string* opts_str, + const ColumnFamilyOptions& cf_options, + const std::string& delimiter = "; "); +Status GetStringFromCompressionType(std::string* compression_str, + CompressionType compression_type); + +std::vector GetSupportedCompressions(); + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options); +Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options); + +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr* new_mem_factory); + +Status GetOptionsFromString(const Options& base_options, + const std::string& opts_str, Options* new_options); +Status GetOptionsFromString(const ConfigOptions& config_options, + const Options& base_options, + const std::string& opts_str, Options* new_options); + +Status StringToMap(const std::string& opts_str, + std::unordered_map* opts_map); + +// Request stopping background work, if wait is true wait until it's done +void CancelAllBackgroundWork(DB* db, bool wait = false); + +// Delete files which are entirely in the given range +// Could leave some keys in the range which are in files which are not +// entirely in the range. Also leaves L0 files regardless of whether they're +// in the range. +// Snapshots before the delete might not see the data in the given range. +Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool include_end = true); + +// Delete files in multiple ranges at once +// Delete files in a lot of ranges one at a time can be slow, use this API for +// better performance in that case. +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path, + const SequenceNumber& largest_seqno = 0); +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/customizable.h b/src/rocksdb/include/rocksdb/customizable.h new file mode 100644 index 000000000..92f7504ae --- /dev/null +++ b/src/rocksdb/include/rocksdb/customizable.h @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/configurable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +/** + * Customizable a base class used by the rocksdb that describes a + * standard way of configuring and creating objects. Customizable objects + * are configurable objects that can be created from an ObjectRegistry. + * + * Customizable classes are used when there are multiple potential + * implementations of a class for use by RocksDB (e.g. Table, Cache, + * MergeOperator, etc). The abstract base class is expected to define a method + * declaring its type and a factory method for creating one of these, such as: + * static const char *Type() { return "Table"; } + * static Status CreateFromString(const ConfigOptions& options, + * const std::string& id, + * std::shared_ptr* result); + * The "Type" string is expected to be unique (no two base classes are the same + * type). This factory is expected, based on the options and id, create and + * return the appropriate derived type of the customizable class (e.g. + * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers, + * helper classes and methods are provided for writing this factory. + * + * Instances of a Customizable class need to define: + * - A "static const char *kClassName()" method. This method defines the name + * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the + * CheckedCast method. + * - The Name() of the object. This name is used when creating and saving + * instances of this class. Typically this name will be the same as + * kClassName(). + * + * Additionally, Customizable classes should register any options used to + * configure themselves with the Configurable subsystem. + * + * When a Customizable is being created, the "name" property specifies + * the name of the instance being created. + * For custom objects, their configuration and name can be specified by: + * [prop]={name=X;option 1 = value1[; option2=value2...]} + * + * [prop].name=X + * [prop].option1 = value1 + * + * [prop].name=X + * X.option1 =value1 + */ +class Customizable : public Configurable { + public: + ~Customizable() override {} + + // Returns the name of this class of Customizable + virtual const char* Name() const = 0; + + // Returns an identifier for this Customizable. + // This could be its name or something more complex (like its URL/pattern). + // Used for pretty printing. + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // This is typically determined by if the input name matches the + // name of this object. + // This method is typically used in conjunction with CheckedCast to find the + // derived class instance from its base. For example, if you have an Env + // and want the "Default" env, you would IsInstanceOf("Default") to get + // the default implementation. This method should be used when you need a + // specific derivative or implementation of a class. + // + // Intermediary caches (such as SharedCache) may wish to override this method + // to check for the intermediary name (SharedCache). Classes with multiple + // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override + // this method. + // + // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a". + // Wrapped classes that have an Inner "has-a" should not be returned. + // + // @param name The name of the instance to find. + // Returns true if the class is an instance of the input name. + virtual bool IsInstanceOf(const std::string& name) const { + if (name.empty()) { + return false; + } else if (name == Name()) { + return true; + } else { + const char* nickname = NickName(); + if (nickname != nullptr && name == nickname) { + return true; + } else { + return false; + } + } + } + + const void* GetOptionsPtr(const std::string& name) const override { + const void* ptr = Configurable::GetOptionsPtr(name); + if (ptr != nullptr) { + return ptr; + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->GetOptionsPtr(name); + } else { + return nullptr; + } + } + } + + // Returns the named instance of the Customizable as a T*, or nullptr if not + // found. This method uses IsInstanceOf/Inner to find the appropriate class + // instance and then casts it to the expected return type. + template + const T* CheckedCast() const { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + template + T* CheckedCast() { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + auto inner = const_cast(Inner()); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + // Checks to see if this Customizable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + // @see Configurable::AreEquivalent for more details + bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const override; +#ifndef ROCKSDB_LITE + // Gets the value of the option associated with the input name + // @see Configurable::GetOption for more details + Status GetOption(const ConfigOptions& config_options, const std::string& name, + std::string* value) const override; +#endif // ROCKSDB_LITE + // Helper method for getting for parsing the opt_value into the corresponding + // options for use in potentially creating a new Customizable object (this + // method is primarily a support method for LoadSharedObject et al for new + // Customizable objects). The opt_value may be either name-value pairs + // separated by ";" (a=b; c=d), or a simple name (a). In order to create a new + // Customizable, the ID is determined by: + // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this + // name; + // - Otherwise, if there is a "id=value", the id is set to "value" + // - Otherwise, if the input customizable is not null, custom->GetId is used + // - Otherwise, an error is returned. + // + // If the opt_value is name-value pairs, these pairs will be returned in + // options (without the id pair). If the ID being returned matches the ID of + // the input custom object, then the options from the input object will also + // be added to the returned options. + // + // This method returns non-OK if the ID could not be found, or if the + // opt_value could not be parsed into name-value pairs. + static Status GetOptionsMap( + const ConfigOptions& config_options, const Customizable* custom, + const std::string& opt_value, std::string* id, + std::unordered_map* options); + + // Helper method to configure a new object with the supplied options. + // If the object is not null and invoke_prepare_options=true, the object + // will be configured and prepared. + // Returns success if the object is properly configured and (optionally) + // prepared Returns InvalidArgument if the object is nullptr and there are + // options in the map Returns the result of the ConfigureFromMap or + // PrepareOptions + static Status ConfigureNewObject( + const ConfigOptions& config_options, Customizable* object, + const std::unordered_map& options); + + // Returns the inner class when a Customizable implements a has-a (wrapped) + // relationship. Derived classes that implement a has-a must override this + // method in order to get CheckedCast to function properly. + virtual const Customizable* Inner() const { return nullptr; } + + protected: + // Generates a ID specific for this instance of the customizable. + // The unique ID is of the form :#pid, where: + // - name is the Name() of this object; + // - addr is the memory address of this object; + // - pid is the process ID of this process ID for this process. + // Note that if obj1 and obj2 have the same unique IDs, they must be the + // same. However, if an object is deleted and recreated, it may have the + // same unique ID as a predecessor + // + // This method is useful for objects (especially ManagedObjects) that + // wish to generate an ID that is specific for this instance and wish to + // override the GetId() method. + std::string GenerateIndividualId() const; + + // Some classes have both a class name (e.g. PutOperator) and a nickname + // (e.g. put). Classes can override this method to return a + // nickname. Nicknames can be used by InstanceOf and object creation. + virtual const char* NickName() const { return ""; } + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + std::string GetOptionName(const std::string& long_name) const override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& options, + const std::string& prefix) const override; +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/data_structure.h b/src/rocksdb/include/rocksdb/data_structure.h new file mode 100644 index 000000000..f868a6be5 --- /dev/null +++ b/src/rocksdb/include/rocksdb/data_structure.h @@ -0,0 +1,51 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is a data structure specifically designed as a "Set" for a +// pretty small scale of Enum structure. For now, it can support up +// to 64 element, and it is expandable in the future. +template +class SmallEnumSet { + public: + SmallEnumSet() : state_(0) {} + + ~SmallEnumSet() {} + + // Return true if the input enum is included in the "Set" (i.e., changes the + // internal scalar state successfully), otherwise, it will return false. + bool Add(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t old_state = state_; + uint64_t tmp = 1; + state_ |= (tmp << value); + return old_state != state_; + } + + // Return true if the input enum is contained in the "Set". + bool Contains(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t tmp = 1; + return state_ & (tmp << value); + } + + private: + uint64_t state_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h new file mode 100644 index 000000000..26c07c19f --- /dev/null +++ b/src/rocksdb/include/rocksdb/db.h @@ -0,0 +1,1859 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include "rocksdb/block_cache_trace_writer.h" +#include "rocksdb/iterator.h" +#include "rocksdb/listener.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/sst_file_writer.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" +#include "rocksdb/version.h" +#include "rocksdb/wide_columns.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) +#elif _WIN32 +#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) +#endif + +namespace ROCKSDB_NAMESPACE { + +struct ColumnFamilyOptions; +struct CompactionOptions; +struct CompactRangeOptions; +struct DBOptions; +struct ExternalSstFileInfo; +struct FlushOptions; +struct Options; +struct ReadOptions; +struct TableProperties; +struct WriteOptions; +#ifdef ROCKSDB_LITE +class CompactionJobInfo; +#endif +class Env; +class EventListener; +class FileSystem; +#ifndef ROCKSDB_LITE +class Replayer; +#endif +class StatsHistoryIterator; +#ifndef ROCKSDB_LITE +class TraceReader; +class TraceWriter; +#endif +class WriteBatch; + +extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; +struct ColumnFamilyDescriptor { + std::string name; + ColumnFamilyOptions options; + ColumnFamilyDescriptor() + : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {} + ColumnFamilyDescriptor(const std::string& _name, + const ColumnFamilyOptions& _options) + : name(_name), options(_options) {} +}; + +class ColumnFamilyHandle { + public: + virtual ~ColumnFamilyHandle() {} + // Returns the name of the column family associated with the current handle. + virtual const std::string& GetName() const = 0; + // Returns the ID of the column family associated with the current handle. + virtual uint32_t GetID() const = 0; + // Fills "*desc" with the up-to-date descriptor of the column family + // associated with this handle. Since it fills "*desc" with the up-to-date + // information, this call might internally lock and release DB mutex to + // access the up-to-date CF options. In addition, all the pointer-typed + // options cannot be referenced any longer than the original options exist. + // + // Note that this function is not supported in RocksDBLite. + virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0; + // Returns the comparator of the column family associated with the + // current handle. + virtual const Comparator* GetComparator() const = 0; +}; + +static const int kMajorVersion = __ROCKSDB_MAJOR__; +static const int kMinorVersion = __ROCKSDB_MINOR__; + +// A range of keys +struct Range { + Slice start; + Slice limit; + + Range() {} + Range(const Slice& s, const Slice& l) : start(s), limit(l) {} +}; + +struct RangePtr { + const Slice* start; + const Slice* limit; + + RangePtr() : start(nullptr), limit(nullptr) {} + RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} +}; + +// It is valid that files_checksums and files_checksum_func_names are both +// empty (no checksum information is provided for ingestion). Otherwise, +// their sizes should be the same as external_files. The file order should +// be the same in three vectors and guaranteed by the caller. +// Note that, we assume the temperatures of this batch of files to be +// ingested are the same. +struct IngestExternalFileArg { + ColumnFamilyHandle* column_family = nullptr; + std::vector external_files; + IngestExternalFileOptions options; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kUnknown; +}; + +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + +// A collections of table properties objects, where +// key: is the table's file name. +// value: the table properties object of the given table. +using TablePropertiesCollection = + std::unordered_map>; + +// A DB is a persistent, versioned ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. +// DB is an abstract base class with one primary implementation (DBImpl) +// and a number of wrapper implementations. +class DB { + public: + // Open the database with the specified "name" for reads and writes. + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores nullptr in *dbptr and returns a non-OK status on error, including + // if the DB is already open (read-write) by another DB object. (This + // guarantee depends on options.env->LockFile(), which might not provide + // this guarantee in a custom Env implementation.) + // + // Caller must delete *dbptr when it is no longer needed. + static Status Open(const Options& options, const std::string& name, + DB** dbptr); + + // Open DB with column families. + // db_options specify database specific options + // column_families is the vector of all column families in the database, + // containing column family name and options. You need to open ALL column + // families in the database. To get the list of column families, you can use + // ListColumnFamilies(). + // + // The default column family name is 'default' and it's stored + // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName. + // If everything is OK, handles will on return be the same size + // as column_families --- handles[i] will be a handle that you + // will use to operate on column family column_family[i]. + // Before delete DB, you have to close All column families by calling + // DestroyColumnFamilyHandle() with all the handles. + static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector& column_families, + std::vector* handles, DB** dbptr); + + // OpenForReadOnly() creates a Read-only instance that supports reads alone. + // + // All DB interfaces that modify data, like put/delete, will return error. + // Automatic Flush and Compactions are disabled and any manual calls + // to Flush/Compaction will return error. + // + // While a given DB can be simultaneously opened via OpenForReadOnly + // by any number of readers, if a DB is simultaneously opened by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + + // Open the database for read only. + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. + static Status OpenForReadOnly(const Options& options, const std::string& name, + DB** dbptr, + bool error_if_wal_file_exists = false); + + // Open the database for read only with column families. + // + // When opening DB with read only, you can specify only a subset of column + // families in the database that should be opened. However, you always need + // to specify default column family. The default column family name is + // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. + static Status OpenForReadOnly( + const DBOptions& db_options, const std::string& name, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists = false); + + // OpenAsSecondary() creates a secondary instance that supports read-only + // operations and supports dynamic catch up with the primary (through a + // call to TryCatchUpWithPrimary()). + // + // All DB interfaces that modify data, like put/delete, will return error. + // Automatic Flush and Compactions are disabled and any manual calls + // to Flush/Compaction will return error. + // + // Multiple secondary instances can co-exist at the same time. + // + + // Open DB as secondary instance + // + // The options argument specifies the options to open the secondary instance. + // Options.max_open_files should be set to -1. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the caller should + // delete it after use. + // + // Return OK on success, non-OK on failures. + static Status OpenAsSecondary(const Options& options, const std::string& name, + const std::string& secondary_path, DB** dbptr); + + // Open DB as secondary instance with specified column families + // + // When opening DB in secondary mode, you can specify only a subset of column + // families in the database that should be opened. However, you always need + // to specify default column family. The default column family name is + // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName + // + // Column families created by the primary after the secondary instance starts + // are currently ignored by the secondary instance. Column families opened + // by secondary and dropped by the primary will be dropped by secondary as + // well (on next invocation of TryCatchUpWithPrimary()). However the user + // of the secondary instance can still access the data of such dropped column + // family as long as they do not destroy the corresponding column family + // handle. + // + // The options argument specifies the options to open the secondary instance. + // Options.max_open_files should be set to -1. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The column_families argument specifies a list of column families to open. + // If default column family is not specified or if any specified column + // families does not exist, the function returns non-OK status. + // The handles is an out-arg corresponding to the opened database column + // family handles. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the caller should + // delete it after use. Before deleting the dbptr, the user should also + // delete the pointers stored in handles vector. + // + // Return OK on success, non-OK on failures. + static Status OpenAsSecondary( + const DBOptions& db_options, const std::string& name, + const std::string& secondary_path, + const std::vector& column_families, + std::vector* handles, DB** dbptr); + + // Open DB and run the compaction. + // It's a read-only operation, the result won't be installed to the DB, it + // will be output to the `output_directory`. The API should only be used with + // `options.CompactionService` to run compaction triggered by + // `CompactionService`. + static Status OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options); + + static Status OpenAndCompact( + const OpenAndCompactOptions& options, const std::string& name, + const std::string& output_directory, const std::string& input, + std::string* output, + const CompactionServiceOptionsOverride& override_options); + + // Experimental and subject to change + // Open DB and trim data newer than specified timestamp. + // The trim_ts specified the user-defined timestamp trim bound. + // This API should only be used at timestamp enabled column families recovery. + // If some input column families do not support timestamp, nothing will + // be happened to them. The data with timestamp > trim_ts + // will be removed after this API returns successfully. + static Status OpenAndTrimHistory( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + std::string trim_ts); + + virtual Status Resume() { return Status::NotSupported(); } + + // Close the DB by releasing resources, closing files etc. This should be + // called before calling the destructor so that the caller can get back a + // status in case there are any errors. This will not fsync the WAL files. + // If syncing is required, the caller must first call SyncWAL(), or Write() + // using an empty write batch with WriteOptions.sync=true. + // Regardless of the return status, the DB must be freed. + // If the return status is Aborted(), closing fails because there is + // unreleased snapshot in the system. In this case, users can release + // the unreleased snapshots and try again and expect it to succeed. For + // other status, re-calling Close() will be no-op and return the original + // close status. If the return status is NotSupported(), then the DB + // implementation does cleanup in the destructor + virtual Status Close() { return Status::NotSupported(); } + + // ListColumnFamilies will open the DB specified by argument name + // and return the list of all column families in that DB + // through column_families argument. The ordering of + // column families in column_families is unspecified. + static Status ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families); + + // Abstract class ctor + DB() {} + // No copying allowed + DB(const DB&) = delete; + void operator=(const DB&) = delete; + + virtual ~DB(); + + // Create a column_family and return the handle of column family + // through the argument handle. + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle); + + // Bulk create column families with the same column family options. + // Return the handles of the column families through the argument handles. + // In case of error, the request may succeed partially, and handles will + // contain column family handles that it managed to create, and have size + // equal to the number of created column families. + virtual Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector& column_family_names, + std::vector* handles); + + // Bulk create column families. + // Return the handles of the column families through the argument handles. + // In case of error, the request may succeed partially, and handles will + // contain column family handles that it managed to create, and have size + // equal to the number of created column families. + virtual Status CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles); + + // Drop a column family specified by column_family handle. This call + // only records a drop record in the manifest and prevents the column + // family from flushing and compacting. + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + + // Bulk drop column families. This call only records drop records in the + // manifest and prevents the column families from flushing and compacting. + // In case of error, the request may succeed partially. User may call + // ListColumnFamilies to check the result. + virtual Status DropColumnFamilies( + const std::vector& column_families); + + // Release and deallocate a column family handle. A column family is only + // removed once it is dropped (DropColumnFamily) and all handles have been + // destroyed (DestroyColumnFamilyHandle). Use this method to destroy + // column family handles (except for DefaultColumnFamily()!) before closing + // a DB. + virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family); + + // Set the database entry for "key" to "value". + // If "key" already exists, it will be overwritten. + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) = 0; + virtual Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Put(options, DefaultColumnFamily(), key, value); + } + virtual Status Put(const WriteOptions& options, const Slice& key, + const Slice& ts, const Slice& value) { + return Put(options, DefaultColumnFamily(), key, ts, value); + } + + // Set the database entry for "key" in the column family specified by + // "column_family" to the wide-column entity defined by "columns". If the key + // already exists in the column family, it will be overwritten. + // + // Returns OK on success, and a non-OK status on error. + virtual Status PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns); + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) = 0; + virtual Status Delete(const WriteOptions& options, const Slice& key) { + return Delete(options, DefaultColumnFamily(), key); + } + virtual Status Delete(const WriteOptions& options, const Slice& key, + const Slice& ts) { + return Delete(options, DefaultColumnFamily(), key, ts); + } + + // Remove the database entry for "key". Requires that the key exists + // and was not overwritten. Returns OK on success, and a non-OK status + // on error. It is not an error if "key" did not exist in the database. + // + // If a key is overwritten (by calling Put() multiple times), then the result + // of calling SingleDelete() on this key is undefined. SingleDelete() only + // behaves correctly if there has been only one Put() for this key since the + // previous call to SingleDelete() for this key. + // + // This feature is currently an experimental performance optimization + // for a very specific workload. It is up to the caller to ensure that + // SingleDelete is only used for a key that is not deleted using Delete() or + // written using Merge(). Mixing SingleDelete operations with Deletes and + // Merges can result in undefined behavior. + // + // Note: consider setting options.sync = true. + virtual Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) = 0; + virtual Status SingleDelete(const WriteOptions& options, const Slice& key) { + return SingleDelete(options, DefaultColumnFamily(), key); + } + virtual Status SingleDelete(const WriteOptions& options, const Slice& key, + const Slice& ts) { + return SingleDelete(options, DefaultColumnFamily(), key, ts); + } + + // Removes the database entries in the range ["begin_key", "end_key"), i.e., + // including "begin_key" and excluding "end_key". Returns OK on success, and + // a non-OK status on error. It is not an error if the database does not + // contain any existing data in the range ["begin_key", "end_key"). + // + // If "end_key" comes before "start_key" according to the user's comparator, + // a `Status::InvalidArgument` is returned. + // + // This feature is now usable in production, with the following caveats: + // 1) Accumulating too many range tombstones in the memtable will degrade read + // performance; this can be avoided by manually flushing occasionally. + // 2) Limiting the maximum number of open files in the presence of range + // tombstones can degrade read performance. To avoid this problem, set + // max_open_files to -1 whenever possible. + virtual Status DeleteRange(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key); + virtual Status DeleteRange(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts); + + // Merge the database entry for "key" with "value". Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Merge(options, DefaultColumnFamily(), key, value); + } + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*ts*/, + const Slice& /*value*/); + + // Apply the specified updates to the database. + // If `updates` contains no update, WAL will still be synced if + // options.sync=true. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the column family specified by "column_family" contains an entry for + // "key", return the corresponding value in "*value". If the entry is a plain + // key-value, return the value as-is; if it is a wide-column entity, return + // the value of its default anonymous column (see kDefaultWideColumnName) if + // any, or an empty value otherwise. + // + // If timestamp is enabled and a non-null timestamp pointer is passed in, + // timestamp is returned. + // + // Returns OK on success. Returns NotFound and an empty value in "*value" if + // there is no entry for "key". Returns some other non-OK status on error. + virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) { + return Get(options, DefaultColumnFamily(), key, value); + } + + // Get() methods that return timestamp. Derived DB classes don't need to worry + // about this group of methods if they don't care about timestamp feature. + virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val, timestamp); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, PinnableSlice* /*value*/, + std::string* /*timestamp*/) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented."); + } + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp) { + return Get(options, DefaultColumnFamily(), key, value, timestamp); + } + + // If the column family specified by "column_family" contains an entry for + // "key", return it as a wide-column entity in "*columns". If the entry is a + // wide-column entity, return it as-is; if it is a plain key-value, return it + // as an entity with a single anonymous column (see kDefaultWideColumnName) + // which contains the value. + // + // Returns OK on success. Returns NotFound and an empty wide-column entity in + // "*columns" if there is no entry for "key". Returns some other non-OK status + // on error. + virtual Status GetEntity(const ReadOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, + PinnableWideColumns* /* columns */) { + return Status::NotSupported("GetEntity not supported"); + } + + // Populates the `merge_operands` array with all the merge operands in the DB + // for `key`. The `merge_operands` array will be populated in the order of + // insertion. The number of entries populated in `merge_operands` will be + // assigned to `*number_of_operands`. + // + // If the number of merge operands in DB for `key` is greater than + // `merge_operands_options.expected_max_number_of_operands`, + // `merge_operands` is not populated and the return value is + // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned + // the number of merge operands found in the DB for `key`. + // + // `merge_operands`- Points to an array of at-least + // merge_operands_options.expected_max_number_of_operands and the + // caller is responsible for allocating it. + // + // The caller should delete or `Reset()` the `merge_operands` entries when + // they are no longer needed. All `merge_operands` entries must be destroyed + // or `Reset()` before this DB is closed or destroyed. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + + // Consistent Get of many keys across column families without the need + // for an explicit snapshot. NOTE: the implementation of this MultiGet API + // does not have the performance benefits of the void-returning MultiGet + // functions. + // + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + return MultiGet( + options, + std::vector(keys.size(), DefaultColumnFamily()), + keys, values); + } + + virtual std::vector MultiGet( + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + const std::vector& keys, std::vector* /*values*/, + std::vector* /*timestamps*/) { + return std::vector( + keys.size(), Status::NotSupported( + "MultiGet() returning timestamps not implemented.")); + } + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values, + std::vector* timestamps) { + return MultiGet( + options, + std::vector(keys.size(), DefaultColumnFamily()), + keys, values, timestamps); + } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. + // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, std::string* /*value*/, + std::string* /*timestamp*/, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; + } + + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, column_family, key, value, + /*timestamp=*/nullptr, value_found); + } + + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); + } + + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp, + value_found); + } + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) = 0; + virtual Iterator* NewIterator(const ReadOptions& options) { + return NewIterator(options, DefaultColumnFamily()); + } + // Returns iterators from a consistent database state across multiple + // column families. Iterators are heap allocated and need to be deleted + // before the db is deleted + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + // + // nullptr will be returned if the DB fails to take a snapshot or does + // not support snapshot (eg: inplace_update_support enabled). + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + +#ifndef ROCKSDB_LITE + // Contains all valid property arguments for GetProperty() or + // GetMapProperty(). Each is a "string" property for retrieval with + // GetProperty() unless noted as a "map" property, for GetMapProperty(). + // + // NOTE: Property names cannot end in numbers since those are interpreted as + // arguments, e.g., see kNumFilesAtLevelPrefix. + struct Properties { + // "rocksdb.num-files-at-level" - returns string containing the number + // of files at level , where is an ASCII representation of a + // level number (e.g., "0"). + static const std::string kNumFilesAtLevelPrefix; + + // "rocksdb.compression-ratio-at-level" - returns string containing the + // compression ratio of data at level , where is an ASCII + // representation of a level number (e.g., "0"). Here, compression + // ratio is defined as uncompressed data size / compressed file size. + // Returns "-1.0" if no open files at level . + static const std::string kCompressionRatioAtLevelPrefix; + + // "rocksdb.stats" - returns a multi-line string containing the data + // described by kCFStats followed by the data described by kDBStats. + static const std::string kStats; + + // "rocksdb.sstables" - returns a multi-line string summarizing current + // SST files. + static const std::string kSSTables; + + // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram" + // and "rocksdb.cf-file-histogram" as a "map" property. + static const std::string kCFStats; + + // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with + // general column family stats per-level over db's lifetime ("L"), + // aggregated over db's lifetime ("Sum"), and aggregated over the + // interval since the last retrieval ("Int"). + static const std::string kCFStatsNoFileHistogram; + + // "rocksdb.cf-file-histogram" - print out how many file reads to every + // level, as well as the histogram of latency of single requests. + static const std::string kCFFileHistogram; + + // "rocksdb.dbstats" - As a string property, returns a multi-line string + // with general database stats, both cumulative (over the db's + // lifetime) and interval (since the last retrieval of kDBStats). + // As a map property, returns cumulative stats only and does not + // update the baseline for the interval stats. + static const std::string kDBStats; + + // "rocksdb.levelstats" - returns multi-line string containing the number + // of files per level and total size of each level (MB). + static const std::string kLevelStats; + + // "rocksdb.block-cache-entry-stats" - returns a multi-line string or + // map with statistics on block cache usage. See + // `BlockCacheEntryStatsMapKeys` for structured representation of keys + // available in the map form. + static const std::string kBlockCacheEntryStats; + + // "rocksdb.fast-block-cache-entry-stats" - same as above, but returns + // stale values more frequently to reduce overhead and latency. + static const std::string kFastBlockCacheEntryStats; + + // "rocksdb.num-immutable-mem-table" - returns number of immutable + // memtables that have not yet been flushed. + static const std::string kNumImmutableMemTable; + + // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable + // memtables that have already been flushed. + static const std::string kNumImmutableMemTableFlushed; + + // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is + // pending; otherwise, returns 0. + static const std::string kMemTableFlushPending; + + // "rocksdb.num-running-flushes" - returns the number of currently running + // flushes. + static const std::string kNumRunningFlushes; + + // "rocksdb.compaction-pending" - returns 1 if at least one compaction is + // pending; otherwise, returns 0. + static const std::string kCompactionPending; + + // "rocksdb.num-running-compactions" - returns the number of currently + // running compactions. + static const std::string kNumRunningCompactions; + + // "rocksdb.background-errors" - returns accumulated number of background + // errors. + static const std::string kBackgroundErrors; + + // "rocksdb.cur-size-active-mem-table" - returns approximate size of active + // memtable (bytes). + static const std::string kCurSizeActiveMemTable; + + // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active + // and unflushed immutable memtables (bytes). + static const std::string kCurSizeAllMemTables; + + // "rocksdb.size-all-mem-tables" - returns approximate size of active, + // unflushed immutable, and pinned immutable memtables (bytes). + static const std::string kSizeAllMemTables; + + // "rocksdb.num-entries-active-mem-table" - returns total number of entries + // in the active memtable. + static const std::string kNumEntriesActiveMemTable; + + // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries + // in the unflushed immutable memtables. + static const std::string kNumEntriesImmMemTables; + + // "rocksdb.num-deletes-active-mem-table" - returns total number of delete + // entries in the active memtable. + static const std::string kNumDeletesActiveMemTable; + + // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete + // entries in the unflushed immutable memtables. + static const std::string kNumDeletesImmMemTables; + + // "rocksdb.estimate-num-keys" - returns estimated number of total keys in + // the active and unflushed immutable memtables and storage. + static const std::string kEstimateNumKeys; + + // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for + // reading SST tables, excluding memory used in block cache (e.g., + // filter and index blocks). + static const std::string kEstimateTableReadersMem; + + // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete + // files is enabled; otherwise, returns a non-zero number. + // This name may be misleading because true(non-zero) means disable, + // but we keep the name for backward compatibility. + static const std::string kIsFileDeletionsEnabled; + + // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the + // database. + static const std::string kNumSnapshots; + + // "rocksdb.oldest-snapshot-time" - returns number representing unix + // timestamp of oldest unreleased snapshot. + static const std::string kOldestSnapshotTime; + + // "rocksdb.oldest-snapshot-sequence" - returns number representing + // sequence number of oldest unreleased snapshot. + static const std::string kOldestSnapshotSequence; + + // "rocksdb.num-live-versions" - returns number of live versions. `Version` + // is an internal data structure. See version_set.h for details. More + // live versions often mean more SST files are held from being deleted, + // by iterators or unfinished compactions. + static const std::string kNumLiveVersions; + + // "rocksdb.current-super-version-number" - returns number of current LSM + // version. It is a uint64_t integer number, incremented after there is + // any change to the LSM tree. The number is not preserved after restarting + // the DB. After DB restart, it will start from 0 again. + static const std::string kCurrentSuperVersionNumber; + + // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of + // live data in bytes. For BlobDB, it also includes the exact value of + // live bytes in the blob files of the version. + static const std::string kEstimateLiveDataSize; + + // "rocksdb.min-log-number-to-keep" - return the minimum log number of the + // log files that should be kept. + static const std::string kMinLogNumberToKeep; + + // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file + // number for an obsolete SST to be kept. The max value of `uint64_t` + // will be returned if all obsolete files can be deleted. + static const std::string kMinObsoleteSstNumberToKeep; + + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST + // files. + // WARNING: may slow down online queries if there are too many files. + static const std::string kTotalSstFilesSize; + + // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST + // files belong to the latest LSM tree. + static const std::string kLiveSstFilesSize; + + // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes) + // of SST files at all certain file temperature + static const std::string kLiveSstFilesSizeAtTemperature; + + // "rocksdb.base-level" - returns number of level to which L0 data will be + // compacted. + static const std::string kBaseLevel; + + // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total + // number of bytes compaction needs to rewrite to get all levels down + // to under target size. Not valid for other compactions than level- + // based. + static const std::string kEstimatePendingCompactionBytes; + + // "rocksdb.aggregated-table-properties" - returns a string or map + // representation of the aggregated table properties of the target + // column family. Only properties that make sense for aggregation + // are included. + static const std::string kAggregatedTableProperties; + + // "rocksdb.aggregated-table-properties-at-level", same as the previous + // one but only returns the aggregated table properties of the + // specified level "N" at the target column family. + static const std::string kAggregatedTablePropertiesAtLevel; + + // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed + // write rate. 0 means no delay. + static const std::string kActualDelayedWriteRate; + + // "rocksdb.is-write-stopped" - Return 1 if write has been stopped. + static const std::string kIsWriteStopped; + + // "rocksdb.estimate-oldest-key-time" - returns an estimation of + // oldest key timestamp in the DB. Currently only available for + // FIFO compaction with + // compaction_options_fifo.allow_compaction = false. + static const std::string kEstimateOldestKeyTime; + + // "rocksdb.block-cache-capacity" - returns block cache capacity. + static const std::string kBlockCacheCapacity; + + // "rocksdb.block-cache-usage" - returns the memory size for the entries + // residing in block cache. + static const std::string kBlockCacheUsage; + + // "rocksdb.block-cache-pinned-usage" - returns the memory size for the + // entries being pinned. + static const std::string kBlockCachePinnedUsage; + + // "rocksdb.options-statistics" - returns multi-line string + // of options.statistics + static const std::string kOptionsStatistics; + + // "rocksdb.num-blob-files" - returns number of blob files in the current + // version. + static const std::string kNumBlobFiles; + + // "rocksdb.blob-stats" - return the total number and size of all blob + // files, and total amount of garbage (bytes) in the blob files in + // the current version. + static const std::string kBlobStats; + + // "rocksdb.total-blob-file-size" - returns the total size of all blob + // files over all versions. + static const std::string kTotalBlobFileSize; + + // "rocksdb.live-blob-file-size" - returns the total size of all blob + // files in the current version. + static const std::string kLiveBlobFileSize; + + // "rocksdb.live-blob-file-garbage-size" - returns the total amount of + // garbage in the blob files in the current version. + static const std::string kLiveBlobFileGarbageSize; + + // "rocksdb.blob-cache-capacity" - returns blob cache capacity. + static const std::string kBlobCacheCapacity; + + // "rocksdb.blob-cache-usage" - returns the memory size for the entries + // residing in blob cache. + static const std::string kBlobCacheUsage; + + // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the + // entries being pinned in blob cache. + static const std::string kBlobCachePinnedUsage; + }; +#endif /* ROCKSDB_LITE */ + + // DB implementations export properties about their state via this method. + // If "property" is a valid "string" property understood by this DB + // implementation (see Properties struct above for valid options), fills + // "*value" with its current value and returns true. Otherwise, returns + // false. + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) = 0; + virtual bool GetProperty(const Slice& property, std::string* value) { + return GetProperty(DefaultColumnFamily(), property, value); + } + + // Like GetProperty but for valid "map" properties. (Some properties can be + // accessed as either "string" properties or "map" properties.) + virtual bool GetMapProperty(ColumnFamilyHandle* column_family, + const Slice& property, + std::map* value) = 0; + virtual bool GetMapProperty(const Slice& property, + std::map* value) { + return GetMapProperty(DefaultColumnFamily(), property, value); + } + + // Similar to GetProperty(), but only works for a subset of properties whose + // return value is an integer. Return the value by integer. Supported + // properties: + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" + // "rocksdb.background-errors" + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.num-deletes-active-mem-table" + // "rocksdb.num-deletes-imm-mem-tables" + // "rocksdb.estimate-num-keys" + // "rocksdb.estimate-table-readers-mem" + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" + // "rocksdb.num-live-versions" + // "rocksdb.current-super-version-number" + // "rocksdb.estimate-live-data-size" + // "rocksdb.min-log-number-to-keep" + // "rocksdb.min-obsolete-sst-number-to-keep" + // "rocksdb.total-sst-files-size" + // "rocksdb.live-sst-files-size" + // "rocksdb.base-level" + // "rocksdb.estimate-pending-compaction-bytes" + // "rocksdb.num-running-compactions" + // "rocksdb.num-running-flushes" + // "rocksdb.actual-delayed-write-rate" + // "rocksdb.is-write-stopped" + // "rocksdb.estimate-oldest-key-time" + // "rocksdb.block-cache-capacity" + // "rocksdb.block-cache-usage" + // "rocksdb.block-cache-pinned-usage" + // + // Properties dedicated for BlobDB: + // "rocksdb.num-blob-files" + // "rocksdb.total-blob-file-size" + // "rocksdb.live-blob-file-size" + // "rocksdb.blob-cache-capacity" + // "rocksdb.blob-cache-usage" + // "rocksdb.blob-cache-pinned-usage" + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) = 0; + virtual bool GetIntProperty(const Slice& property, uint64_t* value) { + return GetIntProperty(DefaultColumnFamily(), property, value); + } + + // Reset internal stats for DB and all column families. + // Note this doesn't reset options.statistics as it is not owned by + // DB. + virtual Status ResetStats() { + return Status::NotSupported("Not implemented"); + } + + // Same as GetIntProperty(), but this one returns the aggregated int + // property from all column families. + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) = 0; + + // Flags for DB::GetSizeApproximation that specify whether memtable + // stats should be included, or file stats approximation or both + enum class SizeApproximationFlags : uint8_t { + NONE = 0, + INCLUDE_MEMTABLES = 1 << 0, + INCLUDE_FILES = 1 << 1 + }; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)" + // in a single column family. + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes) = 0; + + // Simpler versions of the GetApproximateSizes() method above. + // The include_flags argument must of type DB::SizeApproximationFlags + // and can not be NONE. + virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + SizeApproximationFlags include_flags = + SizeApproximationFlags::INCLUDE_FILES); + + virtual Status GetApproximateSizes( + const Range* ranges, int n, uint64_t* sizes, + SizeApproximationFlags include_flags = + SizeApproximationFlags::INCLUDE_FILES) { + return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, + include_flags); + } + + // The method is similar to GetApproximateSizes, except it + // returns approximate number of records in memtables. + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) = 0; + virtual void GetApproximateMemTableStats(const Range& range, + uint64_t* const count, + uint64_t* const size) { + GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size); + } + + // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // This call blocks until the operation completes successfully, fails, + // or is aborted (Status::Incomplete). See DisableManualCompaction. + // + // begin==nullptr is treated as a key before all keys in the database. + // end==nullptr is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(options, nullptr, nullptr); + // Note that after the entire database is compacted, all data are pushed + // down to the last level containing any data. If the total data size after + // compaction is reduced, that level might not be appropriate for hosting all + // the files. In this case, client could set options.change_level to true, to + // move the files back to the minimum level capable of holding the data set + // or a given level (specified by non-negative options.target_level). + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) = 0; + virtual Status CompactRange(const CompactRangeOptions& options, + const Slice* begin, const Slice* end) { + return CompactRange(options, DefaultColumnFamily(), begin, end); + } + + // Dynamically change column family options or table factory options in a + // running DB, for the specified column family. Only options internally + // marked as "mutable" can be changed. Options not listed in `opts_map` will + // keep their current values. See GetColumnFamilyOptionsFromMap() in + // convenience.h for the details of `opts_map`. Not supported in LITE mode. + // + // USABILITY NOTE: SetOptions is intended only for expert users, and does + // not apply the same sanitization to options as the standard DB::Open code + // path does. Use with caution. + // + // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for + // reliability, and this is a slow call because a new OPTIONS file is + // serialized and persisted for each call. Use only infrequently. + // + // EXAMPLES: + // s = db->SetOptions(cfh, {{"ttl", "36000"}}); + // s = db->SetOptions(cfh, {{"block_based_table_factory", + // "{prepopulate_block_cache=kDisable;}"}}); + virtual Status SetOptions( + ColumnFamilyHandle* /*column_family*/, + const std::unordered_map& /*opts_map*/) { + return Status::NotSupported("Not implemented"); + } + // Shortcut for SetOptions on the default column family handle. + virtual Status SetOptions( + const std::unordered_map& new_options) { + return SetOptions(DefaultColumnFamily(), new_options); + } + + // Like SetOptions but for DBOptions, including the same caveats for + // usability, reliability, and performance. See GetDBOptionsFromMap() (and + // GetColumnFamilyOptionsFromMap()) in convenience.h for details on + // `opts_map`. Note supported in LITE mode. + // + // EXAMPLES: + // s = db->SetDBOptions({{"max_subcompactions", "2"}}); + // s = db->SetDBOptions({{"stats_dump_period_sec", "0"}, + // {"stats_persist_period_sec", "0"}}); + virtual Status SetDBOptions( + const std::unordered_map& new_options) = 0; + + // CompactFiles() inputs a list of files specified by file numbers and + // compacts them to the specified level. A small difference compared to + // CompactRange() is that CompactFiles() performs the compaction job + // using the CURRENT thread, so is not considered a "background" job. + // + // @see GetDataBaseMetaData + // @see GetColumnFamilyMetaData + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) = 0; + + virtual Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) { + return CompactFiles(compact_options, DefaultColumnFamily(), + input_file_names, output_level, output_path_id, + output_file_names, compaction_job_info); + } + + // This function will wait until all currently running background processes + // finish. After it returns, no background process will be run until + // ContinueBackgroundWork is called, once for each preceding OK-returning + // call to PauseBackgroundWork. + virtual Status PauseBackgroundWork() = 0; + virtual Status ContinueBackgroundWork() = 0; + + // This function will enable automatic compactions for the given column + // families if they were previously disabled. The function will first set the + // disable_auto_compactions option for each column family to 'false', after + // which it will schedule a flush/compaction. + // + // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API + // does NOT schedule a flush/compaction afterwards, and only changes the + // parameter itself within the column family option. + // + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) = 0; + + // After this function call, CompactRange() or CompactFiles() will not + // run compactions and fail. Calling this function will tell outstanding + // manual compactions to abort and will wait for them to finish or abort + // before returning. + virtual void DisableManualCompaction() = 0; + // Re-enable CompactRange() and ComapctFiles() that are disabled by + // DisableManualCompaction(). This function must be called as many times + // as DisableManualCompaction() has been called in order to re-enable + // manual compactions, and must not be called more times than + // DisableManualCompaction() has been called. + virtual void EnableManualCompaction() = 0; + + // Number of levels used for this DB. + virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; + virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0; + virtual int MaxMemCompactionLevel() { + return MaxMemCompactionLevel(DefaultColumnFamily()); + } + + // Number of files in level-0 that would stop writes. + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0; + virtual int Level0StopWriteTrigger() { + return Level0StopWriteTrigger(DefaultColumnFamily()); + } + + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + + // Get Env object from the DB + virtual Env* GetEnv() const = 0; + + // A shortcut for GetEnv()->->GetFileSystem().get(), possibly cached for + // efficiency. + virtual FileSystem* GetFileSystem() const; + + // Get DB Options that we use. During the process of opening the + // column family, the options provided when calling DB::Open() or + // DB::CreateColumnFamily() will have been "sanitized" and transformed + // in an implementation-defined manner. + virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0; + virtual Options GetOptions() const { + return GetOptions(DefaultColumnFamily()); + } + + virtual DBOptions GetDBOptions() const = 0; + + // Flush all mem-table data. + // Flush a single column family, even when atomic flush is enabled. To flush + // multiple column families, use Flush(options, column_families). + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) = 0; + virtual Status Flush(const FlushOptions& options) { + return Flush(options, DefaultColumnFamily()); + } + // Flushes multiple column families. + // If atomic flush is not enabled, Flush(options, column_families) is + // equivalent to calling Flush(options, column_family) multiple times. + // If atomic flush is enabled, Flush(options, column_families) will flush all + // column families specified in 'column_families' up to the latest sequence + // number at the time when flush is requested. + // Note that RocksDB 5.15 and earlier may not be able to open later versions + // with atomic flush enabled. + virtual Status Flush( + const FlushOptions& options, + const std::vector& column_families) = 0; + + // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL + // afterwards. + virtual Status FlushWAL(bool /*sync*/) { + return Status::NotSupported("FlushWAL not implemented"); + } + // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the + // same as Write() with sync=true: in the latter case the changes won't be + // visible until the sync is done. + // Currently only works if allow_mmap_writes = false in Options. + virtual Status SyncWAL() = 0; + + // Lock the WAL. Also flushes the WAL after locking. + virtual Status LockWAL() { + return Status::NotSupported("LockWAL not implemented"); + } + + // Unlock the WAL. + virtual Status UnlockWAL() { + return Status::NotSupported("UnlockWAL not implemented"); + } + + // The sequence number of the most recent transaction. + virtual SequenceNumber GetLatestSequenceNumber() const = 0; + + // Prevent file deletions. Compactions will continue to occur, + // but no obsolete files will be deleted. Calling this multiple + // times have the same effect as calling it once. + virtual Status DisableFileDeletions() = 0; + + // Increase the full_history_ts of column family. The new ts_low value should + // be newer than current full_history_ts value. + // If another thread updates full_history_ts_low concurrently to a higher + // timestamp than the requested ts_low, a try again error will be returned. + virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) = 0; + + // Get current full_history_ts value. + virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) = 0; + + // Allow compactions to delete obsolete files. + // If force == true, the call to EnableFileDeletions() will guarantee that + // file deletions are enabled after the call, even if DisableFileDeletions() + // was called multiple times before. + // If force == false, EnableFileDeletions will only enable file deletion + // after it's been called at least as many times as DisableFileDeletions(), + // enabling the two methods to be called by two threads concurrently without + // synchronization -- i.e., file deletions will be enabled only after both + // threads call EnableFileDeletions() + virtual Status EnableFileDeletions(bool force = true) = 0; + +#ifndef ROCKSDB_LITE + // Retrieves the creation time of the oldest file in the DB. + // This API only works if max_open_files = -1, if it is not then + // Status returned is Status::NotSupported() + // The file creation time is set using the env provided to the DB. + // If the DB was created from a very old release then its possible that + // the SST files might not have file_creation_time property and even after + // moving to a newer release its possible that some files never got compacted + // and may not have file_creation_time property. In both the cases + // file_creation_time is considered 0 which means this API will return + // creation_time = 0 as there wouldn't be a timestamp lower than 0. + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0; + + // Note: this API is not yet consistent with WritePrepared transactions. + // + // Sets iter to an iterator that is positioned at a write-batch whose + // sequence number range [start_seq, end_seq] covers seq_number. If no such + // write-batch exists, then iter is positioned at the next write-batch whose + // start_seq > seq_number. + // + // Returns Status::OK if iterator is valid + // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + // use this api, else the WAL files will get + // cleared aggressively and the iterator might keep getting invalid before + // an update is read. + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) = 0; + +// Windows API macro interference +#undef DeleteFile + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + // + // Delete the file name from the db directory and update the internal state to + // reflect that. Supports deletion of sst and log files only. 'name' must be + // path relative to the db directory. eg. 000001.sst, /archive/000003.log + virtual Status DeleteFile(std::string name) = 0; + + // Obtains a list of all live table (SST) files and how they fit into the + // LSM-trees, such as column family, level, key range, etc. + // This builds a de-normalized form of GetAllColumnFamilyMetaData(). + // For information about all files in a DB, use GetLiveFilesStorageInfo(). + virtual void GetLiveFilesMetaData( + std::vector* /*metadata*/) {} + + // Return a list of all table (SST) and blob files checksum info. + // Note: This function might be of limited use because it cannot be + // synchronized with other "live files" APIs. GetLiveFilesStorageInfo() + // is recommended instead. + virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; + + // Get information about all live files that make up a DB, for making + // live copies (Checkpoint, backups, etc.) or other storage-related purposes. + // If creating a live copy, use DisableFileDeletions() before and + // EnableFileDeletions() after to prevent deletions. + // For LSM-tree metadata, use Get*MetaData() functions instead. + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) = 0; + + // Obtains the LSM-tree meta data of the specified column family of the DB, + // including metadata for each live table (SST) file in that column family. + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, + ColumnFamilyMetaData* /*metadata*/) {} + + // Get the metadata of the default column family. + void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) { + GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); + } + + // Obtains the LSM-tree meta data of all column families of the DB, including + // metadata for each live table (SST) file and each blob file in the DB. + virtual void GetAllColumnFamilyMetaData( + std::vector* /*metadata*/) {} + + // Retrieve the list of all files in the database except WAL files. The files + // are relative to the dbname (or db_paths/cf_paths), not absolute paths. + // (Not recommended with db_paths/cf_paths because that information is not + // returned.) Despite being relative paths, the file names begin with "/". + // The valid size of the manifest file is returned in manifest_file_size. + // The manifest file is an ever growing file, but only the portion specified + // by manifest_file_size is valid for this snapshot. Setting flush_memtable + // to true does Flush before recording the live files (unless DB is + // read-only). Setting flush_memtable to false is useful when we don't want + // to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. + // + // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate + // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended + // instead, because it ensures a single consistent view of all files is + // captured in one call. + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) = 0; + + // Retrieve the sorted list of all wal files with earliest file first + virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + + // Retrieve information about the current wal file + // + // Note that the log might have rolled after this call in which case + // the current_log_file would not point to the current log file. + // + // Additionally, for the sake of optimization current_log_file->StartSequence + // would always be set to 0 + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) = 0; + + // IngestExternalFile() will load a list of external SST files (1) into the DB + // Two primary modes are supported: + // - Duplicate keys in the new files will overwrite exiting keys (default) + // - Duplicate keys will be skipped (set ingest_behind=true) + // In the first mode we will try to find the lowest possible level that + // the file can fit in, and ingest the file into this level (2). A file that + // have a key range that overlap with the memtable key range will require us + // to Flush the memtable first before ingesting the file. + // In the second mode we will always ingest in the bottom most level (see + // docs to IngestExternalFileOptions::ingest_behind). + // + // (1) External SST files can be created using SstFileWriter + // (2) We will try to ingest the files to the lowest possible level + // even if the file compression doesn't match the level compression + // (3) If IngestExternalFileOptions->ingest_behind is set to true, + // we always ingest at the bottommost level, which should be reserved + // for this purpose (see DBOPtions::allow_ingest_behind flag). + // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to + // true, then this method can return Status:TryAgain() indicating that + // the files cannot be ingested to the bottommost level, and it is the + // user's responsibility to clear the bottommost level in the overlapping + // range before re-attempting the ingestion. + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& options) = 0; + + virtual Status IngestExternalFile( + const std::vector& external_files, + const IngestExternalFileOptions& options) { + return IngestExternalFile(DefaultColumnFamily(), external_files, options); + } + + // IngestExternalFiles() will ingest files for multiple column families, and + // record the result atomically to the MANIFEST. + // If this function returns OK, all column families' ingestion must succeed. + // If this function returns NOK, or the process crashes, then non-of the + // files will be ingested into the database after recovery. + // Note that it is possible for application to observe a mixed state during + // the execution of this function. If the user performs range scan over the + // column families with iterators, iterator on one column family may return + // ingested data, while iterator on other column family returns old data. + // Users can use snapshot for a consistent view of data. + // If your db ingests multiple SST files using this API, i.e. args.size() + // > 1, then RocksDB 5.15 and earlier will not be able to open it. + // + // REQUIRES: each arg corresponds to a different column family: namely, for + // 0 <= i < j < len(args), args[i].column_family != args[j].column_family. + virtual Status IngestExternalFiles( + const std::vector& args) = 0; + + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB using Checkpoint::ExportColumnFamily. + // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. When option specifies a + // move, the call makes a best effort to delete the specified files at + // external_file_path on successful return, logging any failure to delete + // rather than returning in Status. Files are not modified on any error + // return, and a best effort is made to remove any newly-created files. + // On error return, column family handle returned will be nullptr. + // ColumnFamily will be present on successful return and will not be present + // on error return. ColumnFamily may be present on any crash during this call. + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) = 0; + + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + + // Verify the block checksums of files in db. The block checksums of table + // files are checked. + virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; + + virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + +#endif // ROCKSDB_LITE + + // Returns the unique ID which is read from IDENTITY file during the opening + // of database by setting in the identity variable + // Returns Status::OK if identity could be set properly + virtual Status GetDbIdentity(std::string& identity) const = 0; + + // Return a unique identifier for each DB object that is opened + // This DB session ID should be unique among all open DB instances on all + // hosts, and should be unique among re-openings of the same or other DBs. + // (Two open DBs have the same identity from other function GetDbIdentity when + // one is physically copied from the other.) + virtual Status GetDbSessionId(std::string& session_id) const = 0; + + // Returns default column family handle + virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; + +#ifndef ROCKSDB_LITE + + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) = 0; + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return GetPropertiesOfAllTables(DefaultColumnFamily(), props); + } + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) = 0; + + virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) { + return Status::NotSupported("SuggestCompactRange() is not implemented."); + } + + virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { + return Status::NotSupported("PromoteL0() is not implemented."); + } + + // Trace DB operations. Use EndTrace() to stop tracing. + virtual Status StartTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartTrace() is not implemented."); + } + + virtual Status EndTrace() { + return Status::NotSupported("EndTrace() is not implemented."); + } + + // IO Tracing operations. Use EndIOTrace() to stop tracing. + virtual Status StartIOTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartIOTrace() is not implemented."); + } + + virtual Status EndIOTrace() { + return Status::NotSupported("EndIOTrace() is not implemented."); + } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. + virtual Status StartBlockCacheTrace( + const TraceOptions& /*trace_options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status StartBlockCacheTrace( + const BlockCacheTraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } + + // Create a default trace replayer. + virtual Status NewDefaultReplayer( + const std::vector& /*handles*/, + std::unique_ptr&& /*reader*/, + std::unique_ptr* /*replayer*/) { + return Status::NotSupported("NewDefaultReplayer() is not implemented."); + } + +#endif // ROCKSDB_LITE + + // Needed for StackableDB + virtual DB* GetRootDB() { return this; } + + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in seconds, and end_time is an exclusive bound. + virtual Status GetStatsHistory( + uint64_t /*start_time*/, uint64_t /*end_time*/, + std::unique_ptr* /*stats_iterator*/) { + return Status::NotSupported("GetStatsHistory() is not implemented."); + } + +#ifndef ROCKSDB_LITE + // Make the secondary instance catch up with the primary by tailing and + // replaying the MANIFEST and WAL of the primary. + // Column families created by the primary after the secondary instance starts + // will be ignored unless the secondary instance closes and restarts with the + // newly created column families. + // Column families that exist before secondary instance starts and dropped by + // the primary afterwards will be marked as dropped. However, as long as the + // secondary instance does not delete the corresponding column family + // handles, the data of the column family is still accessible to the + // secondary. + virtual Status TryCatchUpWithPrimary() { + return Status::NotSupported("Supported only by secondary instance"); + } +#endif // !ROCKSDB_LITE +}; + +// Overloaded operators for enum class SizeApproximationFlags. +inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs, + DB::SizeApproximationFlags rhs) { + return static_cast(static_cast(lhs) & + static_cast(rhs)); +} +inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs, + DB::SizeApproximationFlags rhs) { + return static_cast(static_cast(lhs) | + static_cast(rhs)); +} + +inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + SizeApproximationFlags include_flags) { + SizeApproximationOptions options; + options.include_memtables = + ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != + SizeApproximationFlags::NONE); + options.include_files = + ((include_flags & SizeApproximationFlags::INCLUDE_FILES) != + SizeApproximationFlags::NONE); + return GetApproximateSizes(options, column_family, ranges, n, sizes); +} + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options, + const std::vector& column_families = + std::vector()); + +#ifndef ROCKSDB_LITE +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +// +// With this API, we will warn and skip data associated with column families not +// specified in column_families. +// +// @param column_families Descriptors for known column families +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families); + +// @param unknown_cf_opts Options for column families encountered during the +// repair that were not specified in column_families. +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector& column_families, + const ColumnFamilyOptions& unknown_cf_opts); + +// @param options These options will be used for the database and for ALL column +// families encountered during the repair +Status RepairDB(const std::string& dbname, const Options& options); + +#endif + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h new file mode 100644 index 000000000..17f4e6bde --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_bench_tool.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +int db_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h new file mode 100644 index 000000000..b7d4766a2 --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_dump_tool.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +struct DumpOptions { + // Database that will be dumped + std::string db_path; + // File location that will contain dump output + std::string dump_location; + // Don't include db information header in the dump + bool anonymous = false; +}; + +class DbDumpTool { + public: + bool Run(const DumpOptions& dump_options, + ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options()); +}; + +struct UndumpOptions { + // Database that we will load the dumped file into + std::string db_path; + // File location of the dumped file that will be loaded + std::string dump_location; + // Compact the db after loading the dumped file + bool compact_db = false; +}; + +class DbUndumpTool { + public: + bool Run(const UndumpOptions& undump_options, + ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options()); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h new file mode 100644 index 000000000..7d3d42c9d --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_stress_tool.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +int db_stress_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h new file mode 100644 index 000000000..bef60a212 --- /dev/null +++ b/src/rocksdb/include/rocksdb/env.h @@ -0,0 +1,1893 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/functor_wrapper.h" +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ + __attribute__((__format__(__printf__, format_param, dots_param))) +#else +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) +#endif + +namespace ROCKSDB_NAMESPACE { + +class DynamicLibrary; +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +struct DataVerificationInfo; +class WritableFile; +class RandomRWFile; +class MemoryMappedFileBuffer; +class Directory; +struct DBOptions; +struct ImmutableDBOptions; +struct MutableDBOptions; +class RateLimiter; +class ThreadStatusUpdater; +struct ThreadStatus; +class FileSystem; +class SystemClock; +struct ConfigOptions; + +const size_t kDefaultPageSize = 4 * 1024; + +enum class CpuPriority { + kIdle = 0, + kLow = 1, + kNormal = 2, + kHigh = 3, +}; + +// Options while opening a file to read/write +struct EnvOptions { + // Construct with default Options + EnvOptions(); + + // Construct from Options + explicit EnvOptions(const DBOptions& options); + + // If true, then use mmap to read data. + // Not recommended for 32-bit OS. + bool use_mmap_reads = false; + + // If true, then use mmap to write data + bool use_mmap_writes = true; + + // If true, then use O_DIRECT for reading data + bool use_direct_reads = false; + + // If true, then use O_DIRECT for writing data + bool use_direct_writes = false; + + // If false, fallocate() calls are bypassed + bool allow_fallocate = true; + + // If true, set the FD_CLOEXEC on open fd. + bool set_fd_cloexec = true; + + // Allows OS to incrementally sync files to disk while they are being + // written, in the background. Issue one request for every bytes_per_sync + // written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync = 0; + + // When true, guarantees the file has at most `bytes_per_sync` bytes submitted + // for writeback at any given time. + // + // - If `sync_file_range` is supported it achieves this by waiting for any + // prior `sync_file_range`s to finish before proceeding. In this way, + // processing (compression, etc.) can proceed uninhibited in the gap + // between `sync_file_range`s, and we block only when I/O falls behind. + // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism + // always blocks, thus preventing the interleaving of I/O and processing. + // + // Note: Enabling this option does not provide any additional persistence + // guarantees, as it may use `sync_file_range`, which does not write out + // metadata. + // + // Default: false + bool strict_bytes_per_sync = false; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. This option will + // improve the performance in workloads where you sync the data on every + // write. By default, we set it to true for MANIFEST writes and false for + // WAL writes + bool fallocate_with_keep_size = true; + + // See DBOptions doc + size_t compaction_readahead_size = 0; + + // See DBOptions doc + size_t random_access_max_buffer_size = 0; + + // See DBOptions doc + size_t writable_file_max_buffer_size = 1024 * 1024; + + // If not nullptr, write rate limiting is enabled for flush and compaction + RateLimiter* rate_limiter = nullptr; +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Env : public Customizable { + public: + static const char* kDefaultName() { return "DefaultEnv"; } + struct FileAttributes { + // File name + std::string name; + + // Size of file in bytes + uint64_t size_bytes; + }; + + Env(); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr& fs); + Env(const std::shared_ptr& fs, + const std::shared_ptr& clock); + // No copying allowed + Env(const Env&) = delete; + void operator=(const Env&) = delete; + + ~Env() override; + + static const char* Type() { return "Environment"; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + const char* Name() const override { return ""; } + + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. + static Status LoadEnv(const std::string& value, Env** result); + + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. + static Status LoadEnv(const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. + // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. + static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr* guard); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to rocksdb and must never be deleted. + static Env* Default(); + + // See FileSystem::RegisterDbPaths. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // See FileSystem::UnregisterDbPaths. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + WLTH_NOT_SET = 0, // No hint information set + WLTH_NONE, // No hints about write life time + WLTH_SHORT, // Data written has a short life time + WLTH_MEDIUM, // Data written has a medium life time + WLTH_LONG, // Data written has a long life time + WLTH_EXTREME, // Data written has an extremely long life time + }; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) = 0; + + // Create an object that writes to a file with the specified name. + // `WritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status ReopenWritableFile(const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("Env::ReopenWritableFile() not supported."); + } + + // Reuse an existing file by renaming it and opening it as writable. + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewRandomRWFile(const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("RandomRWFile is not implemented in this Env"); + } + + // Opens `fname` as a memory-mapped file for read and write (in-place updates + // only, i.e., no appends). On success, stores a raw buffer covering the whole + // file in `*result`. The file must exist prior to this call. + virtual Status NewMemoryMappedFileBuffer( + const std::string& /*fname*/, + std::unique_ptr* /*result*/) { + return Status::NotSupported( + "MemoryMappedFileBuffer is not implemented in this Env"); + } + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + virtual Status NewDirectory(const std::string& name, + std::unique_ptr* result) = 0; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + virtual Status FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir", and shall never include the + // names `.` or `..`. + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir", and shall never include the + // names `.` or `..`. + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual Status GetChildrenFileAttributes(const std::string& dir, + std::vector* result); + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Truncate the named file to the specified size. + virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) { + return Status::NotSupported("Truncate is not supported for this Env"); + } + + // Create the specified directory. Returns error if directory exists. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual Status CreateDirIfMissing(const std::string& dirname) = 0; + + // Delete the specified directory. + // Many implementations of this function will only delete a directory if it is + // empty. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Store the last modification time of fname in *file_mtime. + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) = 0; + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Hard Link file src to target. + virtual Status LinkFile(const std::string& /*src*/, + const std::string& /*target*/) { + return Status::NotSupported("LinkFile is not supported for this Env"); + } + + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/) { + return Status::NotSupported( + "Getting number of file links is not supported for this Env"); + } + + virtual Status AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, bool* /*res*/) { + return Status::NotSupported("AreFilesSame is not supported for this Env"); + } + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ";") and looks for the + // library in those directories. If 'search path is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + + // Priority for scheduling job in thread pool + enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; + + static std::string PriorityToString(Priority priority); + + // Priority for requesting bytes in rate limiter scheduler + enum IOPriority { + IO_LOW = 0, + IO_MID = 1, + IO_HIGH = 2, + IO_USER = 3, + IO_TOTAL = 4 + }; + + // Arrange to run "(*function)(arg)" once in a background thread, in + // the thread pool specified by pri. By default, jobs go to the 'LOW' + // priority thread pool. + + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + // When the UnSchedule function is called, the unschedFunction + // registered at the time of Schedule is invoked with arg as a parameter. + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW, void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) = 0; + + // Arrange to remove jobs for given arg from the queue_ if they are not + // already scheduled. Caller is expected to have exclusive lock on arg. + virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; } + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // Start a new thread, invoking "function(args...)" within the new thread. + // When "function(args...)" returns, the thread will be destroyed. + template + void StartThreadTyped(FunctionT function, Args&&... args) { + using FWType = FunctorWrapper; + StartThread( + [](void* arg) { + auto* functor = static_cast(arg); + functor->invoke(); + delete functor; + }, + new FWType(std::function(function), + std::forward(args)...)); + } + + // Wait for all threads started by StartThread to terminate. + virtual void WaitForJoin() {} + + // Reserve available background threads in the specified thread pool. + virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) { + return 0; + } + + // Release a specific number of reserved threads from the specified thread + // pool + virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) { + return 0; + } + + // Get thread pool queue length for specific thread pool. + virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const { + return 0; + } + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide custom + // logger. + virtual Status NewLogger(const std::string& fname, + std::shared_ptr* result); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // 0 indicates not supported. + virtual uint64_t NowCPUNanos() { return 0; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the current host name as a null terminated string iff the string + // length is < len. The hostname should otherwise be truncated to len. + virtual Status GetHostName(char* name, uint64_t len) = 0; + + // Get the current hostname from the given env as a std::string in result. + // The result may be truncated if the hostname is too + // long + virtual Status GetHostNameString(std::string* result); + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Get full directory name for this db. + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) = 0; + + // The number of background worker threads of a specific thread pool + // for this environment. 'LOW' is the default pool. + // default number: 1 + virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + virtual int GetBackgroundThreads(Priority pri = LOW) = 0; + + virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) { + return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported."); + } + + // Enlarge number of background worker threads of a specific thread pool + // for this environment if it is smaller than specified. 'LOW' is the default + // pool. + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; + + // Lower IO priority for threads from the specified pool. + virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} + + // Lower CPU priority for threads from the specified pool. + virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/, + CpuPriority /*pri*/) { + return Status::NotSupported( + "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported"); + } + + // Lower CPU priority for threads from the specified pool. + virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {} + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; + + // Generates a human-readable unique ID that can be used to identify a DB. + // In built-in implementations, this is an RFC-4122 UUID string, but might + // not be in all implementations. Overriding is not recommended. + // NOTE: this has not be validated for use in cryptography + virtual std::string GenerateUniqueId(); + + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for reading log files. + virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const; + + // OptimizeForManifestRead will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for reading manifest + // files. + virtual EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const; + + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const; + // OptimizeForManifestWrite will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for writing manifest + // files. Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const; + + // OptimizeForCompactionTableWrite will create a new EnvOptions object that is + // a copy of the EnvOptions in the parameters, but is optimized for writing + // table files. + virtual EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const; + + // OptimizeForCompactionTableWrite will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // table files. + virtual EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. + virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + + // Returns the status of all threads that belong to the current Env. + virtual Status GetThreadList(std::vector* /*thread_list*/) { + return Status::NotSupported("Env::GetThreadList() not supported."); + } + + // Returns the pointer to ThreadStatusUpdater. This function will be + // used in RocksDB internally to update thread status and supports + // GetThreadList(). + virtual ThreadStatusUpdater* GetThreadStatusUpdater() const { + return thread_status_updater_; + } + + // Returns the ID of the current thread. + virtual uint64_t GetThreadID() const; + +// This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& /*path*/, + uint64_t* /*diskfree*/) { + return Status::NotSupported("Env::GetFreeSpace() not supported."); + } + + // Check whether the specified path is a directory + virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) { + return Status::NotSupported("Env::IsDirectory() not supported."); + } + + virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + + // Get the FileSystem implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetFileSystem() const; + + // Get the SystemClock implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetSystemClock() const; + + // If you're adding methods here, remember to add them to EnvWrapper too. + + protected: + // The pointer to an internal structure that will update the + // status of each thread. + ThreadStatusUpdater* thread_status_updater_; + + // Pointer to the underlying FileSystem implementation + std::shared_ptr file_system_; + + // Pointer to the underlying SystemClock implementation + std::shared_ptr system_clock_; + + private: + static const size_t kMaxHostNameLen = 256; +}; + +// The factory function to construct a ThreadStatusUpdater. Any Env +// that supports GetThreadList() feature should call this function in its +// constructor to initialize thread_status_updater_. +ThreadStatusUpdater* CreateThreadStatusUpdater(); + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() {} + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported( + "SequentialFile::InvalidateCache not supported."); + } + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, + Slice* /*result*/, char* /*scratch*/) { + return Status::NotSupported( + "SequentialFile::PositionedRead() not supported."); + } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. +}; + +// A read IO request structure for use in MultiRead +struct ReadRequest { + // File offset in bytes + uint64_t offset; + + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + Slice result; + + // Status of read + Status status; +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() {} + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + // Readahead the file starting from offset by n bytes for caching. + virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) { + return Status::OK(); + } + + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping. If the function return Status is not ok, status of + // individual requests will be ignored and return status will be assumed + // for all read requests. The function return status is only meant for + // any errors that occur before even processing specific read requests + virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest& req = reqs[i]; + req.status = Read(req.offset, req.len, &req.result, req.scratch); + } + return Status::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + } + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern /*pattern*/) {} + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported( + "RandomAccessFile::InvalidateCache not supported."); + } + + // If you're adding methods here, remember to add them to + // RandomAccessFileWrapper too. +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(false) {} + + explicit WritableFile(const EnvOptions& options) + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + // No copying allowed + WritableFile(const WritableFile&) = delete; + void operator=(const WritableFile&) = delete; + + virtual ~WritableFile(); + + // Append data to the end of the file + // Note: A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + virtual Status Append(const Slice& data) = 0; + + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) { + return Append(data); + } + + // PositionedAppend data to the specified offset. The new EOF after append + // must be larger than the previous EOF. This is to be used when writes are + // not backed by OS buffers and hence has to always start from the start of + // the sector. The implementation thus needs to also rewrite the last + // partial sector. + // Note: PositionAppend does not guarantee moving the file offset after the + // write. A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + // + // PositionedAppend() can only happen on the page/sector boundaries. For that + // reason, if the last write was an incomplete sector we still need to rewind + // back to the nearest sector/page and rewrite the portion of it with whatever + // we need to add. We need to keep where we stop writing. + // + // PositionedAppend() can only write whole sectors. For that reason we have to + // pad with zeros for the last write and trim the file when closing according + // to the position we keep in the previous step. + // + // PositionedAppend() requires aligned buffer to be passed in. The alignment + // required is queried via GetRequiredBufferAlignment() + virtual Status PositionedAppend(const Slice& /* data */, + uint64_t /* offset */) { + return Status::NotSupported( + "WritableFile::PositionedAppend() not supported."); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const DataVerificationInfo& /* verification_info */) { + return Status::NotSupported("PositionedAppend"); + } + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); } + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { return Sync(); } + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + virtual bool IsSyncThreadSafe() const { return false; } + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + /* + * If rate limiting is enabled, change the file-granularity priority used in + * rate-limiting writes. + * + * In the presence of finer-granularity priority such as + * `WriteOptions::rate_limiter_priority`, this file-granularity priority may + * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as + * a fallback for Env::IO_TOTAL finer-granularity priority. + * + * If rate limiting is not enabled, this call has no effect. + */ + virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; } + + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize() { return 0; } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + virtual void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported("WritableFile::InvalidateCache not supported."); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) { + if (strict_bytes_per_sync_) { + return Sync(); + } + return Status::OK(); + } + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + virtual void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + // TODO: Don't ignore errors from allocate + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks) + .PermitUncheckedError(); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + // Pre-allocates space for a file. + virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) { + return Status::OK(); + } + + // If you're adding methods here, remember to add them to + // WritableFileWrapper too. + + protected: + size_t preallocation_block_size() { return preallocation_block_size_; } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + + protected: + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + const bool strict_bytes_per_sync_; +}; + +// A file abstraction for random reading and writing. +class RandomRWFile { + public: + RandomRWFile() {} + // No copying allowed + RandomRWFile(const RandomRWFile&) = delete; + RandomRWFile& operator=(const RandomRWFile&) = delete; + + virtual ~RandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual Status Write(uint64_t offset, const Slice& data) = 0; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // Returns Status::OK() on success. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + virtual Status Flush() = 0; + + virtual Status Sync() = 0; + + virtual Status Fsync() { return Sync(); } + + virtual Status Close() = 0; + + // If you're adding methods here, remember to add them to + // RandomRWFileWrapper too. +}; + +// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. +// Subclasses should release the mapping upon destruction. +class MemoryMappedFileBuffer { + public: + MemoryMappedFileBuffer(void* _base, size_t _length) + : base_(_base), length_(_length) {} + + virtual ~MemoryMappedFileBuffer() = 0; + + // We do not want to unmap this twice. We can make this class + // movable if desired, however, since + MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete; + MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete; + + void* GetBase() const { return base_; } + size_t GetLen() const { return length_; } + + protected: + void* base_; + const size_t length_; +}; + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class Directory { + public: + virtual ~Directory() {} + // Fsync directory. Can be called concurrently from multiple threads. + virtual Status Fsync() = 0; + // Close directory. + virtual Status Close() { return Status::NotSupported("Close"); } + + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; + } + + // If you're adding methods here, remember to add them to + // DirectoryWrapper too. +}; + +enum InfoLogLevel : unsigned char { + DEBUG_LEVEL = 0, + INFO_LEVEL, + WARN_LEVEL, + ERROR_LEVEL, + FATAL_LEVEL, + HEADER_LEVEL, + NUM_INFO_LOG_LEVELS, +}; + +// An interface for writing log messages. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Logger { + public: + size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); + + explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : closed_(false), log_level_(log_level) {} + // No copying allowed + Logger(const Logger&) = delete; + void operator=(const Logger&) = delete; + + virtual ~Logger(); + + // Close the log file. Must be called before destructor. If the return + // status is NotSupported(), it means the implementation does cleanup in + // the destructor + virtual Status Close(); + + // Write a header to the log file with the specified format + // It is recommended that you log all header information at the start of the + // application. But it is not enforced. + virtual void LogHeader(const char* format, va_list ap) { + // Default implementation does a simple INFO level log write. + // Please override as per the logger class requirement. + Logv(InfoLogLevel::INFO_LEVEL, format, ap); + } + + // Write an entry to the log file with the specified format. + // + // Users who override the `Logv()` overload taking `InfoLogLevel` do not need + // to implement this, unless they explicitly invoke it in + // `Logv(InfoLogLevel, ...)`. + virtual void Logv(const char* /* format */, va_list /* ap */) { + assert(false); + } + + // Write an entry to the log file with the specified log level + // and format. Any log with level under the internal log level + // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be + // printed. + virtual void Logv(const InfoLogLevel log_level, const char* format, + va_list ap); + + virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; } + // Flush to the OS buffers + virtual void Flush() {} + virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; } + virtual void SetInfoLogLevel(const InfoLogLevel log_level) { + log_level_ = log_level; + } + + // If you're adding methods here, remember to add them to LoggerWrapper too. + + protected: + virtual Status CloseImpl(); + bool closed_; + + private: + InfoLogLevel log_level_; +}; + +// Identifies a locked file. Except in custom Env/Filesystem implementations, +// the lifetime of a FileLock object should be managed only by LockFile() and +// UnlockFile(). +class FileLock { + public: + FileLock() {} + virtual ~FileLock(); + + private: + // No copying allowed + FileLock(const FileLock&) = delete; + void operator=(const FileLock&) = delete; +}; + +class DynamicLibrary { + public: + virtual ~DynamicLibrary() {} + + // Returns the name of the dynamic library. + virtual const char* Name() const = 0; + + // Loads the symbol for sym_name from the library and updates the input + // function. Returns the loaded symbol. + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + assert(nullptr != function); + void* ptr = nullptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + // Loads and returns the symbol for sym_name from the library. + virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; +}; + +extern void LogFlush(const std::shared_ptr& info_log); + +extern void Log(const InfoLogLevel log_level, + const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); + +// a set of log functions with different log levels. +extern void Header(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// Log the specified data to *info_log if info_log is non-nullptr. +// The default info log level is InfoLogLevel::INFO_LEVEL. +extern void Log(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +extern void LogFlush(Logger* info_log); + +extern void Log(const InfoLogLevel log_level, Logger* info_log, + const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); + +// The default info log level is InfoLogLevel::INFO_LEVEL. +extern void Log(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// a set of log functions with different log levels. +extern void Header(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync = false); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public +// ROCKSDB_NAMESPACE::SequentialFileWrapper { +// public: +// MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target): +// ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {} +// Status Read(size_t n, Slice* result, char* scratch) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result, +// scratch); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // The Target struct allows an Env to be stored as a raw (Env*) or + // std::shared_ptr. By using this struct, the wrapping/calling + // class does not need to worry about the ownership/lifetime of the + // wrapped target env. If the guard is set, then the Env will point + // to the guard.get(). + struct Target { + Env* env; // The raw Env + std::shared_ptr guard; // The guarded Env + + // Creates a Target without assuming ownership of the target Env + explicit Target(Env* t) : env(t) {} + + // Creates a Target from the guarded env, assuming ownership + explicit Target(std::unique_ptr&& t) : guard(t.release()) { + env = guard.get(); + } + + // Creates a Target from the guarded env, assuming ownership + explicit Target(const std::shared_ptr& t) : guard(t) { + env = guard.get(); + } + + // Makes sure the raw Env is not nullptr + void Prepare() { + if (guard.get() != nullptr) { + env = guard.get(); + } else if (env == nullptr) { + env = Env::Default(); + } + } + }; + + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t); + explicit EnvWrapper(std::unique_ptr&& t); + explicit EnvWrapper(const std::shared_ptr& t); + ~EnvWrapper() override; + + // Return the target to which this Env forwards all calls + Env* target() const { return target_.env; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + const char* Name() const override { return target_.env->Name(); } + + // The following text is boilerplate that forwards all methods to target() + Status RegisterDbPaths(const std::vector& paths) override { + return target_.env->RegisterDbPaths(paths); + } + + Status UnregisterDbPaths(const std::vector& paths) override { + return target_.env->UnregisterDbPaths(paths); + } + + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override { + return target_.env->NewSequentialFile(f, r, options); + } + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override { + return target_.env->NewRandomAccessFile(f, r, options); + } + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& options) override { + return target_.env->NewWritableFile(f, r, options); + } + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + return target_.env->ReopenWritableFile(fname, result, options); + } + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) override { + return target_.env->ReuseWritableFile(fname, old_fname, r, options); + } + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + return target_.env->NewRandomRWFile(fname, result, options); + } + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return target_.env->NewMemoryMappedFileBuffer(fname, result); + } + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { + return target_.env->NewDirectory(name, result); + } + Status FileExists(const std::string& f) override { + return target_.env->FileExists(f); + } + Status GetChildren(const std::string& dir, + std::vector* r) override { + return target_.env->GetChildren(dir, r); + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { + return target_.env->GetChildrenFileAttributes(dir, result); + } + Status DeleteFile(const std::string& f) override { + return target_.env->DeleteFile(f); + } + Status Truncate(const std::string& fname, size_t size) override { + return target_.env->Truncate(fname, size); + } + Status CreateDir(const std::string& d) override { + return target_.env->CreateDir(d); + } + Status CreateDirIfMissing(const std::string& d) override { + return target_.env->CreateDirIfMissing(d); + } + Status DeleteDir(const std::string& d) override { + return target_.env->DeleteDir(d); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + return target_.env->GetFileSize(f, s); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + return target_.env->GetFileModificationTime(fname, file_mtime); + } + + Status RenameFile(const std::string& s, const std::string& t) override { + return target_.env->RenameFile(s, t); + } + + Status LinkFile(const std::string& s, const std::string& t) override { + return target_.env->LinkFile(s, t); + } + + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + return target_.env->NumFileLinks(fname, count); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + return target_.env->AreFilesSame(first, second, res); + } + + Status LockFile(const std::string& f, FileLock** l) override { + return target_.env->LockFile(f, l); + } + + Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); } + + Status IsDirectory(const std::string& path, bool* is_dir) override { + return target_.env->IsDirectory(path, is_dir); + } + + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); + } + + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); + } + + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); + } + + void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); + } + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); + } + + int ReserveThreads(int threads_to_be_reserved, Priority pri) override { + return target_.env->ReserveThreads(threads_to_be_reserved, pri); + } + + int ReleaseThreads(int threads_to_be_released, Priority pri) override { + return target_.env->ReleaseThreads(threads_to_be_released, pri); + } + + Status GetTestDirectory(std::string* path) override { + return target_.env->GetTestDirectory(path); + } + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override { + return target_.env->NewLogger(fname, result); + } + uint64_t NowMicros() override { return target_.env->NowMicros(); } + uint64_t NowNanos() override { return target_.env->NowNanos(); } + uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); } + + void SleepForMicroseconds(int micros) override { + target_.env->SleepForMicroseconds(micros); + } + Status GetHostName(char* name, uint64_t len) override { + return target_.env->GetHostName(name, len); + } + Status GetCurrentTime(int64_t* unix_time) override { + return target_.env->GetCurrentTime(unix_time); + } + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + return target_.env->GetAbsolutePath(db_path, output_path); + } + void SetBackgroundThreads(int num, Priority pri) override { + return target_.env->SetBackgroundThreads(num, pri); + } + int GetBackgroundThreads(Priority pri) override { + return target_.env->GetBackgroundThreads(pri); + } + + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); + } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + target_.env->LowerThreadPoolIOPriority(pool); + } + + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); + } + + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); + } + + std::string TimeToString(uint64_t time) override { + return target_.env->TimeToString(time); + } + + Status GetThreadList(std::vector* thread_list) override { + return target_.env->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_.env->GetThreadStatusUpdater(); + } + + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } + + std::string GenerateUniqueId() override { + return target_.env->GenerateUniqueId(); + } + + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return target_.env->OptimizeForLogRead(env_options); + } + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return target_.env->OptimizeForManifestRead(env_options); + } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return target_.env->OptimizeForLogWrite(env_options, db_options); + } + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return target_.env->OptimizeForManifestWrite(env_options); + } + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_.env->OptimizeForCompactionTableWrite(env_options, + immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForCompactionTableRead(env_options, db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForBlobFileRead(env_options, db_options); + } + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + return target_.env->GetFreeSpace(path, diskfree); + } + void SanitizeEnvOptions(EnvOptions* env_opts) const override { + target_.env->SanitizeEnvOptions(env_opts); + } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + + private: + Target target_; +}; + +class SequentialFileWrapper : public SequentialFile { + public: + explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + return target_->Read(n, result, scratch); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return target_->PositionedRead(offset, n, result, scratch); + } + + private: + SequentialFile* target_; +}; + +class RandomAccessFileWrapper : public RandomAccessFile { + public: + explicit RandomAccessFileWrapper(RandomAccessFile* target) + : target_(target) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + return target_->MultiRead(reqs, num_reqs); + } + Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + RandomAccessFile* target_; +}; + +class WritableFileWrapper : public WritableFile { + public: + explicit WritableFileWrapper(WritableFile* t) : target_(t) {} + + Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + return target_->PositionedAppend(data, offset); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } + Status Truncate(uint64_t size) override { return target_->Truncate(size); } + Status Close() override { return target_->Close(); } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetIOPriority(Env::IOPriority pri) override { + target_->SetIOPriority(pri); + } + + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { return target_->GetFileSize(); } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + return target_->RangeSync(offset, nbytes); + } + + void PrepareWrite(size_t offset, size_t len) override { + target_->PrepareWrite(offset, len); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + return target_->Allocate(offset, len); + } + + private: + WritableFile* target_; +}; + +class RandomRWFileWrapper : public RandomRWFile { + public: + explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + return target_->Write(offset, data); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + + private: + RandomRWFile* target_; +}; + +class DirectoryWrapper : public Directory { + public: + explicit DirectoryWrapper(Directory* target) : target_(target) {} + + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + Directory* target_; +}; + +class LoggerWrapper : public Logger { + public: + explicit LoggerWrapper(Logger* target) : target_(target) {} + + Status Close() override { return target_->Close(); } + void LogHeader(const char* format, va_list ap) override { + return target_->LogHeader(format, ap); + } + void Logv(const char* format, va_list ap) override { + return target_->Logv(format, ap); + } + void Logv(const InfoLogLevel log_level, const char* format, + va_list ap) override { + return target_->Logv(log_level, format, ap); + } + size_t GetLogFileSize() const override { return target_->GetLogFileSize(); } + void Flush() override { return target_->Flush(); } + InfoLogLevel GetInfoLogLevel() const override { + return target_->GetInfoLogLevel(); + } + void SetInfoLogLevel(const InfoLogLevel log_level) override { + return target_->SetInfoLogLevel(log_level); + } + + private: + Logger* target_; +}; + +// Returns a new environment that stores its data in memory and delegates +// all non-file-storage tasks to base_env. The caller must delete the result +// when it is no longer needed. +// *base_env must remain live while the result is in use. +Env* NewMemEnv(Env* base_env); + +// Returns a new environment that measures function call times for filesystem +// operations, reporting results to variables in PerfContext. +// This is a factory method for TimedEnv defined in utilities/env_timed.cc. +Env* NewTimedEnv(Env* base_env); + +// Returns an instance of logger that can be used for storing informational +// messages. +// This is a factory method for EnvLogger declared in logging/env_logging.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result); + +// Creates a new Env based on Env::Default() but modified to use the specified +// FileSystem. +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h new file mode 100644 index 000000000..282db6ed4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/env_encryption.h @@ -0,0 +1,465 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class EncryptionProvider; + +struct ConfigOptions; + +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from disk. +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider); +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base_fs, + const std::shared_ptr& provider); + +// BlockAccessCipherStream is the base class for any cipher stream that +// supports random access at block level (without requiring data from other +// blocks). E.g. CTR (Counter operation mode) supports this requirement. +class BlockAccessCipherStream { + public: + virtual ~BlockAccessCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; + + // Encrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize); + + // Decrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize); + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) = 0; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; +}; + +// BlockCipher +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class BlockCipher : public Customizable { + public: + virtual ~BlockCipher(){}; + + // Creates a new BlockCipher from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "ROT13", a ROT13BlockCipher is created. + // + // @param config_options Options to control how this cipher is created + // and initialized. + // @param value The value might be: + // - ROT13 Create a ROT13 Cipher + // - ROT13:nn Create a ROT13 Cipher with block size of nn + // @param result The new cipher object + // @return OK if the cipher was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "BlockCipher"; } + // Short-cut method to create a ROT13 BlockCipher. + // This cipher is only suitable for test purposes and should not be used in + // production!!! + static std::shared_ptr NewROT13Cipher(size_t block_size); + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) = 0; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) = 0; +}; + +// The encryption provider is used to create a cipher stream for a specific +// file. The returned cipher stream will be used for actual +// encryption/decryption actions. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class EncryptionProvider : public Customizable { + public: + virtual ~EncryptionProvider(){}; + + // Creates a new EncryptionProvider from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "CTR", a CTREncryptionProvider will be + // created. If the value is ends with "://test" (e.g CTR://test"), the + // provider will be initialized in "TEST" mode prior to being returned. + // + // @param config_options Options to control how this provider is created + // and initialized. + // @param value The value might be: + // - CTR Create a CTR provider + // - CTR://test Create a CTR provider and initialize it for tests. + // @param result The new provider object + // @return OK if the provider was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "EncryptionProvider"; } + + // Short-cut method to create a CTR-provider + static std::shared_ptr NewCTRProvider( + const std::shared_ptr& cipher); + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() const = 0; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) const = 0; + + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) = 0; + + // Returns a string representing an encryption marker prefix for this + // provider. If a marker is provided, this marker can be used to tell whether + // or not a file is encrypted by this provider. The maker will also be part + // of any encryption prefix for this provider. + virtual std::string GetMarker() const { return ""; } +}; + +class EncryptedSequentialFile : public FSSequentialFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + uint64_t offset_; + size_t prefixLength_; + + public: + // Default ctor. Given underlying sequential file is supposed to be at + // offset == prefixLength. + EncryptedSequentialFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + offset_(prefixLength), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + IOStatus Skip(uint64_t n) override; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; +}; + +// A file abstraction for randomly reading the contents of a file. +class EncryptedRandomAccessFile : public FSRandomAccessFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + EncryptedRandomAccessFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + // Readahead the file starting from offset by n bytes for caching. + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + size_t GetUniqueId(char* id, size_t max_size) const override; + + void Hint(AccessPattern pattern) override; + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class EncryptedWritableFile : public FSWritableFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + // Default ctor. Prefix is assumed to be written already. + EncryptedWritableFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + bool IsSyncThreadSafe() const override; + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + /* + * Get the size of valid data in the file. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override; + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override; + + void SetPreallocationBlockSize(size_t size) override; + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override; + + // Pre-allocates space for a file. + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; + +// A file abstraction for random reading and writing. +class EncryptedRandomRWFile : public FSRandomRWFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + EncryptedRandomRWFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; + +class EncryptedFileSystem : public FileSystemWrapper { + public: + explicit EncryptedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; + static const char* kClassName() { return "EncryptedFileSystem"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(name); + } + } +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h new file mode 100644 index 000000000..b59395255 --- /dev/null +++ b/src/rocksdb/include/rocksdb/experimental.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +namespace experimental { + +// Supported only for Leveled compaction +Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); +Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end); + +// Move all L0 files to target_level skipping compaction. +// This operation succeeds only if the files in L0 have disjoint ranges; this +// is guaranteed to happen, for instance, if keys are inserted in sorted +// order. Furthermore, all levels between 1 and target_level must be empty. +// If any of the above condition is violated, InvalidArgument will be +// returned. +Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, + int target_level = 1); + +struct UpdateManifestForFilesStateOptions { + // When true, read current file temperatures from FileSystem and update in + // DB manifest when a temperature other than Unknown is reported and + // inconsistent with manifest. + bool update_temperatures = true; + + // TODO: new_checksums: to update files to latest file checksum algorithm +}; + +// Utility for updating manifest of DB directory (not open) for current state +// of files on filesystem. See UpdateManifestForFilesStateOptions. +// +// To minimize interference with ongoing DB operations, only the following +// guarantee is provided, assuming no IO error encountered: +// * Only files live in DB at start AND end of call to +// UpdateManifestForFilesState() are guaranteed to be updated (as needed) in +// manifest. +// * For example, new files after start of call to +// UpdateManifestForFilesState() might not be updated, but that is not +// typically required to achieve goal of manifest consistency/completeness +// (because current DB configuration would ensure new files get the desired +// consistent metadata). +Status UpdateManifestForFilesState( + const DBOptions& db_opts, const std::string& db_name, + const std::vector& column_families, + const UpdateManifestForFilesStateOptions& opts = {}); + +} // namespace experimental +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h new file mode 100644 index 000000000..758bae4ac --- /dev/null +++ b/src/rocksdb/include/rocksdb/file_checksum.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The unknown file checksum. +constexpr char kUnknownFileChecksum[] = ""; +// The unknown sst file checksum function name. +constexpr char kUnknownFileChecksumFuncName[] = "Unknown"; +// The standard DB file checksum function name. +// This is the name of the checksum function returned by +// GetFileChecksumGenCrc32cFactory(); +constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c"; + +struct FileChecksumGenContext { + std::string file_name; + // The name of the requested checksum generator. + // Checksum factories may use or ignore requested_checksum_func_name, + // and checksum factories written before this field was available are still + // compatible. + std::string requested_checksum_func_name; +}; + +// FileChecksumGenerator is the class to generates the checksum value +// for each file when the file is written to the file system. +// Implementations may assume that +// * Finalize is called at most once during the life of the object +// * All calls to Update come before Finalize +// * All calls to GetChecksum come after Finalize +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenerator { + public: + virtual ~FileChecksumGenerator() {} + + // Update the current result after process the data. For different checksum + // functions, the temporal results may be stored and used in Update to + // include the new data. + virtual void Update(const char* data, size_t n) = 0; + + // Generate the final results if no further new data will be updated. + virtual void Finalize() = 0; + + // Get the checksum. The result should not be the empty string and may + // include arbitrary bytes, including non-printable characters. + virtual std::string GetChecksum() const = 0; + + // Returns a name that identifies the current file checksum function. + virtual const char* Name() const = 0; +}; + +// Create the FileChecksumGenerator object for each SST file. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenFactory : public Customizable { + public: + ~FileChecksumGenFactory() override {} + static const char* Type() { return "FileChecksumGenFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + // Create a new FileChecksumGenerator. + virtual std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) = 0; + + // Return the name of this FileChecksumGenFactory. + const char* Name() const override = 0; +}; + +// FileChecksumList stores the checksum information of a list of files (e.g., +// SST files). The FileChecksumList can be used to store the checksum +// information of all SST file getting from the MANIFEST, which are +// the checksum information of all valid SST file of a DB instance. It can +// also be used to store the checksum information of a list of SST files to +// be ingested. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumList { + public: + virtual ~FileChecksumList() {} + + // Clean the previously stored file checksum information. + virtual void reset() = 0; + + // Get the number of checksums in the checksum list + virtual size_t size() const = 0; + + // Return all the file checksum information being stored in a unordered_map. + // File_number is the key, the first part of the value is checksum value, + // and the second part of the value is checksum function name. + virtual Status GetAllFileChecksums( + std::vector* file_numbers, std::vector* checksums, + std::vector* checksum_func_names) = 0; + + // Given the file_number, it searches if the file checksum information is + // stored. + virtual Status SearchOneFileChecksum(uint64_t file_number, + std::string* checksum, + std::string* checksum_func_name) = 0; + + // Insert the checksum information of one file to the FileChecksumList. + virtual Status InsertOneFileChecksum( + uint64_t file_number, const std::string& checksum, + const std::string& checksum_func_name) = 0; + + // Remove the checksum information of one SST file. + virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0; +}; + +// Create a new file checksum list. +extern FileChecksumList* NewFileChecksumList(); + +// Return a shared_ptr of the builtin Crc32c based file checksum generator +// factory object, which can be shared to create the Crc32c based checksum +// generator object. +// Note: this implementation is compatible with many other crc32c checksum +// implementations and uses big-endian encoding of the result, unlike most +// other crc32c checksums in RocksDB, which alter the result with +// crc32c::Mask and use little-endian encoding. +extern std::shared_ptr +GetFileChecksumGenCrc32cFactory(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h new file mode 100644 index 000000000..91ad47218 --- /dev/null +++ b/src/rocksdb/include/rocksdb/file_system.h @@ -0,0 +1,1849 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A FileSystem is an interface used by the rocksdb implementation to access +// storage functionality like the filesystem etc. Callers +// may wish to provide a custom FileSystem object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All FileSystem implementations are safe for concurrent access from +// multiple threads without any external synchronization. +// +// WARNING: Since this is a new interface, it is expected that there will be +// some changes as storage systems are ported over. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/thread_status.h" + +namespace ROCKSDB_NAMESPACE { + +class FileLock; +class FSDirectory; +class FSRandomAccessFile; +class FSRandomRWFile; +class FSSequentialFile; +class FSWritableFile; +class Logger; +class Slice; +struct ImmutableDBOptions; +struct MutableDBOptions; +class RateLimiter; +struct ConfigOptions; + +using AccessPattern = RandomAccessFile::AccessPattern; +using FileAttributes = Env::FileAttributes; + +// DEPRECATED +// Priority of an IO request. This is a hint and does not guarantee any +// particular QoS. +// IO_LOW - Typically background reads/writes such as compaction/flush +// IO_HIGH - Typically user reads/synchronous WAL writes +enum class IOPriority : uint8_t { + kIOLow, + kIOHigh, + kIOTotal, +}; + +// Type of the data begin read/written. It can be passed down as a flag +// for the FileSystem implementation to optionally handle different types in +// different ways +enum class IOType : uint8_t { + kData, + kFilter, + kIndex, + kMetadata, + kWAL, + kManifest, + kLog, + kUnknown, + kInvalid, +}; + +// Per-request options that can be passed down to the FileSystem +// implementation. These are hints and are not necessarily guaranteed to be +// honored. More hints can be added here in the future to indicate things like +// storage media (HDD/SSD) to be used, replication level etc. +struct IOOptions { + // Timeout for the operation in microseconds + std::chrono::microseconds timeout; + + // DEPRECATED + // Priority - high or low + IOPriority prio; + + // Priority used to charge rate limiter configured in file system level (if + // any) + // Limitation: right now RocksDB internal does not consider this + // rate_limiter_priority + Env::IOPriority rate_limiter_priority; + + // Type of data being read/written + IOType type; + + // EXPERIMENTAL + // An option map that's opaque to RocksDB. It can be used to implement a + // custom contract between a FileSystem user and the provider. This is only + // useful in cases where a RocksDB user directly uses the FileSystem or file + // object for their own purposes, and wants to pass extra options to APIs + // such as NewRandomAccessFile and NewWritableFile. + std::unordered_map property_bag; + + // Force directory fsync, some file systems like btrfs may skip directory + // fsync, set this to force the fsync + bool force_dir_fsync; + + // Can be used by underlying file systems to skip recursing through sub + // directories and list only files in GetChildren API. + bool do_not_recurse; + + IOOptions() : IOOptions(false) {} + + explicit IOOptions(bool force_dir_fsync_) + : timeout(std::chrono::microseconds::zero()), + prio(IOPriority::kIOLow), + rate_limiter_priority(Env::IO_TOTAL), + type(IOType::kUnknown), + force_dir_fsync(force_dir_fsync_), + do_not_recurse(false) {} +}; + +struct DirFsyncOptions { + enum FsyncReason : uint8_t { + kNewFileSynced, + kFileRenamed, + kDirRenamed, + kFileDeleted, + kDefault, + } reason; + + std::string renamed_new_name; // for kFileRenamed + // add other options for other FsyncReason + + DirFsyncOptions(); + + explicit DirFsyncOptions(std::string file_renamed_new_name); + + explicit DirFsyncOptions(FsyncReason fsync_reason); +}; + +// File scope options that control how a file is opened/created and accessed +// while its open. We may add more options here in the future such as +// redundancy level, media to use etc. +struct FileOptions : EnvOptions { + // Embedded IOOptions to control the parameters for any IOs that need + // to be issued for the file open/creation + IOOptions io_options; + + // EXPERIMENTAL + // The feature is in development and is subject to change. + // When creating a new file, set the temperature of the file so that + // underlying file systems can put it with appropriate storage media and/or + // coding. + Temperature temperature = Temperature::kUnknown; + + // The checksum type that is used to calculate the checksum value for + // handoff during file writes. + ChecksumType handoff_checksum_type; + + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} + + FileOptions(const DBOptions& opts) + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + + FileOptions(const EnvOptions& opts) + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + + FileOptions(const FileOptions& opts) + : EnvOptions(opts), + io_options(opts.io_options), + temperature(opts.temperature), + handoff_checksum_type(opts.handoff_checksum_type) {} + + FileOptions& operator=(const FileOptions&) = default; +}; + +// A structure to pass back some debugging information from the FileSystem +// implementation to RocksDB in case of an IO error +struct IODebugContext { + // file_path to be filled in by RocksDB in case of an error + std::string file_path; + + // A map of counter names to values - set by the FileSystem implementation + std::map counters; + + // To be set by the FileSystem implementation + std::string msg; + + // To be set by the underlying FileSystem implementation. + std::string request_id; + + // In order to log required information in IO tracing for different + // operations, Each bit in trace_data stores which corresponding info from + // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it + // means bit at position 0 is set so TraceData::kRequestID (request_id) will + // be logged in the trace record. + // + enum TraceData : char { + // The value of each enum represents the bitwise position for + // that information in trace_data which will be used by IOTracer for + // tracing. Make sure to add them sequentially. + kRequestID = 0, + }; + uint64_t trace_data = 0; + + IODebugContext() {} + + void AddCounter(std::string& name, uint64_t value) { + counters.emplace(name, value); + } + + // Called by underlying file system to set request_id and log request_id in + // IOTracing. + void SetRequestId(const std::string& _request_id) { + request_id = _request_id; + trace_data |= (1 << TraceData::kRequestID); + } + + std::string ToString() { + std::ostringstream ss; + ss << file_path << ", "; + for (auto counter : counters) { + ss << counter.first << " = " << counter.second << ","; + } + ss << msg; + return ss.str(); + } +}; + +// A function pointer type for custom destruction of void pointer passed to +// ReadAsync API. RocksDB/caller is responsible for deleting the void pointer +// allocated by FS in ReadAsync API. +using IOHandleDeleter = std::function; + +// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile, +// FSRandomRWFileclass, and FSDIrectory classes define the interface between +// RocksDB and storage systems, such as Posix filesystems, +// remote filesystems etc. +// The interface allows for fine grained control of individual IO operations, +// such as setting a timeout, prioritization, hints on data placement, +// different handling based on type of IO etc. +// This is accomplished by passing an instance of IOOptions to every +// API call that can potentially perform IO. Additionally, each such API is +// passed a pointer to a IODebugContext structure that can be used by the +// storage system to include troubleshooting information. The return values +// of the APIs is of type IOStatus, which can indicate an error code/sub-code, +// as well as metadata about the error such as its scope and whether its +// retryable. +// NewCompositeEnv can be used to create an Env with a custom FileSystem for +// DBOptions::env. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileSystem : public Customizable { + public: + FileSystem(); + + // No copying allowed + FileSystem(const FileSystem&) = delete; + + virtual ~FileSystem(); + + static const char* Type() { return "FileSystem"; } + static const char* kDefaultName() { return "DefaultFileSystem"; } + + // Loads the FileSystem specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. + static Status Load(const std::string& value, + std::shared_ptr* result); + + // Loads the FileSystem specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // @param config_options Controls how the FileSystem is loaded + // @param value The name and optional properties describing the file system + // to load. + // @param result On success, returns the loaded FileSystem + // @return OK if the FileSystem was successfully loaded. + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + + // Return a default FileSystem suitable for the current operating + // system. + static std::shared_ptr Default(); + + // Handles the event when a new DB or a new ColumnFamily starts using the + // specified data paths. + // + // The data paths might be shared by different DBs or ColumnFamilies, + // so RegisterDbPaths might be called with the same data paths. + // For example, when CreateColumnFamily is called multiple times with the same + // data path, RegisterDbPaths will also be called with the same data path. + // + // If the return status is ok, then the paths must be correspondingly + // called in UnregisterDbPaths; + // otherwise this method should have no side effect, and UnregisterDbPaths + // do not need to be called for the paths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Handles the event a DB or a ColumnFamily stops using the specified data + // paths. + // + // It should be called corresponding to each successful RegisterDbPaths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual IOStatus NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + kWLTHNotSet = 0, // No hint information set + kWLTHNone, // No hints about write life time + kWLTHShort, // Data written has a short life time + kWLTHMedium, // Data written has a medium life time + kWLTHLong, // Data written has a long life time + kWLTHExtreme, // Data written has an extremely long life time + }; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) = 0; + + // Create an object that writes to a file with the specified name. + // `FSWritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus ReopenWritableFile( + const std::string& /*fname*/, const FileOptions& /*options*/, + std::unique_ptr* /*result*/, IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("ReopenWritableFile"); + } + + // Reuse an existing file by renaming it and opening it as writable. + virtual IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg); + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "RandomRWFile is not implemented in this FileSystem"); + } + + // Opens `fname` as a memory-mapped file for read and write (in-place updates + // only, i.e., no appends). On success, stores a raw buffer covering the whole + // file in `*result`. The file must exist prior to this call. + virtual IOStatus NewMemoryMappedFileBuffer( + const std::string& /*fname*/, + std::unique_ptr* /*result*/) { + return IOStatus::NotSupported( + "MemoryMappedFileBuffer is not implemented in this FileSystem"); + } + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + virtual IOStatus NewDirectory(const std::string& name, + const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) = 0; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + virtual IOStatus FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) = 0; + + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual IOStatus GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + assert(result != nullptr); + std::vector child_fnames; + IOStatus s = GetChildren(dir, options, &child_fnames, dbg); + if (!s.ok()) { + return s; + } + result->resize(child_fnames.size()); + size_t result_size = 0; + for (size_t i = 0; i < child_fnames.size(); ++i) { + const std::string path = dir + "/" + child_fnames[i]; + if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes, + dbg)) + .ok()) { + if (FileExists(path, options, dbg).IsNotFound()) { + // The file may have been deleted since we listed the directory + continue; + } + return s; + } + (*result)[result_size].name = std::move(child_fnames[i]); + result_size++; + } + result->resize(result_size); + return IOStatus::OK(); + } + +// This seems to clash with a macro on Windows, so #undef it here +#ifdef DeleteFile +#undef DeleteFile +#endif + // Delete the named file. + virtual IOStatus DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Truncate the named file to the specified size. + virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "Truncate is not supported for this FileSystem"); + } + + // Create the specified directory. Returns error if directory exists. + virtual IOStatus CreateDir(const std::string& dirname, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Delete the specified directory. + virtual IOStatus DeleteDir(const std::string& dirname, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Store the size of fname in *file_size. + virtual IOStatus GetFileSize(const std::string& fname, + const IOOptions& options, uint64_t* file_size, + IODebugContext* dbg) = 0; + + // Store the last modification time of fname in *file_mtime. + virtual IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) = 0; + // Rename file src to target. + virtual IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Hard Link file src to target. + virtual IOStatus LinkFile(const std::string& /*src*/, + const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "LinkFile is not supported for this FileSystem"); + } + + virtual IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*count*/, IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "Getting number of file links is not supported for this FileSystem"); + } + + virtual IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "AreFilesSame is not supported for this FileSystem"); + } + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) = 0; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide custom + // logger. + virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg); + + // Get full directory name for this db. + virtual IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) = 0; + + // Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions + // copy constructor + virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {} + + // OptimizeForLogRead will create a new FileOptions object that is a copy of + // the FileOptions in the parameters, but is optimized for reading log files. + virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const; + + // OptimizeForManifestRead will create a new FileOptions object that is a copy + // of the FileOptions in the parameters, but is optimized for reading manifest + // files. + virtual FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const; + + // OptimizeForLogWrite will create a new FileOptions object that is a copy of + // the FileOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const; + + // OptimizeForManifestWrite will create a new FileOptions object that is a + // copy of the FileOptions in the parameters, but is optimized for writing + // manifest files. Default implementation returns the copy of the same + // object. + virtual FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const; + + // OptimizeForCompactionTableWrite will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // writing table files. + virtual FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const; + + // OptimizeForCompactionTableRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading table files. + virtual FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. + virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + +// This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + + // Get the amount of free disk space + virtual IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, + uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("GetFreeSpace"); + } + + virtual IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& options, bool* is_dir, + IODebugContext* /*dgb*/) = 0; + + // EXPERIMENTAL + // Poll for completion of read IO requests. The Poll() method should call the + // callback functions to indicate completion of read requests. + // Underlying FS is required to support Poll API. Poll implementation should + // ensure that the callback gets called at IO completion, and return only + // after the callback has been called. + // If Poll returns partial results for any reads, its caller reponsibility to + // call Read or ReadAsync in order to get the remaining bytes. + // + // Default implementation is to return IOStatus::OK. + + virtual IOStatus Poll(std::vector& /*io_handles*/, + size_t /*min_completions*/) { + return IOStatus::OK(); + } + + // EXPERIMENTAL + // Abort the read IO requests submitted asynchronously. Underlying FS is + // required to support AbortIO API. AbortIO implementation should ensure that + // the all the read requests related to io_handles should be aborted and + // it shouldn't call the callback for these io_handles. + // + // Default implementation is to return IOStatus::OK. + virtual IOStatus AbortIO(std::vector& /*io_handles*/) { + return IOStatus::OK(); + } + + // If you're adding methods here, remember to add them to EnvWrapper too. + + private: + void operator=(const FileSystem&); +}; + +// A file abstraction for reading sequentially through a file +class FSSequentialFile { + public: + FSSequentialFile() {} + + virtual ~FSSequentialFile() {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // REQUIRES: External synchronization + virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual IOStatus Skip(uint64_t n) = 0; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + Slice* /*result*/, char* /*scratch*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedRead"); + } + + // EXPERIMENTAL + // When available, returns the actual temperature for the file. This is + // useful in case some outside process moves a file from one tier to another, + // though the temperature is generally expected not to change while a file is + // open. + virtual Temperature GetTemperature() const { return Temperature::kUnknown; } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. +}; + +// A read IO request structure for use in MultiRead and asynchronous Read APIs. +struct FSReadRequest { + // Input parameter that represents the file offset in bytes. + uint64_t offset; + + // Input parameter that represents the length to read in bytes. `result` only + // returns fewer bytes if end of file is hit (or `status` is not OK). + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer. + // The lifecycle of scratch will be until IO is completed. + // + // In case of asynchronous reads, its an output parameter and it will be + // maintained until callback has been called. Scratch is allocated by RocksDB + // and will be passed to underlying FileSystem. + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + // + // In case of asynchronous reads, this output parameter is set by Async Read + // APIs to point to the data buffer, and + // the number of valid bytes. + // Slice result should point to scratch i.e the data should + // always be read into scratch. + Slice result; + + // Output parameter set by underlying FileSystem that represents status of + // read request. + IOStatus status; +}; + +// A file abstraction for randomly reading the contents of a file. +class FSRandomAccessFile { + public: + FSRandomAccessFile() {} + + virtual ~FSRandomAccessFile() {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const = 0; + + // Readahead the file starting from offset by n bytes for caching. + // If it's not implemented (default: `NotSupported`), RocksDB will create + // internal prefetch buffer to improve read performance. + virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("Prefetch"); + } + + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping but can be in any order. If the function return Status + // is not ok, status of individual requests will be ignored and return + // status will be assumed for all read requests. The function return status + // is only meant for errors that occur before processing individual read + // requests. + virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + FSReadRequest& req = reqs[i]; + req.status = + Read(req.offset, req.len, options, &req.result, req.scratch, dbg); + } + return IOStatus::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + }; + + enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed }; + + virtual void Hint(AccessPattern /*pattern*/) {} + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // EXPERIMENTAL + // This API reads the requested data in FSReadRequest asynchronously. This is + // a asynchronous call, i.e it should return after submitting the request. + // + // When the read request is completed, callback function specified in cb + // should be called with arguments cb_arg and the result populated in + // FSReadRequest with result and status fileds updated by FileSystem. + // cb_arg should be used by the callback to track the original request + // submitted. + // + // This API should also populate io_handle which should be used by + // underlying FileSystem to store the context in order to distinguish the read + // requests at their side and provide the custom deletion function in del_fn. + // RocksDB guarantees that the del_fn for io_handle will be called after + // receiving the callback. Furthermore, RocksDB guarantees that if it calls + // the Poll API for this io_handle, del_fn will be called after the Poll + // returns. RocksDB is responsible for managing the lifetime of io_handle. + // + // req contains the request offset and size passed as input parameter of read + // request and result and status fields are output parameter set by underlying + // FileSystem. The data should always be read into scratch field. + // + // Default implementation is to read the data synchronously. + virtual IOStatus ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function cb, void* cb_arg, + void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) { + req.status = + Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg); + cb(req, cb_arg); + return IOStatus::OK(); + } + + // EXPERIMENTAL + // When available, returns the actual temperature for the file. This is + // useful in case some outside process moves a file from one tier to another, + // though the temperature is generally expected not to change while a file is + // open. + virtual Temperature GetTemperature() const { return Temperature::kUnknown; } + + // If you're adding methods here, remember to add them to + // RandomAccessFileWrapper too. +}; + +// A data structure brings the data verification information, which is +// used together with data being written to a file. +struct DataVerificationInfo { + // checksum of the data being written. + Slice checksum; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class FSWritableFile { + public: + FSWritableFile() + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(false) {} + + explicit FSWritableFile(const FileOptions& options) + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + + virtual ~FSWritableFile() {} + + // Append data to the end of the file + // Note: A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + virtual IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) = 0; + + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) { + return Append(data, options, dbg); + } + + // PositionedAppend data to the specified offset. The new EOF after append + // must be larger than the previous EOF. This is to be used when writes are + // not backed by OS buffers and hence has to always start from the start of + // the sector. The implementation thus needs to also rewrite the last + // partial sector. + // Note: PositionAppend does not guarantee moving the file offset after the + // write. A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + // + // PositionedAppend() can only happen on the page/sector boundaries. For that + // reason, if the last write was an incomplete sector we still need to rewind + // back to the nearest sector/page and rewrite the portion of it with whatever + // we need to add. We need to keep where we stop writing. + // + // PositionedAppend() can only write whole sectors. For that reason we have to + // pad with zeros for the last write and trim the file when closing according + // to the position we keep in the previous step. + // + // PositionedAppend() requires aligned buffer to be passed in. The alignment + // required is queried via GetRequiredBufferAlignment() + virtual IOStatus PositionedAppend(const Slice& /* data */, + uint64_t /* offset */, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedAppend"); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual IOStatus PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const IOOptions& /*options*/, + const DataVerificationInfo& /* verification_info */, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedAppend"); + } + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); + } + virtual IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) = 0; + + virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0; + virtual IOStatus Sync(const IOOptions& options, + IODebugContext* dbg) = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) { + return Sync(options, dbg); + } + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + virtual bool IsSyncThreadSafe() const { return false; } + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + /* + * If rate limiting is enabled, change the file-granularity priority used in + * rate-limiting writes. + * + * In the presence of finer-granularity priority such as + * `WriteOptions::rate_limiter_priority`, this file-granularity priority may + * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as + * a fallback for Env::IO_TOTAL finer-granularity priority. + * + * If rate limiting is not enabled, this call has no effect. + */ + virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; } + + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return 0; + } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + virtual void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, + const IOOptions& options, IODebugContext* dbg) { + if (strict_bytes_per_sync_) { + return Sync(options, dbg); + } + return IOStatus::OK(); + } + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks, options, dbg) + .PermitUncheckedError(); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + // Pre-allocates space for a file. + virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); + } + + // If you're adding methods here, remember to add them to + // WritableFileWrapper too. + + protected: + size_t preallocation_block_size() { return preallocation_block_size_; } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + // No copying allowed + FSWritableFile(const FSWritableFile&); + void operator=(const FSWritableFile&); + + protected: + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + const bool strict_bytes_per_sync_; +}; + +// A file abstraction for random reading and writing. +class FSRandomRWFile { + public: + FSRandomRWFile() {} + + virtual ~FSRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // Returns Status::OK() on success. + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const = 0; + + virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0; + + virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0; + + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) { + return Sync(options, dbg); + } + + virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0; + + // EXPERIMENTAL + // When available, returns the actual temperature for the file. This is + // useful in case some outside process moves a file from one tier to another, + // though the temperature is generally expected not to change while a file is + // open. + virtual Temperature GetTemperature() const { return Temperature::kUnknown; } + + // If you're adding methods here, remember to add them to + // RandomRWFileWrapper too. + + // No copying allowed + FSRandomRWFile(const RandomRWFile&) = delete; + FSRandomRWFile& operator=(const RandomRWFile&) = delete; +}; + +// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. +// Subclasses should release the mapping upon destruction. +class FSMemoryMappedFileBuffer { + public: + FSMemoryMappedFileBuffer(void* _base, size_t _length) + : base_(_base), length_(_length) {} + + virtual ~FSMemoryMappedFileBuffer() = 0; + + // We do not want to unmap this twice. We can make this class + // movable if desired, however, since + FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete; + FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete; + + void* GetBase() const { return base_; } + size_t GetLen() const { return length_; } + + protected: + void* base_; + const size_t length_; +}; + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class FSDirectory { + public: + virtual ~FSDirectory() {} + // Fsync directory. Can be called concurrently from multiple threads. + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; + + // FsyncWithDirOptions after renaming a file. Depends on the filesystem, it + // may fsync directory or just the renaming file (e.g. btrfs). By default, it + // just calls directory fsync. + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& /*dir_fsync_options*/) { + return Fsync(options, dbg); + } + + // Close directory + virtual IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("Close"); + } + + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; + } + + // If you're adding methods here, remember to add them to + // DirectoryWrapper too. +}; + +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public +// ROCKSDB_NAMESPACE::FSSequentialFileWrapper { +// public: +// MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target): +// ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {} +// Status Read(size_t n, FileSystem::IOOptions& options, Slice* result, +// char* scratch, FileSystem::IODebugContext* dbg) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options, +// result, +// scratch, dbg); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class FileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit FileSystemWrapper(const std::shared_ptr& t); + ~FileSystemWrapper() override {} + + // Return the target to which this Env forwards all calls + FileSystem* target() const { return target_.get(); } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + return target_->NewSequentialFile(f, file_opts, r, dbg); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + return target_->NewRandomAccessFile(f, file_opts, r, dbg); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + return target_->NewWritableFile(f, file_opts, r, dbg); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + return target_->ReopenWritableFile(fname, file_opts, result, dbg); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + return target_->NewRandomRWFile(fname, file_opts, result, dbg); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return target_->NewMemoryMappedFileBuffer(fname, result); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + return target_->NewDirectory(name, io_opts, result, dbg); + } + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override { + return target_->FileExists(f, io_opts, dbg); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override { + return target_->GetChildren(dir, io_opts, r, dbg); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + return target_->GetChildrenFileAttributes(dir, options, result, dbg); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteFile(f, options, dbg); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override { + return target_->Truncate(fname, size, options, dbg); + } + IOStatus CreateDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDir(d, options, dbg); + } + IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDirIfMissing(d, options, dbg); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteDir(d, options, dbg); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& options, + uint64_t* s, IODebugContext* dbg) override { + return target_->GetFileSize(f, options, s, dbg); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override { + return target_->GetFileModificationTime(fname, options, file_mtime, dbg); + } + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override { + return target_->GetAbsolutePath(db_path, options, output_path, dbg); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->RenameFile(s, t, options, dbg); + } + + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->LinkFile(s, t, options, dbg); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& options, + uint64_t* count, IODebugContext* dbg) override { + return target_->NumFileLinks(fname, options, count, dbg); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& options, bool* res, + IODebugContext* dbg) override { + return target_->AreFilesSame(first, second, options, res, dbg); + } + + IOStatus LockFile(const std::string& f, const IOOptions& options, + FileLock** l, IODebugContext* dbg) override { + return target_->LockFile(f, options, l, dbg); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& options, + IODebugContext* dbg) override { + return target_->UnlockFile(l, options, dbg); + } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override { + return target_->GetTestDirectory(options, path, dbg); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override { + return target_->NewLogger(fname, options, result, dbg); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeFileOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, + uint64_t* diskfree, IODebugContext* dbg) override { + return target_->GetFreeSpace(path, options, diskfree, dbg); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override { + return target_->IsDirectory(path, options, is_dir, dbg); + } + + const Customizable* Inner() const override { return target_.get(); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + + virtual IOStatus Poll(std::vector& io_handles, + size_t min_completions) override { + return target_->Poll(io_handles, min_completions); + } + + virtual IOStatus AbortIO(std::vector& io_handles) override { + return target_->AbortIO(io_handles); + } + + protected: + std::shared_ptr target_; +}; + +class FSSequentialFileWrapper : public FSSequentialFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {} + + FSSequentialFile* target() const { return target_; } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + return target_->Read(n, options, result, scratch, dbg); + } + IOStatus Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override { + return target_->PositionedRead(offset, n, options, result, scratch, dbg); + } + Temperature GetTemperature() const override { + return target_->GetTemperature(); + } + + private: + FSSequentialFile* target_; +}; + +class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSSequentialFileOwnerWrapper(std::unique_ptr&& t) + : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + +class FSRandomAccessFileWrapper : public FSRandomAccessFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {} + + FSRandomAccessFile* target() const { return target_; } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return target_->Read(offset, n, options, result, scratch, dbg); + } + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override { + return target_->MultiRead(reqs, num_reqs, options, dbg); + } + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Prefetch(offset, n, options, dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + }; + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override { + return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg); + } + Temperature GetTemperature() const override { + return target_->GetTemperature(); + } + + private: + std::unique_ptr guard_; + FSRandomAccessFile* target_; +}; + +class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomAccessFileOwnerWrapper( + std::unique_ptr&& t) + : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + +class FSWritableFileWrapper : public FSWritableFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {} + + FSWritableFile* target() const { return target_; } + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Append(data, options, dbg); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->Append(data, options, verification_info, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, verification_info, + dbg); + } + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Truncate(size, options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + return target_->GetFileSize(options, dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + return target_->RangeSync(offset, nbytes, options, dbg); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override { + target_->PrepareWrite(offset, len, options, dbg); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Allocate(offset, len, options, dbg); + } + + private: + FSWritableFile* target_; +}; + +class FSWritableFileOwnerWrapper : public FSWritableFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSWritableFileOwnerWrapper(std::unique_ptr&& t) + : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + +class FSRandomRWFileWrapper : public FSRandomRWFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {} + + FSRandomRWFile* target() const { return target_; } + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Write(offset, data, options, dbg); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return target_->Read(offset, n, options, result, scratch, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + Temperature GetTemperature() const override { + return target_->GetTemperature(); + } + + private: + FSRandomRWFile* target_; +}; + +class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomRWFileOwnerWrapper(std::unique_ptr&& t) + : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + +class FSDirectoryWrapper : public FSDirectory { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSDirectoryWrapper(std::unique_ptr&& t) + : guard_(std::move(t)) { + target_ = guard_.get(); + } + + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {} + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr guard_; + FSDirectory* target_; +}; + +// A utility routine: write "data" to the named file. +extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, + bool should_sync = false); + +// A utility routine: read contents of named file into *data +extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h new file mode 100644 index 000000000..954d15b4a --- /dev/null +++ b/src/rocksdb/include/rocksdb/filter_policy.h @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in rocksdb and are consulted +// automatically by rocksdb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks form a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "rocksdb/advanced_options.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +struct BlockBasedTableOptions; +struct ConfigOptions; + +// As of RocksDB 7.0, the details of these classes are internal +class FilterBitsBuilder; +class FilterBitsReader; + +// Contextual information passed to BloomFilterPolicy at filter building time. +// Used in overriding FilterPolicy::GetBuilderWithContext(). References other +// structs because this is expected to be a temporary, stack-allocated object. +struct FilterBuildingContext { + // This constructor is for internal use only and subject to change. + FilterBuildingContext(const BlockBasedTableOptions& table_options); + + // Options for the table being built + const BlockBasedTableOptions& table_options; + + // BEGIN from (DB|ColumnFamily)Options in effect at table creation time + CompactionStyle compaction_style = kCompactionStyleLevel; + + // Number of LSM levels, or -1 if unknown + int num_levels = -1; + + // An optional logger for reporting errors, warnings, etc. + Logger* info_log = nullptr; + // END from (DB|ColumnFamily)Options + + // Name of the column family for the table (or empty string if unknown) + // TODO: consider changing to Slice + std::string column_family_name; + + // The table level at time of constructing the SST file, or -1 if unknown + // or N/A as in SstFileWriter. (The table file could later be used at a + // different level.) + int level_at_creation = -1; + + // True if known to be going into bottommost sorted run for applicable + // key range (which might not even be last level with data). False + // otherwise. + bool is_bottommost = false; + + // Reason for creating the file with the filter + TableFileCreationReason reason = TableFileCreationReason::kMisc; +}; + +// Determines what kind of filter (if any) to generate in SST files, and under +// which conditions. API users can create custom filter policies that +// defer to other built-in policies (see NewBloomFilterPolicy and +// NewRibbonFilterPolicy) based on the context provided to +// GetBuilderWithContext. +class FilterPolicy : public Customizable { + public: + virtual ~FilterPolicy(); + static const char* Type() { return "FilterPolicy"; } + + // The name used for identifying whether a filter on disk is readable + // by this FilterPolicy. If this FilterPolicy is part of a family that + // can read each others filters, such as built-in BloomFilterPolcy and + // RibbonFilterPolicy, the CompatibilityName is a shared family name, + // while kinds of filters in the family can have distinct Customizable + // Names. This function is pure virtual so that wrappers around built-in + // policies are prompted to defer to CompatibilityName() of the wrapped + // policy, which is important for compatibility. + // + // For custom filter policies that are not part of a read-compatible + // family (rare), implementations may return Name(). + virtual const char* CompatibilityName() const = 0; + + // Creates a new FilterPolicy based on the input value string and returns the + // result The value might be an ID, and ID with properties, or an old-style + // policy string. + // The value describes the FilterPolicy being created. + // For BloomFilters, value may be a ":"-delimited value of the form: + // "bloomfilter:[bits_per_key]", + // e.g. ""bloomfilter:4" + // The above string is equivalent to calling NewBloomFilterPolicy(4). + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + // Return a new FilterBitsBuilder for constructing full or partitioned + // filter blocks, or return nullptr to indicate "no filter". Custom + // implementations should defer to a built-in FilterPolicy to get a + // new FilterBitsBuilder, but the FilterBuildingContext can be used + // to decide which built-in FilterPolicy to defer to. + virtual FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const = 0; + + // Return a new FilterBitsReader for full or partitioned filter blocks. + // Caller retains ownership of any buffer pointed to by the input Slice. + // Custom implementation should defer to GetFilterBitsReader on any + // built-in FilterPolicy, which can read filters generated by any other + // built-in FilterPolicy. + virtual FilterBitsReader* GetFilterBitsReader( + const Slice& /*contents*/) const = 0; +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. See +// https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter +// +// bits_per_key: average bits allocated per key in bloom filter. A good +// choice is 9.9, which yields a filter with ~ 1% false positive rate. +// When format_version < 5, the value will be rounded to the nearest +// integer. Recommend using no more than three decimal digits after the +// decimal point, as in 6.667. +// +// To avoid configurations that are unlikely to produce good filtering +// value for the CPU overhead, bits_per_key < 0.5 is rounded down to 0.0 +// which means "generate no filter", and 0.5 <= bits_per_key < 1.0 is +// rounded up to 1.0, for a 62% FP rate. +// +// The caller is responsible for eventually deleting the result, though +// this is typically handled automatically with BlockBasedTableOptions: +// table_options.filter_policy.reset(NewBloomFilterPolicy(...)); +// +// As of RocksDB 7.0, the use_block_based_builder parameter is ignored. +// (The old, inefficient block-based filter is no longer accessible in +// the public API.) +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. +extern const FilterPolicy* NewBloomFilterPolicy( + double bits_per_key, bool IGNORED_use_block_based_builder = false); + +// A new Bloom alternative that saves about 30% space compared to +// Bloom filters, with similar query times but roughly 3-4x CPU time +// and 3x temporary space usage during construction. For example, if +// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same +// 0.95% FP rate as Bloom filter but only using about 7 bits per key. +// +// The space savings of Ribbon filters makes sense for lower (higher +// numbered; larger; longer-lived) levels of LSM, whereas the speed of +// Bloom filters make sense for highest levels of LSM. Setting +// bloom_before_level allows for this design with Level and Universal +// compaction styles. For example, bloom_before_level=1 means that Bloom +// filters will be used in level 0, including flushes, and Ribbon +// filters elsewhere, including FIFO compaction and external SST files. +// For this option, memtable flushes are considered level -1 (so that +// flushes can be distinguished from intra-L0 compaction). +// bloom_before_level=0 (default) -> Generate Bloom filters only for +// flushes under Level and Universal compaction styles. +// bloom_before_level=-1 -> Always generate Ribbon filters (except in +// some extreme or exceptional cases). +// +// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier +// versions reading the data will behave as if no filter was used +// (degraded performance until compaction rebuilds filters). All +// built-in FilterPolicies (Bloom or Ribbon) are able to read other +// kinds of built-in filters. +// +// Note: the current Ribbon filter schema uses some extra resources +// when constructing very large filters. For example, for 100 million +// keys in a single filter (one SST file without partitioned filters), +// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom. +// However, the savings in filter space from just ~60 open SST files +// makes up for the additional temporary memory use. +// +// Also consider using optimize_filters_for_memory to save filter +// memory. +extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key, int bloom_before_level = 0); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h new file mode 100644 index 000000000..7a5dd957e --- /dev/null +++ b/src/rocksdb/include/rocksdb/flush_block_policy.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class BlockBuilder; +struct ConfigOptions; +struct Options; + +// FlushBlockPolicy provides a configurable way to determine when to flush a +// block in the block based tables. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FlushBlockPolicy { + public: + // Keep track of the key/value sequences and return the boolean value to + // determine if table builder should flush current data block. + virtual bool Update(const Slice& key, const Slice& value) = 0; + + virtual ~FlushBlockPolicy() {} +}; + +class FlushBlockPolicyFactory : public Customizable { + public: + static const char* Type() { return "FlushBlockPolicyFactory"; } + + // Creates a FlushBlockPolicyFactory based on the input value. + // By default, this method can create EveryKey or BySize PolicyFactory, + // which take now config_options. + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result); + + // Return a new block flush policy that flushes data blocks by data size. + // FlushBlockPolicy may need to access the metadata of the data block + // builder to determine when to flush the blocks. + // + // Callers must delete the result after any database that is using the + // result has been closed. + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const = 0; + + virtual ~FlushBlockPolicyFactory() {} +}; + +class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { + public: + FlushBlockBySizePolicyFactory(); + + static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; } + const char* Name() const override { return kClassName(); } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const override; + + static FlushBlockPolicy* NewFlushBlockPolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/functor_wrapper.h b/src/rocksdb/include/rocksdb/functor_wrapper.h new file mode 100644 index 000000000..17b021bf7 --- /dev/null +++ b/src/rocksdb/include/rocksdb/functor_wrapper.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace detail { +template +struct IndexSequence {}; + +template +struct IndexSequenceHelper + : public IndexSequenceHelper {}; + +template +struct IndexSequenceHelper<0U, Next...> { + using type = IndexSequence; +}; + +template +using make_index_sequence = typename IndexSequenceHelper::type; + +template +void call(Function f, Tuple t, IndexSequence) { + f(std::get(t)...); +} + +template +void call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + call(f, t, make_index_sequence{}); +} +} // namespace detail + +template +class FunctorWrapper { + public: + explicit FunctorWrapper(std::function functor, Args &&...args) + : functor_(std::move(functor)), args_(std::forward(args)...) {} + + void invoke() { detail::call(functor_, args_); } + + private: + std::function functor_; + std::tuple args_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h new file mode 100644 index 000000000..0bf5e939a --- /dev/null +++ b/src/rocksdb/include/rocksdb/io_status.h @@ -0,0 +1,244 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// An IOStatus encapsulates the result of an operation. It may indicate +// success, or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on an IOStatus without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same IOStatus must use +// external synchronization. + +#pragma once + +#include + +#include "rocksdb/slice.h" +#ifdef OS_WIN +#include +#endif +#include + +#include "status.h" + +namespace ROCKSDB_NAMESPACE { + +class IOStatus : public Status { + public: + using Code = Status::Code; + using SubCode = Status::SubCode; + + enum IOErrorScope : unsigned char { + kIOErrorScopeFileSystem, + kIOErrorScopeFile, + kIOErrorScopeRange, + kIOErrorScopeMax, + }; + + // Create a success status. + IOStatus() : IOStatus(kOk, kNone) {} + ~IOStatus() {} + + // Copy the specified status. + IOStatus(const IOStatus& s); + IOStatus& operator=(const IOStatus& s); + IOStatus(IOStatus&& s) noexcept; + IOStatus& operator=(IOStatus&& s) noexcept; + bool operator==(const IOStatus& rhs) const; + bool operator!=(const IOStatus& rhs) const; + + void SetRetryable(bool retryable) { retryable_ = retryable; } + void SetDataLoss(bool data_loss) { data_loss_ = data_loss; } + void SetScope(IOErrorScope scope) { + scope_ = static_cast(scope); + } + + bool GetRetryable() const { return retryable_; } + bool GetDataLoss() const { return data_loss_; } + IOErrorScope GetScope() const { return static_cast(scope_); } + + // Return a success status. + static IOStatus OK() { return IOStatus(); } + + static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotSupported, msg, msg2); + } + static IOStatus NotSupported(SubCode msg = kNone) { + return IOStatus(kNotSupported, msg); + } + + // Return error status of an appropriate type. + static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static IOStatus NotFound(SubCode msg = kNone) { + return IOStatus(kNotFound, msg); + } + + static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kCorruption, msg, msg2); + } + static IOStatus Corruption(SubCode msg = kNone) { + return IOStatus(kCorruption, msg); + } + + static IOStatus InvalidArgument(const Slice& msg, + const Slice& msg2 = Slice()) { + return IOStatus(kInvalidArgument, msg, msg2); + } + static IOStatus InvalidArgument(SubCode msg = kNone) { + return IOStatus(kInvalidArgument, msg); + } + + static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, msg, msg2); + } + static IOStatus IOError(SubCode msg = kNone) { + return IOStatus(kIOError, msg); + } + + static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); } + static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kBusy, msg, msg2); + } + + static IOStatus TimedOut(SubCode msg = kNone) { + return IOStatus(kTimedOut, msg); + } + static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kTimedOut, msg, msg2); + } + + static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); } + static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kNoSpace, msg, msg2); + } + + static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); } + static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kPathNotFound, msg, msg2); + } + + static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); } + static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kIOFenced, msg, msg2); + } + + static IOStatus Aborted(SubCode msg = kNone) { + return IOStatus(kAborted, msg); + } + static IOStatus Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kAborted, msg, msg2); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + // std::string ToString() const; + + private: + friend IOStatus status_to_io_status(Status&&); + + explicit IOStatus(Code _code, SubCode _subcode = kNone) + : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {} + + IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + IOStatus(Code _code, const Slice& msg, const Slice& msg2) + : IOStatus(_code, kNone, msg, msg2) {} +}; + +inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2) + : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) { + assert(code_ != kOk); + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_.reset(result); +} + +inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline IOStatus& IOStatus::operator=(const IOStatus& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + code_ = s.code_; + subcode_ = s.subcode_; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); + } + return *this; +} + +inline IOStatus::IOStatus(IOStatus&& s) noexcept : IOStatus() { + *this = std::move(s); +} + +inline IOStatus& IOStatus::operator=(IOStatus&& s) noexcept { + if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + s.scope_ = kIOErrorScopeFileSystem; + state_ = std::move(s.state_); + } + return *this; +} + +inline bool IOStatus::operator==(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return (code_ == rhs.code_); +} + +inline bool IOStatus::operator!=(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return !(*this == rhs); +} + +inline IOStatus status_to_io_status(Status&& status) { + IOStatus io_s; + Status& s = io_s; + s = std::move(status); + return io_s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h new file mode 100644 index 000000000..559d44c57 --- /dev/null +++ b/src/rocksdb/include/rocksdb/iostats_context.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include + +#include "rocksdb/perf_level.h" + +// A thread local context for gathering io-stats efficiently and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +namespace ROCKSDB_NAMESPACE { + +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. +struct FileIOByTemperature { + // the number of bytes read to Temperature::kHot file + uint64_t hot_file_bytes_read; + // the number of bytes read to Temperature::kWarm file + uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCold file + uint64_t cold_file_bytes_read; + // total number of reads to Temperature::kHot file + uint64_t hot_file_read_count; + // total number of reads to Temperature::kWarm file + uint64_t warm_file_read_count; + // total number of reads to Temperature::kCold file + uint64_t cold_file_read_count; + // reset all the statistics to 0. + void Reset() { + hot_file_bytes_read = 0; + warm_file_bytes_read = 0; + cold_file_bytes_read = 0; + hot_file_read_count = 0; + warm_file_read_count = 0; + cold_file_read_count = 0; + } +}; + +struct IOStatsContext { + // reset all io-stats counter to zero + void Reset(); + + std::string ToString(bool exclude_zero_counters = false) const; + + // the thread pool id + uint64_t thread_pool_id; + + // number of bytes that has been written. + uint64_t bytes_written; + // number of bytes that has been read. + uint64_t bytes_read; + + // time spent in open() and fopen(). + uint64_t open_nanos; + // time spent in fallocate(). + uint64_t allocate_nanos; + // time spent in write() and pwrite(). + uint64_t write_nanos; + // time spent in read() and pread() + uint64_t read_nanos; + // time spent in sync_file_range(). + uint64_t range_sync_nanos; + // time spent in fsync + uint64_t fsync_nanos; + // time spent in preparing write (fallocate etc). + uint64_t prepare_write_nanos; + // time spent in Logger::Logv(). + uint64_t logger_nanos; + // CPU time spent in write() and pwrite() + uint64_t cpu_write_nanos; + // CPU time spent in read() and pread() + uint64_t cpu_read_nanos; + + FileIOByTemperature file_io_stats_by_temperature; + + // It is not consistent that whether iostats follows PerfLevel.Timer counters + // follows it but BackupEngine relies on counter metrics to always be there. + // Here we create a backdoor option to disable some counters, so that some + // existing stats are not polluted by file operations, such as logging, by + // turning this off. + bool disable_iostats = false; +}; + +// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, +// non-thread-local IOStatsContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, a pointer to a thread-local IOStatsContext object will be +// returned. +// +// This function never returns nullptr. +IOStatsContext* get_iostats_context(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h new file mode 100644 index 000000000..9d4c9f73a --- /dev/null +++ b/src/rocksdb/include/rocksdb/iterator.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. + +#pragma once + +#include + +#include "rocksdb/cleanable.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/wide_columns.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator : public Cleanable { + public: + Iterator() {} + // No copying allowed + Iterator(const Iterator&) = delete; + void operator=(const Iterator&) = delete; + + virtual ~Iterator() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. + // Target does not contain timestamp. + virtual void Seek(const Slice& target) = 0; + + // Position at the last key in the source that at or before target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + // Target does not contain timestamp. + virtual void SeekForPrev(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of the + // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev + // operation). + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. If the entry is a plain key-value, + // return the value as-is; if it is a wide-column entity, return the value of + // the default anonymous column (see kDefaultWideColumnName) if any, or an + // empty value otherwise. The underlying storage for the returned slice is + // valid only until the next modification of the iterator (i.e. the next + // SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation). + // REQUIRES: Valid() + virtual Slice value() const = 0; + + // Return the wide columns for the current entry. If the entry is a + // wide-column entity, return it as-is; if it is a plain key-value, return it + // as an entity with a single anonymous column (see kDefaultWideColumnName) + // which contains the value. The underlying storage for the returned + // structure is valid only until the next modification of the iterator (i.e. + // the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation). + // REQUIRES: Valid() + virtual const WideColumns& columns() const { + assert(false); + return kNoWideColumns; + } + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // If supported, renew the iterator to represent the latest state. The + // iterator will be invalidated after the call. Not supported if + // ReadOptions.snapshot is given when creating the iterator. + virtual Status Refresh() { + return Status::NotSupported("Refresh() is not supported"); + } + + // Property "rocksdb.iterator.is-key-pinned": + // If returning "1", this means that the Slice returned by key() is valid + // as long as the iterator is not deleted. + // It is guaranteed to always return "1" if + // - Iterator created with ReadOptions::pin_data = true + // - DB tables were created with + // BlockBasedTableOptions::use_delta_encoding = false. + // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + // Property "rocksdb.iterator.internal-key": + // Get the user-key portion of the internal key at which the iteration + // stopped. + virtual Status GetProperty(std::string prop_name, std::string* prop); + + virtual Slice timestamp() const { + assert(false); + return Slice(); + } +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h new file mode 100644 index 000000000..7408cbc87 --- /dev/null +++ b/src/rocksdb/include/rocksdb/ldb_tool.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// An interface for converting a slice to a readable string +class SliceFormatter { + public: + virtual ~SliceFormatter() {} + virtual std::string Format(const Slice& s) const = 0; +}; + +// Options for customizing ldb tool (beyond the DB Options) +struct LDBOptions { + // Create LDBOptions with default values for all fields + LDBOptions(); + + // Key formatter that converts a slice to a readable string. + // Default: Slice::ToString() + std::shared_ptr key_formatter; + + std::string print_help_header = "ldb - RocksDB Tool"; +}; + +class LDBTool { + public: + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector* column_families = nullptr); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h new file mode 100644 index 000000000..8644fcf3f --- /dev/null +++ b/src/rocksdb/include/rocksdb/listener.h @@ -0,0 +1,847 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/advanced_options.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/io_status.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +using TablePropertiesCollection = + std::unordered_map>; + +class DB; +class ColumnFamilyHandle; +class Status; +struct CompactionJobStats; + +struct FileCreationBriefInfo { + FileCreationBriefInfo() = default; + FileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id) + : db_name(_db_name), + cf_name(_cf_name), + file_path(_file_path), + job_id(_job_id) {} + // the name of the database where the file was created. + std::string db_name; + // the name of the column family where the file was created. + std::string cf_name; + // the path to the created file. + std::string file_path; + // the id of the job (which could be flush or compaction) that + // created the file. + int job_id = 0; +}; + +struct TableFileCreationBriefInfo : public FileCreationBriefInfo { + // reason of creating the table. + TableFileCreationReason reason; +}; + +struct TableFileCreationInfo : public TableFileCreationBriefInfo { + TableFileCreationInfo() = default; + explicit TableFileCreationInfo(TableProperties&& prop) + : table_properties(prop) {} + // the size of the file. + uint64_t file_size; + // Detailed properties of the created file. + TableProperties table_properties; + // The status indicating whether the creation was successful or not. + Status status; + // The checksum of the table file being created + std::string file_checksum; + // The checksum function name of checksum generator used for this table file + std::string file_checksum_func_name; +}; + +struct BlobFileCreationBriefInfo : public FileCreationBriefInfo { + BlobFileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason) + : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id), + reason(_reason) {} + // reason of creating the blob file. + BlobFileCreationReason reason; +}; + +struct BlobFileCreationInfo : public BlobFileCreationBriefInfo { + BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + Status _status, const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id, + _reason), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + status(_status), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name) {} + + // the number of blob in a file. + uint64_t total_blob_count; + // the total bytes in a file. + uint64_t total_blob_bytes; + // The status indicating whether the creation was successful or not. + Status status; + // The checksum of the blob file being created. + std::string file_checksum; + // The checksum function name of checksum generator used for this blob file. + std::string file_checksum_func_name; +}; + +enum class CompactionReason : int { + kUnknown = 0, + // [Level] number of L0 files > level0_file_num_compaction_trigger + kLevelL0FilesNum, + // [Level] total size of level > MaxBytesForLevel() + kLevelMaxLevelSize, + // [Universal] Compacting for size amplification + kUniversalSizeAmplification, + // [Universal] Compacting for size ratio + kUniversalSizeRatio, + // [Universal] number of sorted runs > level0_file_num_compaction_trigger + kUniversalSortedRunNum, + // [FIFO] total size > max_table_files_size + kFIFOMaxSize, + // [FIFO] reduce number of files. + kFIFOReduceNumFiles, + // [FIFO] files with creation time < (current_time - interval) + kFIFOTtl, + // Manual compaction + kManualCompaction, + // DB::SuggestCompactRange() marked files for compaction + kFilesMarkedForCompaction, + // [Level] Automatic compaction within bottommost level to cleanup duplicate + // versions of same user key, usually due to a released snapshot. + kBottommostFiles, + // Compaction based on TTL + kTtl, + // According to the comments in flush_job.cc, RocksDB treats flush as + // a level 0 compaction in internal stats. + kFlush, + // Compaction caused by external sst file ingestion + kExternalSstIngestion, + // Compaction due to SST file being too old + kPeriodicCompaction, + // Compaction in order to move files to temperature + kChangeTemperature, + // Compaction scheduled to force garbage collection of blob files + kForcedBlobGC, + // A special TTL compaction for RoundRobin policy, which basically the same as + // kLevelMaxLevelSize, but the goal is to compact TTLed files. + kRoundRobinTtl, + // total number of compaction reasons, new reasons must be added above this. + kNumOfReasons, +}; + +enum class FlushReason : int { + kOthers = 0x00, + kGetLiveFiles = 0x01, + kShutDown = 0x02, + kExternalFileIngestion = 0x03, + kManualCompaction = 0x04, + kWriteBufferManager = 0x05, + kWriteBufferFull = 0x06, + kTest = 0x07, + kDeleteFiles = 0x08, + kAutoCompaction = 0x09, + kManualFlush = 0x0a, + kErrorRecovery = 0xb, + // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable + // will not be called to avoid many small immutable memtables. + kErrorRecoveryRetryFlush = 0xc, + kWalFull = 0xd, +}; + +// TODO: In the future, BackgroundErrorReason will only be used to indicate +// why the BG Error is happening (e.g., flush, compaction). We may introduce +// other data structure to indicate other essential information such as +// the file type (e.g., Manifest, SST) and special context. +enum class BackgroundErrorReason { + kFlush, + kCompaction, + kWriteCallback, + kMemTable, + kManifestWrite, + kFlushNoWAL, + kManifestWriteNoWAL, +}; + +enum class WriteStallCondition { + kNormal, + kDelayed, + kStopped, +}; + +struct WriteStallInfo { + // the name of the column family + std::string cf_name; + // state of the write controller + struct { + WriteStallCondition cur; + WriteStallCondition prev; + } condition; +}; + +#ifndef ROCKSDB_LITE + +struct FileDeletionInfo { + FileDeletionInfo() = default; + + FileDeletionInfo(const std::string& _db_name, const std::string& _file_path, + int _job_id, Status _status) + : db_name(_db_name), + file_path(_file_path), + job_id(_job_id), + status(_status) {} + // The name of the database where the file was deleted. + std::string db_name; + // The path to the deleted file. + std::string file_path; + // The id of the job which deleted the file. + int job_id = 0; + // The status indicating whether the deletion was successful or not. + Status status; +}; + +struct TableFileDeletionInfo : public FileDeletionInfo {}; + +struct BlobFileDeletionInfo : public FileDeletionInfo { + BlobFileDeletionInfo(const std::string& _db_name, + const std::string& _file_path, int _job_id, + Status _status) + : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {} +}; + +enum class FileOperationType { + kRead, + kWrite, + kTruncate, + kClose, + kFlush, + kSync, + kFsync, + kRangeSync, + kAppend, + kPositionedAppend, + kOpen +}; + +struct FileOperationInfo { + using Duration = std::chrono::nanoseconds; + using SteadyTimePoint = + std::chrono::time_point; + using SystemTimePoint = + std::chrono::time_point; + using StartTimePoint = std::pair; + using FinishTimePoint = SteadyTimePoint; + + FileOperationType type; + const std::string& path; + // Rocksdb try to provide file temperature information, but it's not + // guaranteed. + Temperature temperature; + uint64_t offset; + size_t length; + const Duration duration; + const SystemTimePoint& start_ts; + Status status; + + FileOperationInfo(const FileOperationType _type, const std::string& _path, + const StartTimePoint& _start_ts, + const FinishTimePoint& _finish_ts, const Status& _status, + const Temperature _temperature = Temperature::kUnknown) + : type(_type), + path(_path), + temperature(_temperature), + duration(std::chrono::duration_cast( + _finish_ts - _start_ts.second)), + start_ts(_start_ts.first), + status(_status) {} + static StartTimePoint StartNow() { + return std::make_pair( + std::chrono::system_clock::now(), std::chrono::steady_clock::now()); + } + static FinishTimePoint FinishNow() { + return std::chrono::steady_clock::now(); + } +}; + +struct BlobFileInfo { + BlobFileInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number) + : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {} + + std::string blob_file_path; + uint64_t blob_file_number; +}; + +struct BlobFileAdditionInfo : public BlobFileInfo { + BlobFileAdditionInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _total_blob_count, + const uint64_t _total_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes) {} + uint64_t total_blob_count; + uint64_t total_blob_bytes; +}; + +struct BlobFileGarbageInfo : public BlobFileInfo { + BlobFileGarbageInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _garbage_blob_count, + const uint64_t _garbage_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes) {} + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; +}; + +struct FlushJobInfo { + // the id of the column family + uint32_t cf_id; + // the name of the column family + std::string cf_name; + // the path to the newly created file + std::string file_path; + // the file number of the newly created file + uint64_t file_number; + // the oldest blob file referenced by the newly created file + uint64_t oldest_blob_file_number; + // the id of the thread that completed this flush job. + uint64_t thread_id; + // the job id, which is unique in the same thread. + int job_id; + // If true, then rocksdb is currently slowing-down all writes to prevent + // creating too many Level 0 files as compaction seems not able to + // catch up the write request speed. This indicates that there are + // too many files in Level 0. + bool triggered_writes_slowdown; + // If true, then rocksdb is currently blocking any writes to prevent + // creating more L0 files. This indicates that there are too many + // files in level 0. Compactions should try to compact L0 files down + // to lower levels as soon as possible. + bool triggered_writes_stop; + // The smallest sequence number in the newly created file + SequenceNumber smallest_seqno; + // The largest sequence number in the newly created file + SequenceNumber largest_seqno; + // Table properties of the table being flushed + TableProperties table_properties; + + FlushReason flush_reason; + + // Compression algorithm used for blob output files + CompressionType blob_compression_type; + + // Information about blob files created during flush in Integrated BlobDB. + std::vector blob_file_addition_infos; +}; + +struct CompactionFileInfo { + // The level of the file. + int level; + + // The file number of the file. + uint64_t file_number; + + // The file number of the oldest blob file this SST file references. + uint64_t oldest_blob_file_number; +}; + +struct SubcompactionJobInfo { + ~SubcompactionJobInfo() { status.PermitUncheckedError(); } + // the id of the column family where the compaction happened. + uint32_t cf_id; + // the name of the column family where the compaction happened. + std::string cf_name; + // the status indicating whether the compaction was successful or not. + Status status; + // the id of the thread that completed this compaction job. + uint64_t thread_id; + // the job id, which is unique in the same thread. + int job_id; + + // sub-compaction job id, which is only unique within the same compaction, so + // use both 'job_id' and 'subcompaction_job_id' to identify a subcompaction + // within an instance. + // For non subcompaction job, it's set to -1. + int subcompaction_job_id; + // the smallest input level of the compaction. + int base_input_level; + // the output level of the compaction. + int output_level; + + // Reason to run the compaction + CompactionReason compaction_reason; + + // Compression algorithm used for output files + CompressionType compression; + + // Statistics and other additional details on the compaction + CompactionJobStats stats; + + // Compression algorithm used for blob output files. + CompressionType blob_compression_type; +}; + +struct CompactionJobInfo { + ~CompactionJobInfo() { status.PermitUncheckedError(); } + // the id of the column family where the compaction happened. + uint32_t cf_id; + // the name of the column family where the compaction happened. + std::string cf_name; + // the status indicating whether the compaction was successful or not. + Status status; + // the id of the thread that completed this compaction job. + uint64_t thread_id; + // the job id, which is unique in the same thread. + int job_id; + + // the smallest input level of the compaction. + int base_input_level; + // the output level of the compaction. + int output_level; + + // The following variables contain information about compaction inputs + // and outputs. A file may appear in both the input and output lists + // if it was simply moved to a different level. The order of elements + // is the same across input_files and input_file_infos; similarly, it is + // the same across output_files and output_file_infos. + + // The names of the compaction input files. + std::vector input_files; + + // Additional information about the compaction input files. + std::vector input_file_infos; + + // The names of the compaction output files. + std::vector output_files; + + // Additional information about the compaction output files. + std::vector output_file_infos; + + // Table properties for input and output tables. + // The map is keyed by values from input_files and output_files. + TablePropertiesCollection table_properties; + + // Reason to run the compaction + CompactionReason compaction_reason; + + // Compression algorithm used for output files + CompressionType compression; + + // Statistics and other additional details on the compaction + CompactionJobStats stats; + + // Compression algorithm used for blob output files. + CompressionType blob_compression_type; + + // Information about blob files created during compaction in Integrated + // BlobDB. + std::vector blob_file_addition_infos; + + // Information about blob files deleted during compaction in Integrated + // BlobDB. + std::vector blob_file_garbage_infos; +}; + +struct MemTableInfo { + // the name of the column family to which memtable belongs + std::string cf_name; + // Sequence number of the first element that was inserted + // into the memtable. + SequenceNumber first_seqno; + // Sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + SequenceNumber earliest_seqno; + // Total number of entries in memtable + uint64_t num_entries; + // Total number of deletes in memtable + uint64_t num_deletes; +}; + +struct ExternalFileIngestionInfo { + // the name of the column family + std::string cf_name; + // Path of the file outside the DB + std::string external_file_path; + // Path of the file inside the DB + std::string internal_file_path; + // The global sequence number assigned to keys in this file + SequenceNumber global_seqno; + // Table properties of the table being flushed + TableProperties table_properties; +}; + +// Result of auto background error recovery +struct BackgroundErrorRecoveryInfo { + // The original error that triggered the recovery + Status old_bg_error; + + // The final bg_error after all recovery attempts. Status::OK() means + // the recovery was successful and the database is fully operational. + Status new_bg_error; +}; + +struct IOErrorInfo { + IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation, + const std::string& _file_path, size_t _length, uint64_t _offset) + : io_status(_io_status), + operation(_operation), + file_path(_file_path), + length(_length), + offset(_offset) {} + + IOStatus io_status; + FileOperationType operation; + std::string file_path; + size_t length; + uint64_t offset; +}; + +// EventListener class contains a set of callback functions that will +// be called when specific RocksDB event happens such as flush. It can +// be used as a building block for developing custom features such as +// stats-collector or external compaction algorithm. +// +// IMPORTANT +// Because compaction is needed to resolve a "writes stopped" condition, +// calling or waiting for any blocking DB write function (no_slowdown=false) +// from a compaction-related listener callback can hang RocksDB. For DB +// writes from a callback we recommend a WriteBatch and no_slowdown=true, +// because the WriteBatch can accumulate writes for later in case DB::Write +// returns Status::Incomplete. Similarly, calling CompactRange or similar +// could hang by waiting for a background worker that is occupied until the +// callback returns. +// +// Otherwise, callback functions should not run for an extended period of +// time before the function returns, because this will slow RocksDB. +// +// [Threading] All EventListener callback will be called using the +// actual thread that involves in that specific event. For example, it +// is the RocksDB background flush thread that does the actual flush to +// call EventListener::OnFlushCompleted(). +// +// [Locking] All EventListener callbacks are designed to be called without +// the current thread holding any DB mutex. This is to prevent potential +// deadlock and performance issue when using EventListener callback +// in a complex way. +// +// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into +// RocksDB, because RocksDB is not exception-safe. This could cause undefined +// behavior including data loss, unreported corruption, deadlocks, and more. +class EventListener : public Customizable { + public: + static const char* Type() { return "EventListener"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& id, + std::shared_ptr* result); + const char* Name() const override { + // Since EventListeners did not have a name previously, we will assume + // an empty name. Instances should override this method. + return ""; + } + // A callback function to RocksDB which will be called whenever a + // registered RocksDB flushes a file. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnFlushCompleted(DB* /*db*/, + const FlushJobInfo& /*flush_job_info*/) {} + + // A callback function to RocksDB which will be called before a + // RocksDB starts to flush memtables. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnFlushBegin(DB* /*db*/, + const FlushJobInfo& /*flush_job_info*/) {} + + // A callback function for RocksDB which will be called whenever + // a SST file is deleted. Different from OnCompactionCompleted and + // OnFlushCompleted, this callback is designed for external logging + // service and thus only provide string parameters instead + // of a pointer to DB. Applications that build logic basic based + // on file creations and deletions is suggested to implement + // OnFlushCompleted and OnCompactionCompleted. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from the + // returned value. + virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {} + + // A callback function to RocksDB which will be called before a + // RocksDB starts to compact. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {} + + // A callback function for RocksDB which will be called whenever + // a registered RocksDB compacts a file. The default implementation + // is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just compacted + // a file. + // @param ci a reference to a CompactionJobInfo struct. 'ci' is released + // after this function is returned, and must be copied if it is needed + // outside of this function. + virtual void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*ci*/) {} + + // A callback function to RocksDB which will be called before a sub-compaction + // begins. If a compaction is split to 2 sub-compactions, it will trigger one + // `OnCompactionBegin()` first, then two `OnSubcompactionBegin()`. + // If compaction is not split, it will still trigger one + // `OnSubcompactionBegin()`, as internally, compaction is always handled by + // sub-compaction. The default implementation is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param ci a reference to a CompactionJobInfo struct, it contains a + // `sub_job_id` which is only unique within the specified compaction (which + // can be identified by `job_id`). 'ci' is released after this function is + // returned, and must be copied if it's needed outside this function. + // Note: `table_properties` is not set for sub-compaction, the information + // could be got from `OnCompactionBegin()`. + virtual void OnSubcompactionBegin(const SubcompactionJobInfo& /*si*/) {} + + // A callback function to RocksDB which will be called whenever a + // sub-compaction completed. The same as `OnSubcompactionBegin()`, if a + // compaction is split to 2 sub-compactions, it will be triggered twice. If + // a compaction is not split, it will still be triggered once. + // The default implementation is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param ci a reference to a CompactionJobInfo struct, it contains a + // `sub_job_id` which is only unique within the specified compaction (which + // can be identified by `job_id`). 'ci' is released after this function is + // returned, and must be copied if it's needed outside this function. + // Note: `table_properties` is not set for sub-compaction, the information + // could be got from `OnCompactionCompleted()`. + virtual void OnSubcompactionCompleted(const SubcompactionJobInfo& /*si*/) {} + + // A callback function for RocksDB which will be called whenever + // a SST file is created. Different from OnCompactionCompleted and + // OnFlushCompleted, this callback is designed for external logging + // service and thus only provide string parameters instead + // of a pointer to DB. Applications that build logic basic based + // on file creations and deletions is suggested to implement + // OnFlushCompleted and OnCompactionCompleted. + // + // Historically it will only be called if the file is successfully created. + // Now it will also be called on failure case. User can check info.status + // to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a SST file is being created. It will follow by OnTableFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a memtable is made immutable. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a column family handle is deleted. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // @param handle is a pointer to the column family handle to be deleted + // which will become a dangling pointer after the deletion. + virtual void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* /*handle*/) {} + + // A callback function for RocksDB which will be called after an external + // file is ingested using IngestExternalFile. + // + // Note that the this function will run on the same thread as + // IngestExternalFile(), if this function is blocked, IngestExternalFile() + // will be blocked from finishing. + virtual void OnExternalFileIngested( + DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before setting the + // background error status to a non-OK value. The new background error status + // is provided in `bg_error` and can be modified by the callback. E.g., a + // callback can suppress errors by resetting it to Status::OK(), thus + // preventing the database from entering read-only mode. We do not provide any + // guarantee when failed flushes/compactions will be rescheduled if the user + // suppresses an error. + // + // Note that this function can run on the same threads as flush, compaction, + // and user writes. So, it is extremely important not to perform heavy + // computations or blocking calls in this function. + virtual void OnBackgroundError(BackgroundErrorReason /* reason */, + Status* /* bg_error */) {} + + // A callback function for RocksDB which will be called whenever a change + // of superversion triggers a change of the stall conditions. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever a file read + // operation finishes. + virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file write + // operation finishes. + virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file flush + // operation finishes. + virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file sync + // operation finishes. + virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // rangeSync operation finishes. + virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // truncate operation finishes. + virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file close + // operation finishes. + virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {} + + // If true, the OnFile*Finish functions will be called. If + // false, then they won't be called. + virtual bool ShouldBeNotifiedOnFileIO() { return false; } + + // A callback function for RocksDB which will be called just before + // starting the automatic recovery process for recoverable background + // errors, such as NoSpace(). The callback can suppress the automatic + // recovery by setting *auto_recovery to false. The database will then + // have to be transitioned out of read-only mode by calling DB::Resume() + virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */, + Status /* bg_error */, + bool* /* auto_recovery */) {} + + // DEPRECATED + // A callback function for RocksDB which will be called once the database + // is recovered from read-only mode after an error. When this is called, it + // means normal writes to the database can be issued and the user can + // initiate any further recovery actions needed + virtual void OnErrorRecoveryCompleted(Status old_bg_error) { + old_bg_error.PermitUncheckedError(); + } + + // A callback function for RocksDB which will be called once the recovery + // attempt from a background retryable error is completed. The recovery + // may have been successful or not. In either case, the callback is called + // with the old and new error. If info.new_bg_error is Status::OK(), that + // means the recovery succeeded. + virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) { + } + + // A callback function for RocksDB which will be called before + // a blob file is being created. It will follow by OnBlobFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreationStarted( + const BlobFileCreationBriefInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is created. + // It will be called whether the file is successfully created or not. User can + // check info.status to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is deleted. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever an IO error + // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback. + virtual void OnIOError(const IOErrorInfo& /*info*/) {} + + ~EventListener() override {} +}; + +#else + +class EventListener {}; +struct FlushJobInfo {}; + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h new file mode 100644 index 000000000..5cb799e42 --- /dev/null +++ b/src/rocksdb/include/rocksdb/memory_allocator.h @@ -0,0 +1,81 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// MemoryAllocator is an interface that a client can implement to supply custom +// memory allocation and deallocation methods. See rocksdb/cache.h for more +// information. +// All methods should be thread-safe. +class MemoryAllocator : public Customizable { + public: + static const char* Type() { return "MemoryAllocator"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + + // Allocate a block of at least size. Has to be thread-safe. + virtual void* Allocate(size_t size) = 0; + + // Deallocate previously allocated block. Has to be thread-safe. + virtual void Deallocate(void* p) = 0; + + // Returns the memory size of the block allocated at p. The default + // implementation that just returns the original allocation_size is fine. + virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const { + // default implementation just returns the allocation size + return allocation_size; + } + + std::string GetId() const override { return GenerateIndividualId(); } +}; + +struct JemallocAllocatorOptions { + static const char* kName() { return "JemallocAllocatorOptions"; } + // Jemalloc tcache cache allocations by size class. For each size class, + // it caches between 20 (for large size classes) to 200 (for small size + // classes). To reduce tcache memory usage in case the allocator is access + // by large number of threads, we can control whether to cache an allocation + // by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size/4. + size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generate memory allocator which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvise to exclude cache items from core dump. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dump. +// +// Implementation details: +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduction of jemalloc +// metadata for some workloads. +// +// To mitigate mutex contention for using one single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. +extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr* memory_allocator); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h new file mode 100644 index 000000000..cb5444dca --- /dev/null +++ b/src/rocksdb/include/rocksdb/memtablerep.h @@ -0,0 +1,423 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains the interface that must be implemented by any collection +// to be used as the backing store for a MemTable. Such a collection must +// satisfy the following properties: +// (1) It does not store duplicate items. +// (2) It uses MemTableRep::KeyComparator to compare items for iteration and +// equality. +// (3) It can be accessed concurrently by multiple readers and can support +// during reads. However, it needn't support multiple concurrent writes. +// (4) Items are never deleted. +// The liberal use of assertions is encouraged to enforce (1). +// +// The factory will be passed an MemTableAllocator object when a new MemTableRep +// is requested. +// +// Users can implement their own memtable representations. We include three +// types built in: +// - SkipListRep: This is the default; it is backed by a skip list. +// - HashSkipListRep: The memtable rep that is best used for keys that are +// structured like "prefix:suffix" where iteration within a prefix is +// common and iteration across different prefixes is rare. It is backed by +// a hash map where each bucket is a skip list. +// - VectorRep: This is backed by an unordered std::vector. On iteration, the +// vector is sorted. It is intelligent about sorting; once the MarkReadOnly() +// has been called, the vector will only be sorted once. It is optimized for +// random-write-heavy workloads. +// +// The last four implementations are designed for situations in which +// iteration over the entire collection is rare since doing so requires all the +// keys to be copied into a sorted data structure. + +#pragma once + +#include +#include + +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class Allocator; +class LookupKey; +class SliceTransform; +class Logger; +struct DBOptions; + +using KeyHandle = void*; + +extern Slice GetLengthPrefixedSlice(const char* data); + +class MemTableRep { + public: + // KeyComparator provides a means to compare keys, which are internal keys + // concatenated with values. + class KeyComparator { + public: + using DecodedType = ROCKSDB_NAMESPACE::Slice; + + virtual DecodedType decode_key(const char* key) const { + // The format of key is frozen and can be treated as a part of the API + // contract. Refer to MemTable::Add for details. + return GetLengthPrefixedSlice(key); + } + + // Compare a and b. Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; + + virtual ~KeyComparator() {} + }; + + explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} + + // Allocate a buf of len size for storing key. The idea is that a + // specific memtable representation knows its underlying data structure + // better. By allowing it to allocate memory, it can possibly put + // correlated stuff in consecutive memory area to make processor + // prefetching more efficient. + virtual KeyHandle Allocate(const size_t len, char** buf); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert). + // REQUIRES: nothing that compares equal to key is currently in the + // collection, and no concurrent modifications to the table in progress + virtual void Insert(KeyHandle handle) = 0; + + // Same as ::Insert + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the already exists. + virtual bool InsertKey(KeyHandle handle) { + Insert(handle); + return true; + } + + // Same as Insert(), but in additional pass a hint to insert location for + // the key. If hint points to nullptr, a new hint will be populated. + // otherwise the hint will be updated to reflect the last insert location. + // + // Currently only skip-list based memtable implement the interface. Other + // implementations will fallback to Insert() by default. + virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + Insert(handle); + } + + // Same as ::InsertWithHint + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the already exists. + virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { + InsertWithHint(handle, hint); + return true; + } + + // Same as ::InsertWithHint, but allow concurrent write + // + // If hint points to nullptr, a new hint will be allocated on heap, otherwise + // the hint will be updated to reflect the last insert location. The hint is + // owned by the caller and it is the caller's responsibility to delete the + // hint later. + // + // Currently only skip-list based memtable implement the interface. Other + // implementations will fallback to InsertConcurrently() by default. + virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + InsertConcurrently(handle); + } + + // Same as ::InsertWithHintConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the already exists. + virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) { + InsertWithHintConcurrently(handle, hint); + return true; + } + + // Like Insert(handle), but may be called concurrent with other calls + // to InsertConcurrently for other handles. + // + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the already exists. + virtual void InsertConcurrently(KeyHandle handle); + + // Same as ::InsertConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the already exists. + virtual bool InsertKeyConcurrently(KeyHandle handle) { + InsertConcurrently(handle); + return true; + } + + // Returns true iff an entry that compares equal to key is in the collection. + virtual bool Contains(const char* key) const = 0; + + // Notify this table rep that it will no longer be added to. By default, + // does nothing. After MarkReadOnly() is called, this table rep will + // not be written to (ie No more calls to Allocate(), Insert(), + // or any writes done directly to entries accessed through the iterator.) + virtual void MarkReadOnly() {} + + // Notify this table rep that it has been flushed to stable storage. + // By default, does nothing. + // + // Invariant: MarkReadOnly() is called, before MarkFlushed(). + // Note that this method if overridden, should not run for an extended period + // of time. Otherwise, RocksDB may be blocked. + virtual void MarkFlushed() {} + + // Look up key from the mem table, since the first key in the mem table whose + // user_key matches the one given k, call the function callback_func(), with + // callback_args directly forwarded as the first parameter, and the mem table + // key as the second parameter. If the return value is false, then terminates. + // Otherwise, go through the next key. + // + // It's safe for Get() to terminate after having finished all the potential + // key for the k.user_key(), or not. + // + // Default: + // Get() function with a default value of dynamically construct an iterator, + // seek and call the call back function. + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)); + + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, + const Slice& /*end_key*/) { + return 0; + } + + // Returns a vector of unique random memtable entries of approximate + // size 'target_sample_size' (this size is not strictly enforced). + virtual void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) { + (void)num_entries; + (void)target_sample_size; + (void)entries; + assert(false); + } + + // Report an approximation of how much memory has been used other than memory + // that was allocated through the allocator. Safe to call from any thread. + virtual size_t ApproximateMemoryUsage() = 0; + + virtual ~MemTableRep() {} + + // Iteration over the contents of a skip collection + class Iterator { + public: + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() {} + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const = 0; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const = 0; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + + // retreat to the first entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) = 0; + + virtual void RandomSeek() {} + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() = 0; + }; + + // Return an iterator over the keys in this representation. + // arena: If not null, the arena needs to be used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + virtual Iterator* GetIterator(Arena* arena = nullptr) = 0; + + // Return an iterator that has a special Seek semantics. The result of + // a Seek might only include keys with the same prefix as the target key. + // arena: If not null, the arena is used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) { + return GetIterator(arena); + } + + // Return true if the current MemTableRep supports merge operator. + // Default: true + virtual bool IsMergeOperatorSupported() const { return true; } + + // Return true if the current MemTableRep supports snapshot + // Default: true + virtual bool IsSnapshotSupported() const { return true; } + + protected: + // When *key is an internal key concatenated with the value, returns the + // user key. + virtual Slice UserKey(const char* key) const; + + Allocator* allocator_; +}; + +// This is the base class for all factories that are used by RocksDB to create +// new MemTableRep objects +class MemTableRepFactory : public Customizable { + public: + ~MemTableRepFactory() override {} + + static const char* Type() { return "MemTableRepFactory"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::unique_ptr* factory); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* factory); + + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) = 0; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& key_cmp, Allocator* allocator, + const SliceTransform* slice_transform, Logger* logger, + uint32_t /* column_family_id */) { + return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); + } + + const char* Name() const override = 0; + + // Return true if the current MemTableRep supports concurrent inserts + // Default: false + virtual bool IsInsertConcurrentlySupported() const { return false; } + + // Return true if the current MemTableRep supports detecting duplicate + // at insertion time. If true, then MemTableRep::Insert* returns + // false when if the already exists. + // Default: false + virtual bool CanHandleDuplicatedKey() const { return false; } +}; + +// This uses a skip list to store keys. It is the default. +// +// Parameters: +// lookahead: If non-zero, each iterator's seek operation will start the +// search from the previously visited record (doing at most 'lookahead' +// steps). This is an optimization for the access pattern including many +// seeks with consecutive keys. +class SkipListFactory : public MemTableRepFactory { + public: + explicit SkipListFactory(size_t lookahead = 0); + + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "SkipListFactory"; } + static const char* kNickName() { return "skip_list"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + std::string GetId() const override; + + // Methods for MemTableRepFactory class overrides + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) override; + + bool IsInsertConcurrentlySupported() const override { return true; } + + bool CanHandleDuplicatedKey() const override { return true; } + + private: + size_t lookahead_; +}; + +#ifndef ROCKSDB_LITE +// This creates MemTableReps that are backed by an std::vector. On iteration, +// the vector is sorted. This is useful for workloads where iteration is very +// rare and writes are generally not issued after reads begin. +// +// Parameters: +// count: Passed to the constructor of the underlying std::vector of each +// VectorRep. On initialization, the underlying array will be at least count +// bytes reserved for usage. +class VectorRepFactory : public MemTableRepFactory { + size_t count_; + + public: + explicit VectorRepFactory(size_t count = 0); + + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "VectorRepFactory"; } + static const char* kNickName() { return "vector"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + // Methods for MemTableRepFactory class overrides + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) override; +}; + +// This class contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). +// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist +extern MemTableRepFactory* NewHashSkipListRepFactory( + size_t bucket_count = 1000000, int32_t skiplist_height = 4, + int32_t skiplist_branching_factor = 4); + +// The factory is to create memtables based on a hash table: +// it contains a fixed array of buckets, each pointing to either a linked list +// or a skip list if number of entries inside the bucket exceeds +// threshold_use_skiplist. +// @bucket_count: number of fixed array buckets +// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. +// Otherwise from huge page TLB. The user needs to reserve +// huge pages for it to be allocated, like: +// sysctl -w vm.nr_hugepages=20 +// See linux doc Documentation/vm/hugetlbpage.txt +// @bucket_entries_logging_threshold: if number of entries in one bucket +// exceeds this number, log about it. +// @if_log_bucket_dist_when_flash: if true, log distribution of number of +// entries when flushing. +// @threshold_use_skiplist: a bucket switches to skip list if number of +// entries exceed this parameter. +extern MemTableRepFactory* NewHashLinkListRepFactory( + size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, + int bucket_entries_logging_threshold = 4096, + bool if_log_bucket_dist_when_flash = true, + uint32_t threshold_use_skiplist = 256); + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h new file mode 100644 index 000000000..ae795220b --- /dev/null +++ b/src/rocksdb/include/rocksdb/merge_operator.h @@ -0,0 +1,265 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Logger; + +// The Merge Operator +// +// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only +// client knows. It could be numeric addition, list append, string +// concatenation, edit data structure, ... , anything. +// The library, on the other hand, is concerned with the exercise of this +// interface, at the right time (during get, iteration, compaction...) +// +// To use merge, the client needs to provide an object implementing one of +// the following interfaces: +// a) AssociativeMergeOperator - for most simple semantics (always take +// two values, and merge them into one value, which is then put back +// into rocksdb); numeric addition and string concatenation are examples; +// +// b) MergeOperator - the generic class for all the more abstract / complex +// operations; one method (FullMergeV2) to merge a Put/Delete value with a +// merge operand; and another method (PartialMerge) that merges multiple +// operands together. this is especially useful if your key values have +// complex structures but you would still like to support client-specific +// incremental updates. +// +// AssociativeMergeOperator is simpler to implement. MergeOperator is simply +// more powerful. +// +// Refer to rocksdb-merge wiki for more details and example implementations. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class MergeOperator : public Customizable { + public: + virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + std::shared_ptr* result); + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // Client could multiplex the merge operator based on it + // if the key space is partitioned and different subspaces + // refer to different types of data which have different + // merge operation semantics + // existing: (IN) null indicates that the key does not exist before this op + // operand_list:(IN) the sequence of merge operations to apply, front() first. + // new_value:(OUT) Client is responsible for filling the merge result here. + // The string that new_value is pointing to will be empty. + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. This will be treated as an error by the library. + // + // Also make use of the *logger for error messages. + virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + // deprecated, please use FullMergeV2() + assert(false); + return false; + } + + struct MergeOperationInput { + // If user-defined timestamp is enabled, `_key` includes timestamp. + explicit MergeOperationInput(const Slice& _key, + const Slice* _existing_value, + const std::vector& _operand_list, + Logger* _logger) + : key(_key), + existing_value(_existing_value), + operand_list(_operand_list), + logger(_logger) {} + + // The key associated with the merge operation. + const Slice& key; + // The existing value of the current key, nullptr means that the + // value doesn't exist. + const Slice* existing_value; + // A list of operands to apply. + const std::vector& operand_list; + // Logger could be used by client to log any errors that happen during + // the merge operation. + Logger* logger; + }; + + struct MergeOperationOutput { + explicit MergeOperationOutput(std::string& _new_value, + Slice& _existing_operand) + : new_value(_new_value), existing_operand(_existing_operand) {} + + // Client is responsible for filling the merge result here. + std::string& new_value; + // If the merge result is one of the existing operands (or existing_value), + // client can set this field to the operand (or existing_value) instead of + // using new_value. + Slice& existing_operand; + }; + + // This function applies a stack of merge operands in chronological order + // on top of an existing value. There are two ways in which this method is + // being used: + // a) During Get() operation, it used to calculate the final value of a key + // b) During compaction, in order to collapse some operands with the based + // value. + // + // Note: The name of the method is somewhat misleading, as both in the cases + // of Get() or compaction it may be called on a subset of operands: + // K: 0 +1 +2 +7 +4 +5 2 +1 +2 + // ^ + // | + // snapshot + // In the example above, Get(K) operation will call FullMerge with a base + // value of 2 and operands [+1, +2]. Compaction process might decide to + // collapse the beginning of the history up to the snapshot by performing + // full Merge with base value of 0 and operands [+1, +2, +7, +4]. + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const; + + // This function performs merge(left_op, right_op) + // when both the operands are themselves merge operation types + // that you would have passed to a DB::Merge() call in the same order + // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)). + // + // PartialMerge should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. + // *new_value should be constructed such that a call to + // DB::Merge(key, *new_value) would yield the same result as a call + // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op). + // + // The string that new_value is pointing to will be empty. + // + // The default implementation of PartialMergeMulti will use this function + // as a helper, for backward compatibility. Any successor class of + // MergeOperator should either implement PartialMerge or PartialMergeMulti, + // although implementing PartialMergeMulti is suggested as it is in general + // more effective to merge multiple operands at a time instead of two + // operands at a time. + // + // If it is impossible or infeasible to combine the two operations, + // leave new_value unchanged and return false. The library will + // internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + // + // TODO: Presently there is no way to differentiate between error/corruption + // and simply "return false". For now, the client should simply return + // false in any case it cannot perform partial-merge, regardless of reason. + // If there is corruption in the data, handle it in the FullMergeV2() function + // and return false there. The default implementation of PartialMerge will + // always return false. + virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, + const Slice& /*right_operand*/, + std::string* /*new_value*/, + Logger* /*logger*/) const { + return false; + } + + // This function performs merge when all the operands are themselves merge + // operation types that you would have passed to a DB::Merge() call in the + // same order (front() first) + // (i.e. DB::Merge(key, operand_list[0]), followed by + // DB::Merge(key, operand_list[1]), ...) + // + // PartialMergeMulti should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. *new_value should + // be constructed such that a call to DB::Merge(key, *new_value) would yield + // the same result as sequential individual calls to DB::Merge(key, operand) + // for each operand in operand_list from front() to back(). + // + // The string that new_value is pointing to will be empty. + // + // The PartialMergeMulti function will be called when there are at least two + // operands. + // + // In the default implementation, PartialMergeMulti will invoke PartialMerge + // multiple times, where each time it only merges two operands. Developers + // should either implement PartialMergeMulti, or implement PartialMerge which + // is served as the helper function of the default PartialMergeMulti. + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + // The name of the MergeOperator. Used to check for MergeOperator + // mismatches (i.e., a DB created with one MergeOperator is + // accessed using a different MergeOperator) + // TODO: the name is currently not stored persistently and thus + // no checking is enforced. Client is responsible for providing + // consistent MergeOperator between DB opens. + virtual const char* Name() const override = 0; + + // Determines whether the PartialMerge can be called with just a single + // merge operand. + // Override and return true for allowing a single operand. PartialMerge + // and PartialMergeMulti should be overridden and implemented + // correctly to properly handle a single operand. + virtual bool AllowSingleOperand() const { return false; } + + // Allows to control when to invoke a full merge during Get. + // This could be used to limit the number of merge operands that are looked at + // during a point lookup, thereby helping in limiting the number of levels to + // read from. + // Doesn't help with iterators. + // + // Note: the merge operands are passed to this function in the reversed order + // relative to how they were merged (passed to FullMerge or FullMergeV2) + // for performance reasons, see also: + // https://github.com/facebook/rocksdb/issues/3865 + virtual bool ShouldMerge(const std::vector& /*operands*/) const { + return false; + } +}; + +// The simpler, associative merge operator. +class AssociativeMergeOperator : public MergeOperator { + public: + ~AssociativeMergeOperator() override {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // existing_value:(IN) null indicates the key does not exist before this op + // value: (IN) the value to update/merge the existing_value with + // new_value: (OUT) Client is responsible for filling the merge result + // here. The string that new_value is pointing to will be empty. + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. The client should assume that this will be treated + // as an error by the library. + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const = 0; + + private: + // Default implementations of the MergeOperator functions + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& key, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* logger) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h new file mode 100644 index 000000000..0cdffcd5f --- /dev/null +++ b/src/rocksdb/include/rocksdb/metadata.h @@ -0,0 +1,245 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// Basic identifiers and metadata for a file in a DB. This only includes +// information considered relevant for taking backups, checkpoints, or other +// services relating to DB file storage. +// This is only appropriate for immutable files, such as SST files or all +// files in a backup. See also LiveFileStorageInfo. +struct FileStorageInfo { + // The name of the file within its directory (e.g. "123456.sst") + std::string relative_filename; + // The directory containing the file, without a trailing '/'. This could be + // a DB path, wal_dir, etc. + std::string directory; + + // The id of the file within a single DB. Set to 0 if the file does not have + // a number (e.g. CURRENT) + uint64_t file_number = 0; + // The type of the file as part of a DB. + FileType file_type = kTempFile; + + // File size in bytes. See also `trim_to_size`. + uint64_t size = 0; + + // This feature is experimental and subject to change. + Temperature temperature = Temperature::kUnknown; + + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". + std::string file_checksum_func_name; +}; + +// Adds to FileStorageInfo the ability to capture the state of files that +// might change in a running DB. +struct LiveFileStorageInfo : public FileStorageInfo { + // If non-empty, this string represents the "saved" contents of the file + // for the current context. (This field is used for checkpointing CURRENT + // file.) In that case, size == replacement_contents.size() and file on disk + // should be ignored. If empty string, the file on disk should still have + // "saved" contents. (See trim_to_size.) + std::string replacement_contents; + + // If true, the file on disk is allowed to be larger than `size` but only + // the first `size` bytes should be used for the current context. If false, + // the file is corrupt if size on disk does not equal `size`. + bool trim_to_size = false; +}; + +// The metadata that describes an SST file. (Does not need to extend +// LiveFileStorageInfo because SST files are always immutable.) +struct SstFileMetaData : public FileStorageInfo { + SstFileMetaData() { file_type = kTableFile; } + + SstFileMetaData(const std::string& _file_name, uint64_t _file_number, + const std::string& _directory, uint64_t _size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, uint64_t _num_reads_sampled, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + std::string& _file_checksum, + std::string& _file_checksum_func_name) + : smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno), + smallestkey(_smallestkey), + largestkey(_largestkey), + num_reads_sampled(_num_reads_sampled), + being_compacted(_being_compacted), + num_entries(0), + num_deletions(0), + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) { + if (!_file_name.empty()) { + if (_file_name[0] == '/') { + relative_filename = _file_name.substr(1); + name = _file_name; // Deprecated field + } else { + relative_filename = _file_name; + name = std::string("/") + _file_name; // Deprecated field + } + assert(relative_filename.size() + 1 == name.size()); + assert(relative_filename[0] != '/'); + assert(name[0] == '/'); + } + directory = _directory; + db_path = _directory; // Deprecated field + file_number = _file_number; + file_type = kTableFile; + size = _size; + temperature = _temperature; + file_checksum = _file_checksum; + file_checksum_func_name = _file_checksum_func_name; + } + + SequenceNumber smallest_seqno = 0; // Smallest sequence number in file. + SequenceNumber largest_seqno = 0; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled = 0; // How many times the file is read. + bool being_compacted = + false; // true if the file is currently being compacted. + + uint64_t num_entries = 0; + uint64_t num_deletions = 0; + + uint64_t oldest_blob_file_number = 0; // The id of the oldest blob file + // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. The creation time of the + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). + // 0 if the information is not available. + // + // Note: for TTL blob files, it contains the start of the expiration range. + uint64_t oldest_ancester_time = 0; + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. + uint64_t file_creation_time = 0; + + // DEPRECATED: The name of the file within its directory with a + // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct + // instead. + std::string name; + + // DEPRECATED: replaced by `directory` in base struct + std::string db_path; +}; + +// The full set of metadata associated with each SST file. +struct LiveFileMetaData : SstFileMetaData { + std::string column_family_name; // Name of the column family + int level; // Level at which this file resides. + LiveFileMetaData() : column_family_name(), level(0) {} +}; + +// The MetaData that describes a Blob file +struct BlobMetaData { + BlobMetaData() + : blob_file_number(0), + blob_file_size(0), + total_blob_count(0), + total_blob_bytes(0), + garbage_blob_count(0), + garbage_blob_bytes(0) {} + + BlobMetaData(uint64_t _file_number, const std::string& _file_name, + const std::string& _file_path, uint64_t _file_size, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes, + const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : blob_file_number(_file_number), + blob_file_name(_file_name), + blob_file_path(_file_path), + blob_file_size(_file_size), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes), + checksum_method(_file_checksum), + checksum_value(_file_checksum_func_name) {} + uint64_t blob_file_number; + std::string blob_file_name; + std::string blob_file_path; + uint64_t blob_file_size; + uint64_t total_blob_count; + uint64_t total_blob_bytes; + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; + std::string checksum_method; + std::string checksum_value; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int _level, uint64_t _size, + const std::vector&& _files) + : level(_level), size(_size), files(_files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector files; +}; + +// The metadata that describes a column family. +struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), file_count(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector&& _levels) + : size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector levels; + + // The total size of all blob files + uint64_t blob_file_size = 0; + // The number of blob files in this column family. + size_t blob_file_count = 0; + // The metadata of the blobs in this column family. + std::vector blob_files; +}; + +// Metadata returned as output from ExportColumnFamily() and used as input to +// CreateColumnFamiliesWithImport(). +struct ExportImportFilesMetaData { + std::string db_comparator_name; // Used to safety check at import. + std::vector files; // Vector of file metadata. +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h new file mode 100644 index 000000000..7a4d8b5a6 --- /dev/null +++ b/src/rocksdb/include/rocksdb/options.h @@ -0,0 +1,2113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include "rocksdb/advanced_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/listener.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" + +#ifdef max +#undef max +#endif + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class CompactionFilter; +class CompactionFilterFactory; +class Comparator; +class ConcurrentTaskLimiter; +class Env; +enum InfoLogLevel : unsigned char; +class SstFileManager; +class FilterPolicy; +class Logger; +class MergeOperator; +class Snapshot; +class MemTableRepFactory; +class RateLimiter; +class Slice; +class Statistics; +class InternalKeyComparator; +class WalFilter; +class FileSystem; + +struct Options; +struct DbPath; + +using FileTypeSet = SmallEnumSet; + +struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { + // The function recovers options to a previous version. Only 4.6 or later + // versions are supported. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. + ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + // Some functions that make it easier to optimize RocksDB + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + // An optional cache object is passed in to be used as the block cache + ColumnFamilyOptions* OptimizeForSmallDb( + std::shared_ptr* cache = nullptr); + + // Use this if you don't need to keep the data sorted, i.e. you'll never use + // an iterator, only Put() and Get() API calls + // + // Not supported in ROCKSDB_LITE + ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb); + + // Default values for some parameters in ColumnFamilyOptions are not + // optimized for heavy workloads and big datasets, which means you might + // observe write stalls under some conditions. As a starting point for tuning + // RocksDB options, use the following two functions: + // * OptimizeLevelStyleCompaction -- optimizes level style compaction + // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction + // Universal style compaction is focused on reducing Write Amplification + // Factor for big data sets, but increases Space Amplification. You can learn + // more about the different styles here: + // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide + // Make sure to also call IncreaseParallelism(), which will provide the + // biggest performance gains. + // Note: we might use more memory than memtable_memory_budget during high + // write rate period + // + // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE + ColumnFamilyOptions* OptimizeLevelStyleCompaction( + uint64_t memtable_memory_budget = 512 * 1024 * 1024); + ColumnFamilyOptions* OptimizeUniversalStyleCompaction( + uint64_t memtable_memory_budget = 512 * 1024 * 1024); + + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator = BytewiseComparator(); + + // REQUIRES: The client must provide a merge operator if Merge operation + // needs to be accessed. Calling Merge on a DB without a merge operator + // would result in Status::NotSupported. The client must ensure that the + // merge operator supplied here has the same name and *exactly* the same + // semantics as the merge operator provided to previous open calls on + // the same DB. The only exception is reserved for upgrade, where a DB + // previously without a merge operator is introduced to Merge operation + // for the first time. It's necessary to specify a merge operator when + // opening the DB in this case. + // Default: nullptr + std::shared_ptr merge_operator = nullptr; + + // A single CompactionFilter instance to call into during compaction. + // Allows an application to modify/delete a key-value during background + // compaction. + // + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. + // compaction_filter takes precedence over compaction_filter_factory if + // client specifies both. + // + // If multithreaded compaction is being used, the supplied CompactionFilter + // instance may be used from different threads concurrently and so should be + // thread-safe. + // + // Default: nullptr + const CompactionFilter* compaction_filter = nullptr; + + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. + // + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. + // + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. + // + // Default: nullptr + std::shared_ptr compaction_filter_factory = nullptr; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to max_write_buffer_number write buffers may be held in memory + // at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Note that write_buffer_size is enforced per column family. + // See db_write_buffer_size for sharing memory across column families. + // + // Default: 64MB + // + // Dynamically changeable through SetOptions() API + size_t write_buffer_size = 64 << 20; + + // Compress blocks using the specified compression algorithm. + // + // Default: kSnappyCompression, if it's supported. If snappy is not linked + // with the library, the default is kNoCompression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + // + // If you do not set `compression_opts.level`, or set it to + // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the + // default corresponding to `compression` as follows: + // + // - kZSTD: 3 + // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1) + // - kLZ4HCCompression: 0 + // - For all others, we do not specify a compression level + // + // Dynamically changeable through SetOptions() API + CompressionType compression; + + // Compression algorithm that will be used for the bottommost level that + // contain files. The behavior for num_levels = 1 is not well defined. + // Right now, with num_levels = 1, all compaction outputs will use + // bottommost_compression and all flush outputs still use options.compression, + // but the behavior is subject to change. + // + // Default: kDisableCompressionOption (Disabled) + CompressionType bottommost_compression = kDisableCompressionOption; + + // different options for compression algorithms used by bottommost_compression + // if it is enabled. To enable it, please see the definition of + // CompressionOptions. Behavior for num_levels = 1 is the same as + // options.bottommost_compression. + CompressionOptions bottommost_compression_opts; + + // different options for compression algorithms + CompressionOptions compression_opts; + + // Number of files to trigger level-0 compaction. A value <0 means that + // level-0 compaction will not be triggered by number of files at all. + // + // Default: 4 + // + // Dynamically changeable through SetOptions() API + int level0_file_num_compaction_trigger = 4; + + // If non-nullptr, use the specified function to put keys in contiguous + // groups called "prefixes". These prefixes are used to place one + // representative entry for the group into the Bloom filter + // rather than an entry for each key (see whole_key_filtering). + // Under certain conditions, this enables optimizing some range queries + // (Iterators) in addition to some point lookups (Get/MultiGet). + // + // Together `prefix_extractor` and `comparator` must satisfy one essential + // property for valid prefix filtering of range queries: + // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and + // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3), + // Then InDomain(k2) and prefix(k2) == prefix(k1) + // + // In other words, all keys with the same prefix must be in a contiguous + // group by comparator order, and cannot be interrupted by keys with no + // prefix ("out of domain"). (This makes it valid to conclude that no + // entries within some bounds are present if the upper and lower bounds + // have a common prefix and no entries with that same prefix are present.) + // + // Some other properties are recommended but not strictly required. Under + // most sensible comparators, the following will need to hold true to + // satisfy the essential property above: + // * "Prefix is a prefix": key.starts_with(prefix(key)) + // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then + // Compare(prefix(k1), prefix(k2)) <= 0 + // + // The next two properties ensure that seeking to a prefix allows + // enumerating all entries with that prefix: + // * "Prefix starts the group": Compare(prefix(key), key) <= 0 + // * "Prefix idempotent": prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + std::shared_ptr prefix_extractor = nullptr; + + // Control maximum total data size for a level. + // max_bytes_for_level_base is the max total for level-1. + // Maximum number of bytes for level L can be calculated as + // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) + // For example, if max_bytes_for_level_base is 200MB, and if + // max_bytes_for_level_multiplier is 10, total data size for level-1 + // will be 200MB, total file size for level-2 will be 2GB, + // and total file size for level-3 will be 20GB. + // + // Default: 256MB. + // + // Dynamically changeable through SetOptions() API + uint64_t max_bytes_for_level_base = 256 * 1048576; + + // Deprecated. + uint64_t snap_refresh_nanos = 0; + + // Disable automatic compactions. Manual compactions can still + // be issued on this column family + // + // Dynamically changeable through SetOptions() API + bool disable_auto_compactions = false; + + // This is a factory that provides TableFactory objects. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. + std::shared_ptr table_factory; + + // A list of paths where SST files for this column family + // can be put into, with its target size. Similar to db_paths, + // newer data is placed into paths specified earlier in the + // vector while older data gradually moves to paths specified + // later in the vector. + // Note that, if a path is supplied to multiple column + // families, it would have files and total size from all + // the column families combined. User should provision for the + // total size(from all the column families) in such cases. + // + // If left empty, db_paths will be used. + // Default: empty + std::vector cf_paths; + + // Compaction concurrent thread limiter for the column family. + // If non-nullptr, use given concurrent thread limiter to control + // the max outstanding compaction tasks. Limiter can be shared with + // multiple column families across db instances. + // + // Default: nullptr + std::shared_ptr compaction_thread_limiter = nullptr; + + // If non-nullptr, use the specified factory for a function to determine the + // partitioning of sst files. This helps compaction to split the files + // on interesting boundaries (key prefixes) to make propagation of sst + // files less write amplifying (covering the whole key space). + // THE FEATURE IS STILL EXPERIMENTAL + // + // Default: nullptr + std::shared_ptr sst_partitioner_factory = nullptr; + + // Create ColumnFamilyOptions with default values for all fields + ColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit ColumnFamilyOptions(const Options& options); + + void Dump(Logger* log) const; +}; + +enum class WALRecoveryMode : char { + // Original levelDB recovery + // + // We tolerate the last record in any log to be incomplete due to a crash + // while writing it. Zeroed bytes from preallocation are also tolerated in the + // trailing data of any log. + // + // Use case: Applications for which updates, once applied, must not be rolled + // back even after a crash-recovery. In this recovery mode, RocksDB guarantees + // this as long as `WritableFile::Append()` writes are durable. In case the + // user needs the guarantee in more situations (e.g., when + // `WritableFile::Append()` writes to page cache, but the user desires this + // guarantee in face of power-loss crash-recovery), RocksDB offers various + // mechanisms to additionally invoke `WritableFile::Sync()` in order to + // strengthen the guarantee. + // + // This differs from `kPointInTimeRecovery` in that, in case a corruption is + // detected during recovery, this mode will refuse to open the DB. Whereas, + // `kPointInTimeRecovery` will stop recovery just before the corruption since + // that is a valid point-in-time to which to recover. + kTolerateCorruptedTailRecords = 0x00, + // Recover from clean shutdown + // We don't expect to find any corruption in the WAL + // Use case : This is ideal for unit tests and rare applications that + // can require high consistency guarantee + kAbsoluteConsistency = 0x01, + // Recover to point-in-time consistency (default) + // We stop the WAL playback on discovering WAL inconsistency + // Use case : Ideal for systems that have disk controller cache like + // hard disk, SSD without super capacitor that store related data + kPointInTimeRecovery = 0x02, + // Recovery after a disaster + // We ignore any corruption in the WAL and try to salvage as much data as + // possible + // Use case : Ideal for last ditch effort to recover data or systems that + // operate with low grade unrelated data + kSkipAnyCorruptedRecords = 0x03, +}; + +struct DbPath { + std::string path; + uint64_t target_size; // Target size of total files under the path, in byte. + + DbPath() : target_size(0) {} + DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} +}; + +extern const char* kHostnameForDbHostId; + +enum class CompactionServiceJobStatus : char { + kSuccess, + kFailure, + kUseLocal, +}; + +struct CompactionServiceJobInfo { + std::string db_name; + std::string db_id; + std::string db_session_id; + uint64_t job_id; // job_id is only unique within the current DB and session, + // restart DB will reset the job_id. `db_id` and + // `db_session_id` could help you build unique id across + // different DBs and sessions. + + Env::Priority priority; + + CompactionServiceJobInfo(std::string db_name_, std::string db_id_, + std::string db_session_id_, uint64_t job_id_, + Env::Priority priority_) + : db_name(std::move(db_name_)), + db_id(std::move(db_id_)), + db_session_id(std::move(db_session_id_)), + job_id(job_id_), + priority(priority_) {} +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionService : public Customizable { + public: + static const char* Type() { return "CompactionService"; } + + // Returns the name of this compaction service. + const char* Name() const override = 0; + + // Start the remote compaction with `compaction_service_input`, which can be + // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the + // information the user might want to know, which includes `job_id`. + virtual CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& /*info*/, + const std::string& /*compaction_service_input*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Wait for remote compaction to finish. + virtual CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& /*info*/, + std::string* /*compaction_service_result*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + ~CompactionService() override = default; +}; + +struct DBOptions { + // The function recovers options to the option as in version 4.6. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. + DBOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + // Some functions that make it easier to optimize RocksDB + + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + // An optional cache object is passed in for the memory of the + // memtable to cost to + DBOptions* OptimizeForSmallDb(std::shared_ptr* cache = nullptr); + +#ifndef ROCKSDB_LITE + // By default, RocksDB uses only one background thread for flush and + // compaction. Calling this function will set it up such that total of + // `total_threads` is used. Good value for `total_threads` is the number of + // cores. You almost definitely want to call this function if your system is + // bottlenecked by RocksDB. + DBOptions* IncreaseParallelism(int total_threads = 16); +#endif // ROCKSDB_LITE + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing = false; + + // If true, missing column families will be automatically created. + // Default: false + bool create_missing_column_families = false; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists = false; + + // If true, RocksDB will aggressively check consistency of the data. + // Also, if any of the writes to the database fails (Put, Delete, Merge, + // Write), the database will switch to read-only mode and fail all other + // Write operations. + // In most cases you want this to be set to true. + // Default: true + bool paranoid_checks = true; + + // If true, during memtable flush, RocksDB will validate total entries + // read in flush, and compare with counter inserted into it. + // The option is here to turn the feature off in case this new validation + // feature has a bug. + // Default: true + bool flush_verify_memtable_count = true; + + // If true, the log numbers and sizes of the synced WALs are tracked + // in MANIFEST. During DB recovery, if a synced WAL is missing + // from disk, or the WAL's size does not match the recorded size in + // MANIFEST, an error will be reported and the recovery will be aborted. + // + // This is one additional protection against WAL corruption besides the + // per-WAL-entry checksum. + // + // Note that this option does not work with secondary instance. + // Currently, only syncing closed WALs are tracked. Calling `DB::SyncWAL()`, + // etc. or writing with `WriteOptions::sync=true` to sync the live WAL is not + // tracked for performance/efficiency reasons. + // + // Default: false + bool track_and_verify_wals_in_manifest = false; + + // If true, verifies the SST unique id between MANIFEST and actual file + // each time an SST file is opened. This check ensures an SST file is not + // overwritten or misplaced. A corruption error will be reported if mismatch + // detected, but only when MANIFEST tracks the unique id, which starts from + // RocksDB version 7.3. Although the tracked internal unique id is related + // to the one returned by GetUniqueIdFromTableProperties, that is subject to + // change. + // NOTE: verification is currently only done on SST files using block-based + // table format. + // + // Setting to false should only be needed in case of unexpected problems. + // + // Although an early version of this option opened all SST files for + // verification on DB::Open, that is no longer guaranteed. However, as + // documented in an above option, if max_open_files is -1, DB will open all + // files on DB::Open(). + // + // Default: true + bool verify_sst_unique_id_in_manifest = true; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. In the near + // future, support for doing storage operations such as read/write files + // through env will be deprecated in favor of file_system (see below) + // Default: Env::Default() + Env* env = Env::Default(); + + // Limits internal file read/write bandwidth: + // + // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH` + // - Compaction requests read and write bandwidth at + // `Env::IOPriority::IO_LOW` + // - Reads associated with a `ReadOptions` can be charged at + // `ReadOptions::rate_limiter_priority` (see that option's API doc for usage + // and limitations). + // - Writes associated with a `WriteOptions` can be charged at + // `WriteOptions::rate_limiter_priority` (see that option's API doc for + // usage and limitations). + // + // Rate limiting is disabled if nullptr. If rate limiter is enabled, + // bytes_per_sync is set to 1MB by default. + // + // Default: nullptr + std::shared_ptr rate_limiter = nullptr; + + // Use to track SST files and control their file deletion rate. + // + // Features: + // - Throttle the deletion rate of the SST files. + // - Keep track the total size of all SST files. + // - Set a maximum allowed space limit for SST files that when reached + // the DB wont do any further flushes or compactions and will set the + // background error. + // - Can be shared between multiple dbs. + // Limitations: + // - Only track and throttle deletes of SST files in + // first db_path (db_name if db_paths is empty). + // + // Default: nullptr + std::shared_ptr sst_file_manager = nullptr; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-nullptr, or to a file stored + // in the same directory as the DB contents if info_log is nullptr. + // Default: nullptr + std::shared_ptr info_log = nullptr; + +#ifdef NDEBUG + InfoLogLevel info_log_level = INFO_LEVEL; +#else + InfoLogLevel info_log_level = DEBUG_LEVEL; +#endif // NDEBUG + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. + // + // A high value or -1 for this option can cause high memory usage. + // See BlockBasedTableOptions::cache_usage_options to constrain + // memory usage in case of block based table format. + // + // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. + int max_open_files = -1; + + // If max_open_files is -1, DB will open all files on DB::Open(). You can + // use this option to increase the number of threads used to open the files. + // Default: 16 + int max_file_opening_threads = 16; + + // Once write-ahead logs exceed this size, we will start forcing the flush of + // column families whose memtables are backed by the oldest live WAL file + // (i.e. the ones that are causing all the space amplification). If set to 0 + // (default), we will dynamically choose the WAL size limit to be + // [sum of all write_buffer_size * max_write_buffer_number] * 4 + // + // For example, with 15 column families, each with + // write_buffer_size = 128 MB + // max_write_buffer_number = 6 + // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB + // + // The RocksDB wiki has some discussion about how the WAL interacts + // with memtables and flushing of column families. + // https://github.com/facebook/rocksdb/wiki/Column-Families + // + // This option takes effect only when there are more than one column + // family as otherwise the wal size is dictated by the write_buffer_size. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + uint64_t max_total_wal_size = 0; + + // If non-null, then we should collect metrics about database operations + std::shared_ptr statistics = nullptr; + + // By default, writes to stable storage use fdatasync (on platforms + // where this function is available). If this option is true, + // fsync is used instead. + // + // fsync and fdatasync are equally safe for our purposes and fdatasync is + // faster, so it is rarely necessary to set this option. It is provided + // as a workaround for kernel/filesystem bugs, such as one that affected + // fdatasync with ext4 in kernel versions prior to 3.7. + bool use_fsync = false; + + // A list of paths where SST files can be put into, with its target size. + // Newer data is placed into paths specified earlier in the vector while + // older data gradually moves to paths specified later in the vector. + // + // For example, you have a flash device with 10GB allocated for the DB, + // as well as a hard drive of 2TB, you should config it to be: + // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] + // + // The system will try to guarantee data under each path is close to but + // not larger than the target size. But current and future file sizes used + // by determining where to place a file are based on best-effort estimation, + // which means there is a chance that the actual size under the directory + // is slightly more than target size under some workloads. User should give + // some buffer room for those cases. + // + // If none of the paths has sufficient room to place a file, the file will + // be placed to the last path anyway, despite to the target size. + // + // Placing newer data to earlier paths is also best-efforts. User should + // expect user files to be placed in higher levels in some extreme cases. + // + // If left empty, only one path will be used, which is db_name passed when + // opening the DB. + // Default: empty + std::vector db_paths; + + // This specifies the info LOG dir. + // If it is empty, the log files will be in the same dir as data. + // If it is non empty, the log files will be in the specified dir, + // and the db data dir's absolute path will be used as the log file + // name's prefix. + std::string db_log_dir = ""; + + // This specifies the absolute dir path for write-ahead logs (WAL). + // If it is empty, the log files will be in the same dir as data, + // dbname is used as the data dir by default + // If it is non empty, the log files will be in kept the specified dir. + // When destroying the db, + // all log files in wal_dir and the dir itself is deleted + std::string wal_dir = ""; + + // The periodicity when obsolete files get deleted. The default + // value is 6 hours. The files that get out of scope by compaction + // process will still get automatically delete on every compaction, + // regardless of this setting + // + // Default: 6 hours + // + // Dynamically changeable through SetDBOptions() API. + uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000; + + // Maximum number of concurrent background jobs (compactions and flushes). + // + // Default: 2 + // + // Dynamically changeable through SetDBOptions() API. + int max_background_jobs = 2; + + // DEPRECATED: RocksDB automatically decides this based on the + // value of max_background_jobs. For backwards compatibility we will set + // `max_background_jobs = max_background_compactions + max_background_flushes` + // in the case where user sets at least one of `max_background_compactions` or + // `max_background_flushes` (we replace -1 by 1 in case one option is unset). + // + // Maximum number of concurrent background compaction jobs, submitted to + // the default LOW priority thread pool. + // + // If you're increasing this, also consider increasing number of threads in + // LOW priority thread pool. For more information, see + // Env::SetBackgroundThreads + // + // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. + int max_background_compactions = -1; + + // This value represents the maximum number of threads that will + // concurrently perform a compaction job by breaking it into multiple, + // smaller ones that are run simultaneously. + // Default: 1 (i.e. no subcompactions) + // + // Dynamically changeable through SetDBOptions() API. + uint32_t max_subcompactions = 1; + + // DEPRECATED: RocksDB automatically decides this based on the + // value of max_background_jobs. For backwards compatibility we will set + // `max_background_jobs = max_background_compactions + max_background_flushes` + // in the case where user sets at least one of `max_background_compactions` or + // `max_background_flushes`. + // + // Maximum number of concurrent background memtable flush jobs, submitted by + // default to the HIGH priority thread pool. If the HIGH priority thread pool + // is configured to have zero threads, flush jobs will share the LOW priority + // thread pool with compaction jobs. + // + // It is important to use both thread pools when the same Env is shared by + // multiple db instances. Without a separate pool, long running compaction + // jobs could potentially block memtable flush jobs of other db instances, + // leading to unnecessary Put stalls. + // + // If you're increasing this, also consider increasing number of threads in + // HIGH priority thread pool. For more information, see + // Env::SetBackgroundThreads + // Default: -1 + int max_background_flushes = -1; + + // Specify the maximal size of the info log file. If the log file + // is larger than `max_log_file_size`, a new info log file will + // be created. + // If max_log_file_size == 0, all logs will be written to one + // log file. + size_t max_log_file_size = 0; + + // Time for the info log file to roll (in seconds). + // If specified with non-zero value, log file will be rolled + // if it has been active longer than `log_file_time_to_roll`. + // Default: 0 (disabled) + // Not supported in ROCKSDB_LITE mode! + size_t log_file_time_to_roll = 0; + + // Maximal info log files to be kept. + // Default: 1000 + size_t keep_log_file_num = 1000; + + // Recycle log files. + // If non-zero, we will reuse previously written log files for new + // logs, overwriting the old data. The value indicates how many + // such files we will keep around at any point in time for later + // use. This is more efficient because the blocks are already + // allocated and fdatasync does not need to update the inode after + // each write. + // Default: 0 + size_t recycle_log_file_num = 0; + + // manifest file is rolled over on reaching this limit. + // The older manifest file be deleted. + // The default value is 1GB so that the manifest file can grow, but not + // reach the limit of storage capacity. + uint64_t max_manifest_file_size = 1024 * 1024 * 1024; + + // Number of shards used for table cache. + int table_cache_numshardbits = 6; + + // The following two fields affect how archived logs will be deleted. + // 1. If both set to 0, logs will be deleted asap and will not get into + // the archive. + // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + // WAL files will be checked every 10 min and if total size is greater + // then WAL_size_limit_MB, they will be deleted starting with the + // earliest until size_limit is met. All empty files will be deleted. + // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + // WAL files will be checked every WAL_ttl_seconds / 2 and those that + // are older than WAL_ttl_seconds will be deleted. + // 4. If both are not 0, WAL files will be checked every 10 min and both + // checks will be performed with ttl being first. + uint64_t WAL_ttl_seconds = 0; + uint64_t WAL_size_limit_MB = 0; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size = 4 * 1024 * 1024; + + // Allow the OS to mmap file for reading sst tables. + // Not recommended for 32-bit OS. + // When the option is set to true and compression is disabled, the blocks + // will not be copied and will be read directly from the mmap-ed memory + // area, and the block will not be inserted into the block cache. However, + // checksums will still be checked if ReadOptions.verify_checksums is set + // to be true. It means a checksum check every time a block is read, more + // than the setup where the option is set to false and the block cache is + // used. The common use of the options is to run RocksDB on ramfs, where + // checksum verification is usually not needed. + // Default: false + bool allow_mmap_reads = false; + + // Allow the OS to mmap file for writing. + // DB::SyncWAL() only works if this is set to false. + // Default: false + bool allow_mmap_writes = false; + + // Enable direct I/O mode for read/write + // they may or may not improve performance depending on the use case + // + // Files will be opened in "direct I/O" mode + // which means that data r/w from the disk will not be cached or + // buffered. The hardware buffer of the devices may however still + // be used. Memory mapped files are not impacted by these parameters. + + // Use O_DIRECT for user and compaction reads. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool use_direct_reads = false; + + // Use O_DIRECT for writes in background flush and compactions. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool use_direct_io_for_flush_and_compaction = false; + + // If false, fallocate() calls are bypassed, which disables file + // preallocation. The file space preallocation is used to increase the file + // write/append performance. By default, RocksDB preallocates space for WAL, + // SST, Manifest files, the extra space is truncated when the file is written. + // Warning: if you're using btrfs, we would recommend setting + // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra + // allocated space cannot be freed, which could be significant if you have + // lots of files. More details about this limitation: + // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md + bool allow_fallocate = true; + + // Disable child process inherit open files. Default: true + bool is_fd_close_on_exec = true; + + // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // + // Default: 600 (10 min) + // + // Dynamically changeable through SetDBOptions() API. + unsigned int stats_dump_period_sec = 600; + + // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec + // Default: 600 + unsigned int stats_persist_period_sec = 600; + + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. User can query through `GetStatsHistory` API. + // If user attempts to create a column family with the same name on a DB + // which have previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When peristing stats to disk, the stat name will be limited at 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + + // if not zero, periodically take stats snapshots and store in memory, the + // memory size for stats snapshots is capped at stats_history_buffer_size + // Default: 1MB + size_t stats_history_buffer_size = 1024 * 1024; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open = true; + + // Amount of data to build up in memtables across all column + // families before writing to disk. + // + // This is distinct from write_buffer_size, which enforces a limit + // for a single memtable. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: 0 (disabled) + size_t db_write_buffer_size = 0; + + // The memory usage of memtable will report to this object. The same object + // can be passed into multiple DBs and it will track the sum of size of all + // the DBs. If the total size of all live memtables of all the DBs exceeds + // a limit, a flush will be triggered in the next DB to which the next write + // is issued, as long as there is one or more column family not already + // flushing. + // + // If the object is only passed to one DB, the behavior is the same as + // db_write_buffer_size. When write_buffer_manager is set, the value set will + // override db_write_buffer_size. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: null + std::shared_ptr write_buffer_manager = nullptr; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED }; + AccessHint access_hint_on_compaction_start = NORMAL; + + // If non-zero, we perform bigger reads when doing compaction. If you're + // running RocksDB on spinning disks, you should set this to at least 2MB. + // That way RocksDB's compaction is doing sequential instead of random reads. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + size_t compaction_readahead_size = 0; + + // This is a maximum buffer size that is used by WinMmapReadableFile in + // unbuffered disk I/O mode. We need to maintain an aligned buffer for + // reads. We allow the buffer to grow until the specified value and then + // for bigger requests allocate one shot buffers. In unbuffered mode we + // always bypass read-ahead buffer at ReadaheadRandomAccessFile + // When read-ahead is required we then make use of compaction_readahead_size + // value and always try to read ahead. With read-ahead we always + // pre-allocate buffer to the size instead of growing it up to a limit. + // + // This option is currently honored only on Windows + // + // Default: 1 Mb + // + // Special value: 0 - means do not maintain per instance buffer. Allocate + // per request buffer and avoid locking. + size_t random_access_max_buffer_size = 1024 * 1024; + + // This is the maximum buffer size that is used by WritableFileWriter. + // With direct IO, we need to maintain an aligned buffer for writes. + // We allow the buffer to grow until it's size hits the limit in buffered + // IO and fix the buffer size when using direct IO to ensure alignment of + // write requests if the logical sector size is unusual + // + // Default: 1024 * 1024 (1 MB) + // + // Dynamically changeable through SetDBOptions() API. + size_t writable_file_max_buffer_size = 1024 * 1024; + + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex = false; + + // Create DBOptions with default values for all fields + DBOptions(); + // Create DBOptions from Options + explicit DBOptions(const Options& options); + + void Dump(Logger* log) const; + + // Allows OS to incrementally sync files to disk while they are being + // written, asynchronously, in the background. This operation can be used + // to smooth out write I/Os over time. Users shouldn't rely on it for + // persistence guarantee. + // Issue one request for every bytes_per_sync written. 0 turns it off. + // + // You may consider using rate_limiter to regulate write rate to device. + // When rate limiter is enabled, it automatically enables bytes_per_sync + // to 1MB. + // + // This option applies to table files + // + // Default: 0, turned off + // + // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead + // Dynamically changeable through SetDBOptions() API. + uint64_t bytes_per_sync = 0; + + // Same as bytes_per_sync, but applies to WAL files + // + // Default: 0, turned off + // + // Dynamically changeable through SetDBOptions() API. + uint64_t wal_bytes_per_sync = 0; + + // When true, guarantees WAL files have at most `wal_bytes_per_sync` + // bytes submitted for writeback at any given time, and SST files have at most + // `bytes_per_sync` bytes pending writeback at any given time. This can be + // used to handle cases where processing speed exceeds I/O speed during file + // generation, which can lead to a huge sync when the file is finished, even + // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured. + // + // - If `sync_file_range` is supported it achieves this by waiting for any + // prior `sync_file_range`s to finish before proceeding. In this way, + // processing (compression, etc.) can proceed uninhibited in the gap + // between `sync_file_range`s, and we block only when I/O falls behind. + // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism + // always blocks, thus preventing the interleaving of I/O and processing. + // + // Note: Enabling this option does not provide any additional persistence + // guarantees, as it may use `sync_file_range`, which does not write out + // metadata. + // + // Default: false + bool strict_bytes_per_sync = false; + + // A vector of EventListeners whose callback functions will be called + // when specific RocksDB event happens. + std::vector> listeners; + + // If true, then the status of the threads involved in this DB will + // be tracked and available via GetThreadList() API. + // + // Default: false + bool enable_thread_tracking = false; + + // The limited write rate to DB if soft_pending_compaction_bytes_limit or + // level0_slowdown_writes_trigger is triggered, or we are writing to the + // last mem table allowed and we allow more than 3 mem tables. It is + // calculated using size of user write requests before compression. + // RocksDB may decide to slow down more if the compaction still + // gets behind further. + // If the value is 0, we will infer a value from `rater_limiter` value + // if it is not empty, or 16MB if `rater_limiter` is empty. Note that + // if users change the rate in `rate_limiter` after DB is opened, + // `delayed_write_rate` won't be adjusted. + // + // Unit: byte per second. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + uint64_t delayed_write_rate = 0; + + // By default, a single write thread queue is maintained. The thread gets + // to the head of the queue becomes write batch group leader and responsible + // for writing to WAL and memtable for the batch group. + // + // If enable_pipelined_write is true, separate write thread queue is + // maintained for WAL write and memtable write. A write thread first enter WAL + // writer queue and then memtable writer queue. Pending thread on the WAL + // writer queue thus only have to wait for previous writers to finish their + // WAL writing but not the memtable writing. Enabling the feature may improve + // write throughput and reduce latency of the prepare phase of two-phase + // commit. + // + // Default: false + bool enable_pipelined_write = false; + + // Setting unordered_write to true trades higher write throughput with + // relaxing the immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. + // + // By default, i.e., when it is false, rocksdb does not advance the sequence + // number for new snapshots unless all the writes with lower sequence numbers + // are already finished. This provides the immutability that we except from + // snapshots. Moreover, since Iterator and MultiGet internally depend on + // snapshots, the snapshot immutability results into Iterator and MultiGet + // offering consistent-point-in-time view. If set to true, although + // Read-Your-Own-Write property is still provided, the snapshot immutability + // property is relaxed: the writes issued after the snapshot is obtained (with + // larger sequence numbers) will be still not visible to the reads from that + // snapshot, however, there still might be pending writes (with lower sequence + // number) that will change the state visible to the snapshot after they are + // landed to the memtable. + // + // Default: false + bool unordered_write = false; + + // If true, allow multi-writers to update mem tables in parallel. + // Only some memtable_factory-s support concurrent writes; currently it + // is implemented only for SkipListFactory. Concurrent memtable writes + // are not compatible with inplace_update_support or filter_deletes. + // It is strongly recommended to set enable_write_thread_adaptive_yield + // if you are going to use this feature. + // + // Default: true + bool allow_concurrent_memtable_write = true; + + // If true, threads synchronizing with the write batch group leader will + // wait for up to write_thread_max_yield_usec before blocking on a mutex. + // This can substantially improve throughput for concurrent workloads, + // regardless of whether allow_concurrent_memtable_write is enabled. + // + // Default: true + bool enable_write_thread_adaptive_yield = true; + + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + // + // Default: 1 MB + uint64_t max_write_batch_group_size_bytes = 1 << 20; + + // The maximum number of microseconds that a write operation will use + // a yielding spin loop to coordinate with other write threads before + // blocking on a mutex. (Assuming write_thread_slow_yield_usec is + // set properly) increasing this value is likely to increase RocksDB + // throughput at the expense of increased CPU usage. + // + // Default: 100 + uint64_t write_thread_max_yield_usec = 100; + + // The latency in microseconds after which a std::this_thread::yield + // call (sched_yield on Linux) is considered to be a signal that + // other processes or threads would like to use the current core. + // Increasing this makes writer threads more likely to take CPU + // by spinning, which will show up as an increase in the number of + // involuntary context switches. + // + // Default: 3 + uint64_t write_thread_slow_yield_usec = 3; + + // If true, then DB::Open() will not update the statistics used to optimize + // compaction decision by loading table properties from many files. + // Turning off this feature will improve DBOpen time especially in + // disk environment. + // + // Default: false + bool skip_stats_update_on_db_open = false; + + // If true, then DB::Open() will not fetch and check sizes of all sst files. + // This may significantly speed up startup if there are many sst files, + // especially when using non-default Env with expensive GetFileSize(). + // We'll still check that all required sst files exist. + // If paranoid_checks is false, this option is ignored, and sst files are + // not checked at all. + // + // Default: false + bool skip_checking_sst_file_sizes_on_db_open = false; + + // Recovery mode to control the consistency while replaying WAL + // Default: kPointInTimeRecovery + WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + // if set to false then recovery will fail when a prepared + // transaction is encountered in the WAL + bool allow_2pc = false; + + // A global cache for table-level rows. + // Default: nullptr (disabled) + // Not supported in ROCKSDB_LITE mode! + std::shared_ptr row_cache = nullptr; + +#ifndef ROCKSDB_LITE + // A filter object supplied to be invoked while processing write-ahead-logs + // (WALs) during recovery. The filter provides a way to inspect log + // records, ignoring a particular record or skipping replay. + // The filter is invoked at startup and is invoked from a single-thread + // currently. + WalFilter* wal_filter = nullptr; +#endif // ROCKSDB_LITE + + // If true, then DB::Open / CreateColumnFamily / DropColumnFamily + // SetOptions will fail if options file is not properly persisted. + // + // DEFAULT: false + bool fail_if_options_file_error = false; + + // If true, then print malloc stats together with rocksdb.stats + // when printing to LOG. + // DEFAULT: false + bool dump_malloc_stats = false; + + // By default RocksDB replay WAL logs and flush them on DB open, which may + // create very small SST files. If this option is enabled, RocksDB will try + // to avoid (but not guarantee not to) flush during recovery. Also, existing + // WAL logs will be kept, so that if crash happened before flush, we still + // have logs to recover from. + // + // DEFAULT: false + bool avoid_flush_during_recovery = false; + + // By default RocksDB will flush all memtables on DB close if there are + // unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup + // DB close. Unpersisted data WILL BE LOST. + // + // DEFAULT: false + // + // Dynamically changeable through SetDBOptions() API. + bool avoid_flush_during_shutdown = false; + + // Set this option to true during creation of database if you want + // to be able to ingest behind (call IngestExternalFile() skipping keys + // that already exist, rather than overwriting matching keys). + // Setting this option to true will affect 2 things: + // 1) Disable some internal optimizations around SST file compression + // 2) Reserve bottom-most level for ingested files only. + // 3) Note that num_levels should be >= 3 if this option is turned on. + // + // DEFAULT: false + // Immutable. + bool allow_ingest_behind = false; + + // If enabled it uses two queues for writes, one for the ones with + // disable_memtable and one for the ones that also write to memtable. This + // allows the memtable writes not to lag behind other writes. It can be used + // to optimize MySQL 2PC in which only the commits, which are serial, write to + // memtable. + bool two_write_queues = false; + + // If true WAL is not flushed automatically after each write. Instead it + // relies on manual invocation of FlushWAL to write the WAL buffer to its + // file. + bool manual_wal_flush = false; + + // This feature is WORK IN PROGRESS + // If enabled WAL records will be compressed before they are written. + // Only zstd is supported. Compressed WAL records will be read in supported + // versions regardless of the wal_compression settings. + CompressionType wal_compression = kNoCompression; + + // If true, RocksDB supports flushing multiple column families and committing + // their results atomically to MANIFEST. Note that it is not + // necessary to set atomic_flush to true if WAL is always enabled since WAL + // allows the database to be restored to the last persistent state in WAL. + // This option is useful when there are column families with writes NOT + // protected by WAL. + // For manual flush, application has to specify which column families to + // flush atomically in DB::Flush. + // For auto-triggered flush, RocksDB atomically flushes ALL column families. + // + // Currently, any WAL-enabled writes after atomic flush may be replayed + // independently if the process crashes later and tries to recover. + bool atomic_flush = false; + + // If true, working thread may avoid doing unnecessary and long-latency + // operation (such as deleting obsolete files directly or deleting memtable) + // and will instead schedule a background job to do it. + // Use it if you're latency-sensitive. + // If set to true, takes precedence over + // ReadOptions::background_purge_on_iterator_cleanup. + bool avoid_unnecessary_blocking_io = false; + + // Historically DB ID has always been stored in Identity File in DB folder. + // If this flag is true, the DB ID is written to Manifest file in addition + // to the Identity file. By doing this 2 problems are solved + // 1. We don't checksum the Identity file where as Manifest file is. + // 2. Since the source of truth for DB is Manifest file DB ID will sit with + // the source of truth. Previously the Identity file could be copied + // independent of Manifest and that can result in wrong DB ID. + // We recommend setting this flag to true. + // Default: false + bool write_dbid_to_manifest = false; + + // The number of bytes to prefetch when reading the log. This is mostly useful + // for reading a remotely located log, as it can save the number of + // round-trips. If 0, then the prefetching is disabled. + // + // Default: 0 + size_t log_readahead_size = 0; + + // If user does NOT provide the checksum generator factory, the file checksum + // will NOT be used. A new file checksum generator object will be created + // when a SST file is created. Therefore, each created FileChecksumGenerator + // will only be used from a single thread and so does not need to be + // thread-safe. + // + // Default: nullptr + std::shared_ptr file_checksum_gen_factory = nullptr; + + // By default, RocksDB recovery fails if any table/blob file referenced in the + // final version reconstructed from the + // MANIFEST are missing after scanning the MANIFEST pointed to by the + // CURRENT file. It can also fail if verification of unique SST id fails. + // Best-efforts recovery is another recovery mode that does not necessarily + // fail when certain table/blob files are missing/corrupted or have mismatched + // unique id table property. Instead, best-efforts recovery recovers each + // column family to a point in the MANIFEST that corresponds to a version. In + // such a version, all valid table/blob files referenced have the expected + // file size. For table files, their unique id table property match the + // MANIFEST. + // + // Best-efforts recovery does not need a valid CURRENT file, and tries to + // recover the database using one of the available MANIFEST files in the db + // directory. + // Best-efforts recovery tries the available MANIFEST files from high file + // numbers (newer) to low file numbers (older), and stops after finding the + // first MANIFEST file from which the db can be recovered to a state without + // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob + // files. It is possible that the database can be restored to an empty state + // with no table or blob files. + // + // Regardless of this option, the IDENTITY file + // is updated if needed during recovery to match the DB ID in the MANIFEST (if + // previously using write_dbid_to_manifest) or to be in some valid state + // (non-empty DB ID). Currently, not compatible with atomic flush. + // Furthermore, WAL files will not be used for recovery if + // best_efforts_recovery is true. Also requires either 1) LOCK file exists or + // 2) underlying env's LockFile() call returns ok even for non-existing LOCK + // file. + // + // Default: false + bool best_efforts_recovery = false; + + // It defines how many times db resume is called by a separate thread when + // background retryable IO Error happens. When background retryable IO + // Error happens, SetBGError is called to deal with the error. If the error + // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + // then db resume is called in background to recover from the error. If this + // value is 0 or negative, db resume will not be called. + // + // Default: INT_MAX + int max_bgerror_resume_count = INT_MAX; + + // If max_bgerror_resume_count is >= 2, db resume is called multiple times. + // This option decides how long to wait to retry the next resume if the + // previous resume fails and satisfy redo resume conditions. + // + // Default: 1000000 (microseconds). + uint64_t bgerror_resume_retry_interval = 1000000; + + // It allows user to opt-in to get error messages containing corrupted + // keys/values. Corrupt keys, values will be logged in the + // messages/logs/status that will help users with the useful information + // regarding affected data. By default value is set false to prevent users + // data to be exposed in the logs/messages etc. + // + // Default: false + bool allow_data_in_errors = false; + + // A string identifying the machine hosting the DB. This + // will be written as a property in every SST file written by the DB (or + // by offline writers such as SstFileWriter and RepairDB). It can be useful + // for troubleshooting in memory corruption caused by a failing host when + // writing a file, by tracing back to the writing host. These corruptions + // may not be caught by the checksum since they happen before checksumming. + // If left as default, the table writer will substitute it with the actual + // hostname when writing the SST file. If set to an empty string, the + // property will not be written to the SST file. + // + // Default: hostname + std::string db_host_id = kHostnameForDbHostId; + + // Use this if your DB want to enable checksum handoff for specific file + // types writes. Make sure that the File_system you use support the + // crc32c checksum verification + // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile. + // NOTE: currently RocksDB only generates crc32c based checksum for the + // handoff. If the storage layer has different checksum support, user + // should enble this set as empty. Otherwise,it may cause unexpected + // write failures. + FileTypeSet checksum_handoff_file_types; + + // EXPERIMENTAL + // CompactionService is a feature allows the user to run compactions on a + // different host or process, which offloads the background load from the + // primary host. + // It's an experimental feature, the interface will be changed without + // backward/forward compatibility support for now. Some known issues are still + // under development. + std::shared_ptr compaction_service = nullptr; + + // It indicates, which lowest cache tier we want to + // use for a certain DB. Currently we support volatile_tier and + // non_volatile_tier. They are layered. By setting it to kVolatileTier, only + // the block cache (current implemented volatile_tier) is used. So + // cache entries will not spill to secondary cache (current + // implemented non_volatile_tier), and block cache lookup misses will not + // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use + // both block cache and secondary cache. + // + // Default: kNonVolatileBlockTier + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + + // If set to false, when compaction or flush sees a SingleDelete followed by + // a Delete for the same user key, compaction job will not fail. + // Otherwise, compaction job will fail. + // This is a temporary option to help existing use cases migrate, and + // will be removed in a future release. + // Warning: do not set to false unless you are trying to migrate existing + // data in which the contract of single delete + // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced, + // thus has Delete mixed with SingleDelete for the same user key. Violation + // of the contract leads to undefined behaviors with high possibility of data + // inconsistency, e.g. deleted old data become visible again, etc. + bool enforce_single_del_contracts = true; +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options : public DBOptions, public ColumnFamilyOptions { + // Create an Options object with default values for all fields. + Options() : DBOptions(), ColumnFamilyOptions() {} + + Options(const DBOptions& db_options, + const ColumnFamilyOptions& column_family_options) + : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} + + // Change to some default settings from an older version. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. + Options* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + void Dump(Logger* log) const; + + void DumpCFOptions(Logger* log) const; + + // Some functions that make it easier to optimize RocksDB + + // Set appropriate parameters for bulk loading. + // The reason that this is a function that returns "this" instead of a + // constructor is to enable chaining of multiple similar calls in the future. + // + + // All data will be in level 0 without any automatic compaction. + // It's recommended to manually call CompactRange(NULL, NULL) before reading + // from the database, because otherwise the read can be very slow. + Options* PrepareForBulkLoad(); + + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + Options* OptimizeForSmallDb(); + + // Disable some checks that should not be necessary in the absence of + // software logic errors or CPU+memory hardware errors. This can improve + // write speeds but is only recommended for temporary use. Does not + // change protection against corrupt storage (e.g. verify_checksums). + Options* DisableExtraChecks(); +}; + +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies kBlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +enum ReadTier { + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1, // data in memtable or block cache + kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option + // will skip data in memtable. + // Note that this ReadTier currently only supports + // Get and MultiGet and does not support iterators. + kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators. +}; + +// Options that control read operations +struct ReadOptions { + // If "snapshot" is non-nullptr, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is nullptr, use an implicit + // snapshot of the state at the beginning of this read operation. + // Default: nullptr + const Snapshot* snapshot; + + // `iterate_lower_bound` defines the smallest key at which the backward + // iterator can return an entry. Once the bound is passed, Valid() will be + // false. `iterate_lower_bound` is inclusive ie the bound value is a valid + // entry. + // + // If prefix_extractor is not null, the Seek target and `iterate_lower_bound` + // need to have the same prefix. This is because ordering is not guaranteed + // outside of prefix domain. + // + // In case of user_defined timestamp, if enabled, iterate_lower_bound should + // point to key without timestamp part. + // Default: nullptr + const Slice* iterate_lower_bound; + + // "iterate_upper_bound" defines the extent up to which the forward iterator + // can return entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If prefix_extractor is not null: + // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used + // to infer whether prefix iterating (e.g. applying prefix bloom filter) + // can be used within RocksDB. This is done by comparing + // iterate_upper_bound with the seek key. + // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes + // effect if it shares the same prefix as the seek key. If + // iterate_upper_bound is outside the prefix of the seek key, then keys + // returned outside the prefix range will be undefined, just as if + // iterate_upper_bound = null. + // If iterate_upper_bound is not null, SeekToLast() will position the iterator + // at the first key smaller than iterate_upper_bound. + // + // In case of user_defined timestamp, if enabled, iterate_upper_bound should + // point to key without timestamp part. + // Default: nullptr + const Slice* iterate_upper_bound; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file. The readahead starts at 8KB and doubles on every + // additional read up to 256KB. + // This option can help if most of the range scans are large, and if it is + // determined that a larger readahead than that enabled by auto-readahead is + // needed. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + // Default: 0 + size_t readahead_size; + + // A threshold for the number of keys that can be skipped before failing an + // iterator seek as incomplete. The default value of 0 should be used to + // never fail a request as incomplete, even on skipping too many keys. + // Default: 0 + uint64_t max_skippable_internal_keys; + + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::Incomplete is returned. + // Default: kReadAllTier + ReadTier read_tier; + + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums; + + // Should the "data block"/"index block" read for this iteration be placed in + // block cache? + // Callers may wish to set this field to false for bulk scans. + // This would help not to the change eviction order of existing items in the + // block cache. + // Default: true + bool fill_cache; + + // Specify to create a tailing iterator -- a special iterator that has a + // view of the complete database (i.e. it can also be used to read newly + // added data) and is optimized for sequential reads. It will return records + // that were inserted into the database after the creation of the iterator. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool tailing; + + // This options is not used anymore. It was to turn on a functionality that + // has been removed. + bool managed; + + // Enable a total order seek regardless of index format (e.g. hash index) + // used in the table. Some table format (e.g. plain table) may not support + // this option. + // If true when calling Get(), we also skip prefix bloom when reading from + // block based table, which only affects Get() performance. + // Default: false + bool total_order_seek; + + // When true, by default use total_order_seek = true, and RocksDB can + // selectively enable prefix seek mode if won't generate a different result + // from total_order_seek, based on seek key, and iterator upper bound. + // Not supported in ROCKSDB_LITE mode, in the way that even with value true + // prefix mode is not used. + // BUG: Using Comparator::IsSameLengthImmediateSuccessor and + // SliceTransform::FullLengthEnabled to enable prefix mode in cases where + // prefix of upper bound differs from prefix of seek key has a flaw. + // If present in the DB, "short keys" (shorter than "full length" prefix) + // can be omitted from auto_prefix_mode iteration when they would be present + // in total_order_seek iteration, regardless of whether the short keys are + // "in domain" of the prefix extractor. This is not an issue if no short + // keys are added to DB or are not expected to be returned by such + // iterators. (We are also assuming the new condition on + // IsSameLengthImmediateSuccessor is satisfied; see its BUG section). + // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG". + // Default: false + bool auto_prefix_mode; + + // Enforce that the iterator only iterates over the same prefix as the seek. + // This option is effective only for prefix seeks, i.e. prefix_extractor is + // non-null for the column family and total_order_seek is false. Unlike + // iterate_upper_bound, prefix_same_as_start only works within a prefix + // but in both directions. + // Default: false + bool prefix_same_as_start; + + // Keep the blocks loaded by the iterator pinned in memory as long as the + // iterator is not deleted, If used when reading from tables created with + // BlockBasedTableOptions::use_delta_encoding = false, + // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to + // return 1. + // Default: false + bool pin_data; + + // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we + // schedule a background job in the flush job queue and delete obsolete files + // in background. + // Default: false + bool background_purge_on_iterator_cleanup; + + // If true, range tombstones handling will be skipped in key lookup paths. + // For DB instances that don't use DeleteRange() calls, this setting can + // be used to optimize the read performance. + // Note that, if this assumption (of no previous DeleteRange() calls) is + // broken, stale keys could be served in read paths. + // Default: false + bool ignore_range_deletions; + + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. This option only affects Iterators and has + // no impact on point lookups. + // Default: empty (every table will be scanned) + std::function table_filter; + + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // For iterator, iter_start_ts is the lower bound (older) and timestamp + // serves as the upper bound. Versions of the same record that fall in + // the timestamp range will be returned. If iter_start_ts is nullptr, + // only the most recent version visible to timestamp is returned. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + // Default: nullptr + const Slice* timestamp; + const Slice* iter_start_ts; + + // Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + // in microseconds. + // It should be set to microseconds since epoch, i.e, gettimeofday or + // equivalent plus allowed duration in microseconds. The best way is to use + // env->NowMicros() + some timeout. + // This is best efforts. The call may exceed the deadline if there is IO + // involved and the file system doesn't support deadlines, or due to + // checking for deadline periodically rather than for every key if + // processing a batch + std::chrono::microseconds deadline; + + // A timeout in microseconds to be passed to the underlying FileSystem for + // reads. As opposed to deadline, this determines the timeout for each + // individual file read request. If a MultiGet/Get/Seek/Next etc call + // results in multiple reads, each read can last up to io_timeout us. + std::chrono::microseconds io_timeout; + + // It limits the maximum cumulative value size of the keys in batch while + // reading through MultiGet. Once the cumulative value size exceeds this + // soft limit then all the remaining keys are returned with status Aborted. + // + // Default: std::numeric_limits::max() + uint64_t value_size_soft_limit; + + // For iterators, RocksDB does auto-readahead on noticing more than two + // sequential reads for a table file if user doesn't provide readahead_size. + // The readahead starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size only when reads are sequential. However at each + // level, if iterator moves over next file, readahead_size starts again from + // 8KB. + // + // By enabling this option, RocksDB will do some enhancements for + // prefetching the data. + // + // Default: false + bool adaptive_readahead; + + // For file reads associated with this option, charge the internal rate + // limiter (see `DBOptions::rate_limiter`) at the specified priority. The + // special value `Env::IO_TOTAL` disables charging the rate limiter. + // + // The rate limiting is bypassed no matter this option's value for file reads + // on plain tables (these can exist when `ColumnFamilyOptions::table_factory` + // is a `PlainTableFactory`) and cuckoo tables (these can exist when + // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`). + // + // The bytes charged to rate limiter may not exactly match the file read bytes + // since there are some seemingly insignificant reads, like for file + // headers/footers, that we currently do not charge to rate limiter. + // + // Default: `Env::IO_TOTAL`. + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + + // Experimental + // + // If async_io is enabled, RocksDB will prefetch some of data asynchronously. + // RocksDB apply it if reads are sequential and its internal automatic + // prefetching. + // + // Default: false + bool async_io; + + // Experimental + // + // If async_io is set, then this flag controls whether we read SST files + // in multiple levels asynchronously. Enabling this flag can help reduce + // MultiGet latency by maximizing the number of SST files read in + // parallel if the keys in the MultiGet batch are in different levels. It + // comes at the expense of slightly higher CPU overhead. + // + // Default: true + bool optimize_multiget_for_io; + + ReadOptions(); + ReadOptions(bool cksum, bool cache); +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fdatasync()". + // + // Default: false + bool sync; + + // If true, writes will not first go to the write ahead log, + // and the write may get lost after a crash. The backup engine + // relies on write-ahead logs to back up the memtable, so if + // you disable write-ahead logs, you must create backups with + // flush_before_backup=true to avoid losing unflushed memtable data. + // Default: false + bool disableWAL; + + // If true and if user is trying to write to column families that don't exist + // (they were dropped), ignore the write (don't return an error). If there + // are multiple writes in a WriteBatch, other writes will succeed. + // Default: false + bool ignore_missing_column_families; + + // If true and we need to wait or sleep for the write request, fails + // immediately with Status::Incomplete(). + // Default: false + bool no_slowdown; + + // If true, this write request is of lower priority if compaction is + // behind. In this case, no_slowdown = true, the request will be canceled + // immediately with Status::Incomplete() returned. Otherwise, it will be + // slowed down. The slowdown value is determined by RocksDB to guarantee + // it introduces minimum impacts to high priority writes. + // + // Default: false + bool low_pri; + + // If true, this writebatch will maintain the last insert positions of each + // memtable as hints in concurrent write. It can improve write performance + // in concurrent writes if keys in one writebatch are sequential. In + // non-concurrent writes (when concurrent_memtable_writes is false) this + // option will be ignored. + // + // Default: false + bool memtable_insert_hint_per_batch; + + // For writes associated with this option, charge the internal rate + // limiter (see `DBOptions::rate_limiter`) at the specified priority. The + // special value `Env::IO_TOTAL` disables charging the rate limiter. + // + // Currently the support covers automatic WAL flushes, which happen during + // live updates (`Put()`, `Write()`, `Delete()`, etc.) + // when `WriteOptions::disableWAL == false` + // and `DBOptions::manual_wal_flush == false`. + // + // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed + // due to implementation constraints. + // + // Default: `Env::IO_TOTAL` + Env::IOPriority rate_limiter_priority; + + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + // + // Default: zero (disabled). + size_t protection_bytes_per_key; + + WriteOptions() + : sync(false), + disableWAL(false), + ignore_missing_column_families(false), + no_slowdown(false), + low_pri(false), + memtable_insert_hint_per_batch(false), + rate_limiter_priority(Env::IO_TOTAL), + protection_bytes_per_key(0) {} +}; + +// Options that control flush operations +struct FlushOptions { + // If true, the flush will wait until the flush is done. + // Default: true + bool wait; + // If true, the flush would proceed immediately even it means writes will + // stall for the duration of the flush; if false the operation will wait + // until it's possible to do flush w/o causing stall or until required flush + // is performed by someone else (foreground call or background thread). + // Default: false + bool allow_write_stall; + FlushOptions() : wait(true), allow_write_stall(false) {} +}; + +// Create a Logger from provided DBOptions +extern Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr* logger); + +// CompactionOptions are used in CompactFiles() call. +struct CompactionOptions { + // Compaction output compression type + // Default: snappy + // If set to `kDisableCompressionOption`, RocksDB will choose compression type + // according to the `ColumnFamilyOptions`, taking into account the output + // level if `compression_per_level` is specified. + CompressionType compression; + // Compaction will create files of size `output_file_size_limit`. + // Default: MAX, which means that compaction will create a single file + uint64_t output_file_size_limit; + // If > 0, it will replace the option in the DBOptions for this compaction. + uint32_t max_subcompactions; + + CompactionOptions() + : compression(kSnappyCompression), + output_file_size_limit(std::numeric_limits::max()), + max_subcompactions(0) {} +}; + +// For level based compaction, we can configure if we want to skip/force +// bottommost level compaction. +enum class BottommostLevelCompaction { + // Skip bottommost level compaction + kSkip, + // Only compact bottommost level if there is a compaction filter + // This is the default option + kIfHaveCompactionFilter, + // Always compact bottommost level + kForce, + // Always compact bottommost level but in bottommost level avoid + // double-compacting files created in the same compaction + kForceOptimized, +}; + +// For manual compaction, we can configure if we want to skip/force garbage +// collection of blob files. +enum class BlobGarbageCollectionPolicy { + // Force blob file garbage collection. + kForce, + // Skip blob file garbage collection. + kDisable, + // Inherit blob file garbage collection policy from ColumnFamilyOptions. + kUseDefault, +}; + +// CompactRangeOptions is used by CompactRange() call. +struct CompactRangeOptions { + // If true, no other compaction will run at the same time as this + // manual compaction. + // + // Default: false + bool exclusive_manual_compaction = false; + + // If true, compacted files will be moved to the minimum level capable + // of holding the data or given level (specified non-negative target_level). + bool change_level = false; + // If change_level is true and target_level have non-negative value, compacted + // files will be moved to target_level. + int target_level = -1; + // Compaction outputs will be placed in options.db_paths[target_path_id]. + // Behavior is undefined if target_path_id is out of range. + uint32_t target_path_id = 0; + // By default level based compaction will only compact the bottommost level + // if there is a compaction filter + BottommostLevelCompaction bottommost_level_compaction = + BottommostLevelCompaction::kIfHaveCompactionFilter; + // If true, will execute immediately even if doing so would cause the DB to + // enter write stall mode. Otherwise, it'll sleep until load is low enough. + bool allow_write_stall = false; + // If > 0, it will replace the option in the DBOptions for this compaction. + uint32_t max_subcompactions = 0; + // Set user-defined timestamp low bound, the data with older timestamp than + // low bound maybe GCed by compaction. Default: nullptr + const Slice* full_history_ts_low = nullptr; + + // Allows cancellation of an in-progress manual compaction. + // + // Cancellation can be delayed waiting on automatic compactions when used + // together with `exclusive_manual_compaction == true`. + std::atomic* canceled = nullptr; + // NOTE: Calling DisableManualCompaction() overwrites the uer-provided + // canceled variable in CompactRangeOptions. + // Typically, when CompactRange is being called in one thread (t1) with + // canceled = false, and DisableManualCompaction is being called in the + // other thread (t2), manual compaction is disabled normally, even if the + // compaction iterator may still scan a few items before *canceled is + // set to true + + // If set to kForce, RocksDB will override enable_blob_file_garbage_collection + // to true; if set to kDisable, RocksDB will override it to false, and + // kUseDefault leaves the setting in effect. This enables customers to both + // force-enable and force-disable GC when calling CompactRange. + BlobGarbageCollectionPolicy blob_garbage_collection_policy = + BlobGarbageCollectionPolicy::kUseDefault; + + // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff + // from ColumnFamilyOptions in effect. Otherwise, it will override the + // user-provided setting. This enables customers to selectively override the + // age cutoff. + double blob_garbage_collection_age_cutoff = -1; +}; + +// IngestExternalFileOptions is used by IngestExternalFile() +struct IngestExternalFileOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; + // If set to false, an ingested file keys could appear in existing snapshots + // that where created before the file was ingested. + bool snapshot_consistency = true; + // If set to false, IngestExternalFile() will fail if the file key range + // overlaps with existing keys or tombstones in the DB. + bool allow_global_seqno = true; + // If set to false and the file key range overlaps with the memtable key range + // (memtable flush required), IngestExternalFile will fail. + bool allow_blocking_flush = true; + // Set to true if you would like duplicate keys in the file being ingested + // to be skipped rather than overwriting existing data under that key. + // Use case: back-fill of some historical data in the database without + // over-writing existing newer version of data. + // This option could only be used if the DB has been running + // with allow_ingest_behind=true since the dawn of time. + // All files will be ingested at the bottommost level with seqno=0. + bool ingest_behind = false; + // Set to true if you would like to write global_seqno to a given offset in + // the external SST file for backward compatibility. Older versions of + // RocksDB writes a global_seqno to a given offset within ingested SST files, + // and new versions of RocksDB do not. If you ingest an external SST using + // new version of RocksDB and would like to be able to downgrade to an + // older version of RocksDB, you should set 'write_global_seqno' to true. If + // your service is just starting to use the new RocksDB, we recommend that + // you set this option to false, which brings two benefits: + // 1. No extra random write for global_seqno during ingestion. + // 2. Without writing external SST file, it's possible to do checksum. + // We have a plan to set this option to false by default in the future. + bool write_global_seqno = true; + // Set to true if you would like to verify the checksums of each block of the + // external SST file before ingestion. + // Warning: setting this to true causes slowdown in file ingestion because + // the external SST file has to be read. + bool verify_checksums_before_ingest = false; + // When verify_checksums_before_ingest = true, RocksDB uses default + // readahead setting to scan the file while verifying checksums before + // ingestion. + // Users can override the default value using this option. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + size_t verify_checksums_readahead_size = 0; + // Set to TRUE if user wants to verify the sst file checksum of ingested + // files. The DB checksum function will generate the checksum of each + // ingested file (if file_checksum_gen_factory is set) and compare the + // checksum function name and checksum with the ingested checksum information. + // + // If this option is set to True: 1) if DB does not enable checksum + // (file_checksum_gen_factory == nullptr), the ingested checksum information + // will be ignored; 2) If DB enable the checksum function, we calculate the + // sst file checksum after the file is moved or copied and compare the + // checksum and checksum name. If checksum or checksum function name does + // not match, ingestion will be failed. If the verification is successful, + // checksum and checksum function name will be stored in Manifest. + // If this option is set to FALSE, 1) if DB does not enable checksum, + // the ingested checksum information will be ignored; 2) if DB enable the + // checksum, we only verify the ingested checksum function name and we + // trust the ingested checksum. If the checksum function name matches, we + // store the checksum in Manifest. DB does not calculate the checksum during + // ingestion. However, if no checksum information is provided with the + // ingested files, DB will generate the checksum and store in the Manifest. + bool verify_file_checksum = true; + // Set to TRUE if user wants file to be ingested to the bottommost level. An + // error of Status::TryAgain() will be returned if a file cannot fit in the + // bottommost level when calling + // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear + // the bottommost level in the overlapping range before re-attempt. + // + // ingest_behind takes precedence over fail_if_not_bottommost_level. + bool fail_if_not_bottommost_level = false; +}; + +enum TraceFilterType : uint64_t { + // Trace all the operations + kTraceFilterNone = 0x0, + // Do not trace the get operations + kTraceFilterGet = 0x1 << 0, + // Do not trace the write operations + kTraceFilterWrite = 0x1 << 1, + // Do not trace the `Iterator::Seek()` operations + kTraceFilterIteratorSeek = 0x1 << 2, + // Do not trace the `Iterator::SeekForPrev()` operations + kTraceFilterIteratorSeekForPrev = 0x1 << 3, + // Do not trace the `MultiGet()` operations + kTraceFilterMultiGet = 0x1 << 4, +}; + +// TraceOptions is used for StartTrace +struct TraceOptions { + // To avoid the trace file size grows large than the storage space, + // user can set the max trace file size in Bytes. Default is 64GB + uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024; + // Specify trace sampling option, i.e. capture one per how many requests. + // Default to 1 (capture every request). + uint64_t sampling_frequency = 1; + // Note: The filtering happens before sampling. + uint64_t filter = kTraceFilterNone; + // When true, the order of write records in the trace will match the order of + // the corresponding write records in the WAL and applied to the DB. There may + // be a performance penalty associated with preserving this ordering. + // + // Default: false. This means write records in the trace may be in an order + // different from the WAL's order. + bool preserve_write_order = false; +}; + +// ImportColumnFamilyOptions is used by ImportColumnFamily() +struct ImportColumnFamilyOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; +}; + +// Options used with DB::GetApproximateSizes() +struct SizeApproximationOptions { + // Defines whether the returned size should include the recently written + // data in the memtables. If set to false, include_files must be true. + bool include_memtables = false; + // Defines whether the returned size should include data serialized to disk. + // If set to false, include_memtables must be true. + bool include_files = true; + // When approximating the files total size that is used to store a keys range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows to take some + // shortcuts in files size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned files size + // approximation will be within 10%. + // If the value is non-positive - a more precise yet more CPU intensive + // estimation is performed. + double files_size_error_margin = -1.0; +}; + +struct CompactionServiceOptionsOverride { + // Currently pointer configurations are not passed to compaction service + // compaction so the user needs to set it. It will be removed once pointer + // configuration passing is supported. + Env* env = Env::Default(); + std::shared_ptr file_checksum_gen_factory = nullptr; + + const Comparator* comparator = BytewiseComparator(); + std::shared_ptr merge_operator = nullptr; + const CompactionFilter* compaction_filter = nullptr; + std::shared_ptr compaction_filter_factory = nullptr; + std::shared_ptr prefix_extractor = nullptr; + std::shared_ptr table_factory; + std::shared_ptr sst_partitioner_factory = nullptr; + + // Only subsets of events are triggered in remote compaction worker, like: + // `OnTableFileCreated`, `OnTableFileCreationStarted`, + // `ShouldBeNotifiedOnFileIO` `OnSubcompactionBegin`, + // `OnSubcompactionCompleted`, etc. Worth mentioning, `OnCompactionBegin` and + // `OnCompactionCompleted` won't be triggered. They will be triggered on the + // primary DB side. + std::vector> listeners; + + // statistics is used to collect DB operation metrics, the metrics won't be + // returned to CompactionService primary host, to collect that, the user needs + // to set it here. + std::shared_ptr statistics = nullptr; + + // Only compaction generated SST files use this user defined table properties + // collector. + std::vector> + table_properties_collector_factories; +}; + +struct OpenAndCompactOptions { + // Allows cancellation of an in-progress compaction. + std::atomic* canceled = nullptr; +}; + +#ifndef ROCKSDB_LITE +struct LiveFilesStorageInfoOptions { + // Whether to populate FileStorageInfo::file_checksum* or leave blank + bool include_checksum_info = false; + // Flushes memtables if total size in bytes of live WAL files is >= this + // number (and DB is not read-only). + // Default: always force a flush without checking sizes. + uint64_t wal_size_for_flush = 0; +}; +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h new file mode 100644 index 000000000..cd1dd99f0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/perf_context.h @@ -0,0 +1,274 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +#include "rocksdb/perf_level.h" + +namespace ROCKSDB_NAMESPACE { + +// A thread local context for gathering performance counter efficiently +// and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevel { + // # of times bloom filter has avoided file reads, i.e., negatives. + uint64_t bloom_filter_useful = 0; + // # of times bloom FullFilter has not avoided the reads. + uint64_t bloom_filter_full_positive = 0; + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + uint64_t bloom_filter_full_true_positive = 0; + + // total number of user key returned (only include keys that are found, does + // not include keys that are deleted or merged without a final put + uint64_t user_key_return_count = 0; + + // total nanos spent on reading data from SST files + uint64_t get_from_table_nanos = 0; + + uint64_t block_cache_hit_count = 0; // total number of block cache hits + uint64_t block_cache_miss_count = 0; // total number of block cache misses + + void Reset(); // reset all performance counters to zero +}; + +struct PerfContext { + ~PerfContext(); + + PerfContext() {} + + PerfContext(const PerfContext&); + PerfContext& operator=(const PerfContext&); + PerfContext(PerfContext&&) noexcept; + + void Reset(); // reset all performance counters to zero + + std::string ToString(bool exclude_zero_counters = false) const; + + // enable per level perf context and allocate storage for PerfContextByLevel + void EnablePerLevelPerfContext(); + + // temporarily disable per level perf context by setting the flag to false + void DisablePerLevelPerfContext(); + + // free the space for PerfContextByLevel, also disable per level perf context + void ClearPerLevelPerfContext(); + + uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; // total number of block cache hits + uint64_t block_read_count; // total number of block reads (with IO) + uint64_t block_read_byte; // total number of bytes from block reads + uint64_t block_read_time; // total nanos spent on block reads + uint64_t block_cache_index_hit_count; // total number of index block hits + // total number of standalone handles lookup from secondary cache + uint64_t block_cache_standalone_handle_count; + // total number of real handles lookup from secondary cache that are inserted + // into primary cache + uint64_t block_cache_real_handle_count; + uint64_t index_block_read_count; // total number of index block reads + uint64_t block_cache_filter_hit_count; // total number of filter block hits + uint64_t filter_block_read_count; // total number of filter block reads + uint64_t compression_dict_block_read_count; // total number of compression + // dictionary block reads + + uint64_t secondary_cache_hit_count; // total number of secondary cache hits + // total number of real handles inserted into secondary cache + uint64_t compressed_sec_cache_insert_real_count; + // total number of dummy handles inserted into secondary cache + uint64_t compressed_sec_cache_insert_dummy_count; + // bytes for vals before compression in secondary cache + uint64_t compressed_sec_cache_uncompressed_bytes; + // bytes for vals after compression in secondary cache + uint64_t compressed_sec_cache_compressed_bytes; + + uint64_t block_checksum_time; // total nanos spent on block checksum + uint64_t block_decompress_time; // total nanos spent on block decompression + + uint64_t get_read_bytes; // bytes for vals returned by Get + uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet + uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator + + uint64_t blob_cache_hit_count; // total number of blob cache hits + uint64_t blob_read_count; // total number of blob reads (with IO) + uint64_t blob_read_byte; // total number of bytes from blob reads + uint64_t blob_read_time; // total nanos spent on blob reads + uint64_t blob_checksum_time; // total nanos spent on blob checksum + uint64_t blob_decompress_time; // total nanos spent on blob decompression + + // total number of internal keys skipped over during iteration. + // There are several reasons for it: + // 1. when calling Next(), the iterator is in the position of the previous + // key, so that we'll need to skip it. It means this counter will always + // be incremented in Next(). + // 2. when calling Next(), we need to skip internal entries for the previous + // keys that are overwritten. + // 3. when calling Next(), Seek() or SeekToFirst(), after previous key + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next + // valid key that the operation should place the iterator to. We need + // to skip both of the tombstone and updates hidden by the tombstones. The + // tombstones are not included in this counter, while previous updates + // hidden by the tombstones will be included here. + // 4. symmetric cases for Prev() and SeekToLast() + // internal_recent_skipped_count is not included in this counter. + // + uint64_t internal_key_skipped_count; + // Total number of deletes and single deletes skipped over during iteration + // When calling Next(), Seek() or SeekToFirst(), after previous position + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next valid + // key. Every deleted key is counted once. We don't recount here if there are + // still older updates invalidated by the tombstones. + // + uint64_t internal_delete_skipped_count; + // How many times iterators skipped over internal keys that are more recent + // than the snapshot that iterator is using. + // + uint64_t internal_recent_skipped_count; + // How many values were fed into merge operator by iterators. + // + uint64_t internal_merge_count; + // Number of times we reseeked inside a merging iterator, specifically to skip + // after or before a range of keys covered by a range deletion in a newer LSM + // component. + uint64_t internal_range_del_reseek_count; + + uint64_t get_snapshot_time; // total nanos spent on getting snapshot + uint64_t get_from_memtable_time; // total nanos spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total nanos spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total nanos reading from output files + // total nanos spent on seeking memtable + uint64_t seek_on_memtable_time; + // number of seeks issued on memtable + // (including SeekForPrev but not SeekToFirst and SeekToLast) + uint64_t seek_on_memtable_count; + // number of Next()s issued on memtable + uint64_t next_on_memtable_count; + // number of Prev()s issued on memtable + uint64_t prev_on_memtable_count; + // total nanos spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total nanos spent on the merge min heap + uint64_t seek_max_heap_time; // total nanos spent on the merge max heap + // total nanos spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total nanos spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + + // This group of stats provide a breakdown of time spent by Write(). + // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write + // are enabled. + // + // total nanos spent on writing to WAL + uint64_t write_wal_time; + // total nanos spent on writing to mem tables + uint64_t write_memtable_time; + // total nanos spent on delaying or throttling write + uint64_t write_delay_time; + // total nanos spent on switching memtable/wal and scheduling + // flushes/compactions. + uint64_t write_scheduling_flushes_compactions_time; + // total nanos spent on writing a record, excluding the above four things + uint64_t write_pre_and_post_process_time; + + // time spent waiting for other threads of the batch group + uint64_t write_thread_wait_nanos; + + // time spent on acquiring DB mutex. + uint64_t db_mutex_lock_nanos; + // Time spent on waiting with a condition variable created with DB mutex. + uint64_t db_condition_wait_nanos; + // Time spent on merge operator. + uint64_t merge_operator_time_nanos; + + // Time spent on reading index block from block cache or SST file + uint64_t read_index_block_nanos; + // Time spent on reading filter block from block cache or SST file + uint64_t read_filter_block_nanos; + // Time spent on creating data block iterator + uint64_t new_table_block_iter_nanos; + // Time spent on creating a iterator of an SST file. + uint64_t new_table_iterator_nanos; + // Time spent on seeking a key in data/index blocks + uint64_t block_seek_nanos; + // Time spent on finding or creating a table reader + uint64_t find_table_nanos; + // total number of mem table bloom hits + uint64_t bloom_memtable_hit_count; + // total number of mem table bloom misses + uint64_t bloom_memtable_miss_count; + // total number of SST table bloom hits + uint64_t bloom_sst_hit_count; + // total number of SST table bloom misses + uint64_t bloom_sst_miss_count; + + // Time spent waiting on key locks in transaction lock manager. + uint64_t key_lock_wait_time; + // number of times acquiring a lock was blocked by another transaction. + uint64_t key_lock_wait_count; + + // Total time spent in Env filesystem operations. These are only populated + // when TimedEnv is used. + uint64_t env_new_sequential_file_nanos; + uint64_t env_new_random_access_file_nanos; + uint64_t env_new_writable_file_nanos; + uint64_t env_reuse_writable_file_nanos; + uint64_t env_new_random_rw_file_nanos; + uint64_t env_new_directory_nanos; + uint64_t env_file_exists_nanos; + uint64_t env_get_children_nanos; + uint64_t env_get_children_file_attributes_nanos; + uint64_t env_delete_file_nanos; + uint64_t env_create_dir_nanos; + uint64_t env_create_dir_if_missing_nanos; + uint64_t env_delete_dir_nanos; + uint64_t env_get_file_size_nanos; + uint64_t env_get_file_modification_time_nanos; + uint64_t env_rename_file_nanos; + uint64_t env_link_file_nanos; + uint64_t env_lock_file_nanos; + uint64_t env_unlock_file_nanos; + uint64_t env_new_logger_nanos; + + uint64_t get_cpu_nanos; + uint64_t iter_next_cpu_nanos; + uint64_t iter_prev_cpu_nanos; + uint64_t iter_seek_cpu_nanos; + + // Time spent in encrypting data. Populated when EncryptedEnv is used. + uint64_t encrypt_data_nanos; + // Time spent in decrypting data. Populated when EncryptedEnv is used. + uint64_t decrypt_data_nanos; + + uint64_t number_async_seek; + + std::map* level_to_perf_context = nullptr; + bool per_level_perf_context_enabled = false; +}; + +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. +PerfContext* get_perf_context(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h new file mode 100644 index 000000000..e7dded0e3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/perf_level.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// How much perf stats to collect. Affects perf_context and iostats_context. +enum PerfLevel : unsigned char { + kUninitialized = 0, // unknown setting + kDisable = 1, // disable perf stats + kEnableCount = 2, // enable only count stats + kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time + // stats except for mutexes + // Other than time, also measure CPU time counters. Still don't measure + // time (neither wall time nor CPU time) for mutexes. + kEnableTimeAndCPUTimeExceptForMutex = 4, + kEnableTime = 5, // enable count and time stats + kOutOfBounds = 6 // N.B. Must always be the last value! +}; + +// set the perf stats level for current thread +void SetPerfLevel(PerfLevel level); + +// get current perf stats level for current thread +PerfLevel GetPerfLevel(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h new file mode 100644 index 000000000..f14f01999 --- /dev/null +++ b/src/rocksdb/include/rocksdb/persistent_cache.h @@ -0,0 +1,74 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include + +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// PersistentCache +// +// Persistent cache interface for caching IO pages on a persistent medium. The +// cache interface is specifically designed for persistent read cache. +class PersistentCache { + public: + using StatsType = std::vector>; + + virtual ~PersistentCache() {} + + // Insert to page cache + // + // page_key Identifier to identify a page uniquely across restarts + // data Page data to copy (caller retains ownership) + // size Size of the page + virtual Status Insert(const Slice& key, const char* data, + const size_t size) = 0; + + // Lookup page cache by page identifier + // + // page_key Page identifier + // buf Buffer where the data should be copied + // size Size of the page + virtual Status Lookup(const Slice& key, std::unique_ptr* data, + size_t* size) = 0; + + // True if the cache is configured to store serialized blocks, which are + // potentially compressed and include a trailer (when SST format calls for + // one). False if the cache stores uncompressed blocks (no trailer). + virtual bool IsCompressed() = 0; + + // Return stats as map of {string, double} per-tier + // + // Persistent cache can be initialized as a tier of caches. The stats are per + // tire top-down + virtual StatsType Stats() = 0; + + virtual std::string GetPrintableOptions() const = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharding the same persistent cache to partition the key space. Typically + // the client will allocate a new id at startup and prepend the id to its + // cache keys. + virtual uint64_t NewId() = 0; +}; + +// Factor method to create a new persistent cache +Status NewPersistentCache(Env* const env, const std::string& path, + const uint64_t size, + const std::shared_ptr& log, + const bool optimized_for_nvm, + std::shared_ptr* cache); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h new file mode 100644 index 000000000..9cad6edf4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/rate_limiter.h @@ -0,0 +1,159 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class RateLimiter { + public: + enum class OpType { + kRead, + kWrite, + }; + + enum class Mode { + kReadsOnly, + kWritesOnly, + kAllIo, + }; + + // For API compatibility, default to rate-limiting writes only. + explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {} + + virtual ~RateLimiter() {} + + // This API allows user to dynamically change rate limiter's bytes per second. + // REQUIRED: bytes_per_second > 0 + virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; + + // Deprecated. New RateLimiter derived classes should override + // Request(const int64_t, const Env::IOPriority, Statistics*) or + // Request(const int64_t, const Env::IOPriority, Statistics*, OpType) + // instead. + // + // Request for token for bytes. If this request can not be satisfied, the call + // is blocked. Caller is responsible to make sure + // bytes <= GetSingleBurstBytes() + // and bytes >= 0. + virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) { + assert(false); + } + + // Request for token for bytes and potentially update statistics. If this + // request can not be satisfied, the call is blocked. Caller is responsible to + // make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* /* stats */) { + // For API compatibility, default implementation calls the older API in + // which statistics are unsupported. + Request(bytes, pri); + } + + // Requests token to read or write bytes and potentially updates statistics. + // + // If this request can not be satisfied, the call is blocked. Caller is + // responsible to make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats, OpType op_type) { + if (IsRateLimited(op_type)) { + Request(bytes, pri, stats); + } + } + + // Requests token to read or write bytes and potentially updates statistics. + // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of + // direct I/O) to allocate an appropriate number of bytes, which may be less + // than the number of bytes requested. + virtual size_t RequestToken(size_t bytes, size_t alignment, + Env::IOPriority io_priority, Statistics* stats, + RateLimiter::OpType op_type); + + // Max bytes can be granted in a single burst + virtual int64_t GetSingleBurstBytes() const = 0; + + // Total bytes that go through rate limiter + virtual int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + // Total # of requests that go through rate limiter + virtual int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + // Total # of requests that are pending for bytes in rate limiter + // For convenience, this function is supported by the RateLimiter returned + // by NewGenericRateLimiter but is not required by RocksDB. + // + // REQUIRED: total_pending_request != nullptr + virtual Status GetTotalPendingRequests( + int64_t* total_pending_requests, + const Env::IOPriority pri = Env::IO_TOTAL) const { + assert(total_pending_requests != nullptr); + (void)total_pending_requests; + (void)pri; + return Status::NotSupported(); + } + + virtual int64_t GetBytesPerSecond() const = 0; + + virtual bool IsRateLimited(OpType op_type) { + if ((mode_ == RateLimiter::Mode::kWritesOnly && + op_type == RateLimiter::OpType::kRead) || + (mode_ == RateLimiter::Mode::kReadsOnly && + op_type == RateLimiter::OpType::kWrite)) { + return false; + } + return true; + } + + protected: + Mode GetMode() { return mode_; } + + private: + const Mode mode_; +}; + +// Create a RateLimiter object, which can be shared among RocksDB instances to +// control write rate of flush and compaction. +// @rate_bytes_per_sec: this is the only parameter you want to set most of the +// time. It controls the total write rate of compaction and flush in bytes per +// second. Currently, RocksDB does not enforce rate limit for anything other +// than flush and compaction, e.g. write to WAL. +// @refill_period_us: this controls how often tokens are refilled. For example, +// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to +// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to +// burstier writes while smaller value introduces more CPU overhead. +// The default should work for most cases. +// @fairness: RateLimiter accepts high-pri requests and low-pri requests. +// A low-pri request is usually blocked in favor of hi-pri request. Currently, +// RocksDB assigns low-pri to request from compaction and high-pri to request +// from flush. Low-pri requests can get blocked if flush requests come in +// continuously. This fairness parameter grants low-pri requests permission by +// 1/fairness chance even though high-pri requests exist to avoid starvation. +// You should be good by leaving it at default 10. +// @mode: Mode indicates which types of operations count against the limit. +// @auto_tuned: Enables dynamic adjustment of rate limit within the range +// `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to +// the recent demand for background I/O. +extern RateLimiter* NewGenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000, + int32_t fairness = 10, + RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly, + bool auto_tuned = false); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h new file mode 100644 index 000000000..a339ec2aa --- /dev/null +++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +// For testing purposes +#if ROCKSDB_NAMESPACE == 42 +#undef ROCKSDB_NAMESPACE +#endif + +// Normal logic +#ifndef ROCKSDB_NAMESPACE +#define ROCKSDB_NAMESPACE rocksdb +#endif diff --git a/src/rocksdb/include/rocksdb/secondary_cache.h b/src/rocksdb/include/rocksdb/secondary_cache.h new file mode 100644 index 000000000..a6a8c8b1d --- /dev/null +++ b/src/rocksdb/include/rocksdb/secondary_cache.h @@ -0,0 +1,133 @@ +// Copyright (c) 2021, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A handle for lookup result. The handle may not be immediately ready or +// have a valid value. The caller must call isReady() to determine if its +// ready, and call Wait() in order to block until it becomes ready. +// The caller must call value() after it becomes ready to determine if the +// handle successfullly read the item. +class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() = default; + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SecondaryCache : public Customizable { + public: + ~SecondaryCache() override = default; + + static const char* Type() { return "SecondaryCache"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Insert the given value into this cache. Ownership of `value` is + // transferred to the callee, who is reponsible for deleting the value + // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(), + // the callee is responsible for such cleanup even in case of non-OK + // Status. + // Typically, the value is not saved directly but the implementation + // uses the SaveToCallback provided by helper to extract value's + // persistable data (typically uncompressed block), which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status + // is success (OK). + // + // If the implementation is asynchronous or otherwise uses `value` after + // the call returns, then InsertSaved() must be overridden not to rely on + // Insert(). For example, there could be a "holding area" in memory where + // Lookup() might return the same parsed value back. But more typically, if + // the implementation only uses `value` for getting persistable data during + // the call, then the default implementation of `InsertSaved()` suffices. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Insert a value from its saved/persistable data (typically uncompressed + // block), as if generated by SaveToCallback/SizeCallback. This can be used + // in "warming up" the cache from some auxiliary source, and like Insert() + // may or may not write it to cache depending on the admission control + // policy, even if the return status is success. + // + // The default implementation assumes synchronous, non-escaping Insert(), + // wherein `value` is not used after return of Insert(). See Insert(). + virtual Status InsertSaved(const Slice& key, const Slice& saved); + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready. + // + // advise_erase is a hint from the primary cache indicating that the handle + // will be cached there, so the secondary cache is advised to drop it from + // the cache as an optimization. To use this feature, SupportForceErase() + // needs to return true. + // This hint can also be safely ignored. + // + // is_in_sec_cache is to indicate whether the handle is possibly erased + // from the secondary cache after the Lookup. + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait, + bool advise_erase, bool& is_in_sec_cache) = 0; + + // Indicate whether a handle can be erased in this secondary cache. + [[nodiscard]] virtual bool SupportForceErase() const = 0; + + // At the discretion of the implementation, erase the data associated + // with key. + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. + virtual void WaitAll(std::vector handles) = 0; + + // Set the maximum configured capacity of the cache. + // When the new capacity is less than the old capacity and the existing usage + // is greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage. + // + // The derived class can make this function no-op and return NotSupported(). + virtual Status SetCapacity(size_t /* capacity */) { + return Status::NotSupported(); + } + + // The derived class can make this function no-op and return NotSupported(). + virtual Status GetCapacity(size_t& /* capacity */) { + return Status::NotSupported(); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h new file mode 100644 index 000000000..0d7eb5949 --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice.h @@ -0,0 +1,264 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#pragma once + +#include +#include +#include +#include +#include +#include // RocksDB now requires C++17 support + +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) {} + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) {} + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) {} + + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(const std::string_view& sv) : data_(sv.data()), size_(sv.size()) {} + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); } + + // Create a single slice from SliceParts using buf as storage. + // buf must exist as long as the returned Slice exists. + Slice(const struct SliceParts& parts, std::string* buf); + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { + data_ = ""; + size_ = 0; + } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + // when hex is true, returns a string of twice the length hex encoded (0-9A-F) + std::string ToString(bool hex = false) const; + + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } + + // Decodes the current slice interpreted as an hexadecimal string into result, + // if successful returns true, if this isn't a valid hex string + // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. + // This slice is expected to have an even number of 0-9A-F characters + // also accepts lowercase (a-f) + bool DecodeHex(std::string* result) const; + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0)); + } + + bool ends_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); + } + + // Compare two slices and returns the first byte where they differ + size_t difference_offset(const Slice& b) const; + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +/** + * A Slice that can be pinned with some cleanup tasks, which will be run upon + * ::Reset() or object destruction, whichever is invoked first. This can be used + * to avoid memcpy by having the PinnableSlice object referring to the data + * that is locked in the memory and release them after the data is consumed. + */ +class PinnableSlice : public Slice, public Cleanable { + public: + PinnableSlice() { buf_ = &self_space_; } + explicit PinnableSlice(std::string* buf) { buf_ = buf; } + + PinnableSlice(PinnableSlice&& other); + PinnableSlice& operator=(PinnableSlice&& other); + + // No copy constructor and copy assignment allowed. + PinnableSlice(PinnableSlice&) = delete; + PinnableSlice& operator=(PinnableSlice&) = delete; + + inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, + void* arg2) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + RegisterCleanup(f, arg1, arg2); + assert(pinned_); + } + + inline void PinSlice(const Slice& s, Cleanable* cleanable) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + if (cleanable != nullptr) { + cleanable->DelegateCleanupsTo(this); + } + assert(pinned_); + } + + inline void PinSelf(const Slice& slice) { + assert(!pinned_); + buf_->assign(slice.data(), slice.size()); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + inline void PinSelf() { + assert(!pinned_); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + void remove_suffix(size_t n) { + assert(n <= size()); + if (pinned_) { + size_ -= n; + } else { + buf_->erase(size() - n, n); + PinSelf(); + } + } + + void remove_prefix(size_t n) { + assert(n <= size()); + if (pinned_) { + data_ += n; + size_ -= n; + } else { + buf_->erase(0, n); + PinSelf(); + } + } + + void Reset() { + Cleanable::Reset(); + pinned_ = false; + size_ = 0; + } + + inline std::string* GetSelf() { return buf_; } + + inline bool IsPinned() const { return pinned_; } + + private: + friend class PinnableSlice4Test; + std::string self_space_; + std::string* buf_; + bool pinned_ = false; +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* _parts, int _num_parts) + : parts(_parts), num_parts(_num_parts) {} + SliceParts() : parts(nullptr), num_parts(0) {} + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); } + +inline int Slice::compare(const Slice& b) const { + assert(data_ != nullptr && b.data_ != nullptr); + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) + r = -1; + else if (size_ > b.size_) + r = +1; + } + return r; +} + +inline size_t Slice::difference_offset(const Slice& b) const { + size_t off = 0; + const size_t len = (size_ < b.size_) ? size_ : b.size_; + for (; off < len; off++) { + if (data_[off] != b.data_[off]) break; + } + return off; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h new file mode 100644 index 000000000..8909b9c53 --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice_transform.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +struct ConfigOptions; + +// A SliceTransform is a generic pluggable way of transforming one string +// to another. Its primary use-case is in configuring RocksDB prefix Bloom +// filters, by setting prefix_extractor in ColumnFamilyOptions. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SliceTransform : public Customizable { + public: + virtual ~SliceTransform(){}; + + // Return the name of this transformation. + virtual const char* Name() const override = 0; + static const char* Type() { return "SliceTransform"; } + + // Creates and configures a new SliceTransform from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Returns a string representation of this SliceTransform, representing the ID + // and any additional properties. + std::string AsString() const; + + // Extract a prefix from a specified key, partial key, iterator upper bound, + // etc. This is normally used for building and checking prefix Bloom filters + // but should accept any string for which InDomain() returns true. + // See ColumnFamilyOptions::prefix_extractor for specific properties that + // must be satisfied by prefix extractors. + virtual Slice Transform(const Slice& key) const = 0; + + // Determine whether the specified key is compatible with the logic + // specified in the Transform method. Keys for which InDomain returns + // false will not be added to or queried against prefix Bloom filters. + // + // For example, if the Transform method returns a fixed length + // prefix of size 4, then an invocation to InDomain("abc") returns + // false because the specified key length(3) is shorter than the + // prefix size of 4. + // + // Wiki documentation here: + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek + // + virtual bool InDomain(const Slice& key) const = 0; + + // DEPRECATED: This is currently not used and remains here for backward + // compatibility. + virtual bool InRange(const Slice& /*dst*/) const { return false; } + + // Returns information on maximum prefix length, if there is one. + // If Transform(x).size() == n for some keys and otherwise < n, + // should return true and set *len = n. Returning false is safe but + // currently disables some auto_prefix_mode filtering. + // Specifically, if the iterate_upper_bound is the immediate successor (see + // Comparator::IsSameLengthImmediateSuccessor) of the seek key's prefix, + // we require this function return true and iterate_upper_bound.size() == n + // to recognize and optimize the prefix seek. + // Otherwise (including FullLengthEnabled returns false, or prefix length is + // less than maximum), Seek with auto_prefix_mode is only optimized if the + // iterate_upper_bound and seek key have the same prefix. + // BUG: Despite all these conditions and even with the extra condition on + // IsSameLengthImmediateSuccessor (see it's "BUG" section), it is not + // sufficient to ensure auto_prefix_mode returns all entries that + // total_order_seek would return. See auto_prefix_mode "BUG" section. + virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } + + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. + // + // This function is not used by RocksDB, but for users. If users pass + // Options by string to RocksDB, they might not know what prefix extractor + // they are using. This function is to help users can determine: + // if they want to iterate all keys prefixing `prefix`, whether it is + // safe to use prefix bloom filter and seek to key `prefix`. + // If this function returns true, this means a user can Seek() to a prefix + // using the bloom filter. Otherwise, user needs to skip the bloom filter + // by setting ReadOptions.total_order_seek = true. + // + // Here is an example: Suppose we implement a slice transform that returns + // the first part of the string up to and including first ",": + // 1. SameResultWhenAppended("abc,") should return true. If applying prefix + // bloom filter using it, all slices matching "abc,.*" will be extracted + // to "abc,", so any SST file or memtable containing any of those key + // will not be filtered out. + // 2. SameResultWhenAppended("abc") should return false. A user will not be + // guaranteed to see all the keys matching "abc.*" if a user prefix + // seeks to "abc" against a DB with the same setting. If one SST file + // only contains "abcd,e", the file can be filtered out and the key will + // be invisible, because the prefix according to the configured extractor + // is "abcd,". + // + // i.e., an implementation always returning false is safe. + virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const { + return false; + } +}; + +// The prefix is the first `prefix_len` bytes of the key, and keys shorter +// then `prefix_len` are not InDomain. +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +// The prefix is the first min(length(key),`cap_len`) bytes of the key, and +// all keys are InDomain. +extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); + +// Prefix is equal to key. All keys are InDomain. +extern const SliceTransform* NewNoopTransform(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h new file mode 100644 index 000000000..1ea56e71e --- /dev/null +++ b/src/rocksdb/include/rocksdb/snapshot.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +// +// To Create a Snapshot, call DB::GetSnapshot(). +// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot). +class Snapshot { + public: + virtual SequenceNumber GetSequenceNumber() const = 0; + + // Returns unix time i.e. the number of seconds since the Epoch, 1970-01-01 + // 00:00:00 (UTC). + virtual int64_t GetUnixTime() const = 0; + + virtual uint64_t GetTimestamp() const = 0; + + protected: + virtual ~Snapshot(); +}; + +// Simple RAII wrapper class for Snapshot. +// Constructing this object will create a snapshot. Destructing will +// release the snapshot. +class ManagedSnapshot { + public: + explicit ManagedSnapshot(DB* db); + + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + + ~ManagedSnapshot(); + + const Snapshot* snapshot(); + + private: + DB* db_; + const Snapshot* snapshot_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h new file mode 100644 index 000000000..9261ba47d --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE +#pragma once + +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +class SSTDumpTool { + public: + int Run(int argc, char const* const* argv, Options options = Options()); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h new file mode 100644 index 000000000..613292151 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_manager.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Logger; + +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +// SstFileManager is NOT an extensible interface but a public interface for +// result of NewSstFileManager. Any derived classes must be RocksDB internal. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature; maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + + // Set the amount of buffer room each compaction should be able to leave. + // In other words, at its maximum disk space consumption, the compaction + // should still leave compaction_buffer_size available on the disk so that + // other background functions may continue, such as logging and flushing. + virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; + + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. + // + // thread-safe. + virtual bool IsMaxAllowedSpaceReached() = 0; + + // Returns true if the total size of SST and blob files as well as estimated + // size of ongoing compactions exceeds the maximums allowed space usage. + virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; + + // Return the total size of all tracked files. + // thread-safe + virtual uint64_t GetTotalSize() = 0; + + // Return a map containing all tracked files and their corresponding sizes. + // thread-safe + virtual std::unordered_map GetTrackedFiles() = 0; + + // Return delete rate limit in bytes per second. + // thread-safe + virtual int64_t GetDeleteRateBytesPerSecond() = 0; + + // Update the delete rate limit in bytes per second. + // zero means disable delete rate limiting and delete files immediately + // thread-safe + virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0; + + // Return trash/DB size ratio where new files will be deleted immediately + // thread-safe + virtual double GetMaxTrashDBRatio() = 0; + + // Update trash/DB size ratio where new files will be deleted immediately + // thread-safe + virtual void SetMaxTrashDBRatio(double ratio) = 0; + + // Return the total size of trash files + // thread-safe + virtual uint64_t GetTotalTrashSize() = 0; + + // Set the statistics ptr to dump the stat information + virtual void SetStatisticsPtr(const std::shared_ptr& stats) = 0; +}; + +// Create a new SstFileManager that can be shared among multiple RocksDB +// instances to track SST and blob files and control there deletion rate. +// Even though SstFileManager don't track WAL files but it still control +// there deletion rate. +// +// @param env: Pointer to Env object, please see "rocksdb/env.h". +// @param fs: Pointer to FileSystem object (rocksdb/file_system.h" +// @param info_log: If not nullptr, info_log will be used to log errors. +// +// == Deletion rate limiting specific arguments == +// @param trash_dir: Deprecated, this argument have no effect +// @param rate_bytes_per_sec: How many bytes should be deleted per second, If +// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb +// in 1 second, we will wait for another 3 seconds before we delete other +// files, Set to 0 to disable deletion rate limiting. +// This option also affect the delete rate of WAL files in the DB. +// @param delete_existing_trash: Deprecated, this argument have no effect, but +// if user provide trash_dir we will schedule deletes for files in the dir +// @param status: If not nullptr, status will contain any errors that happened +// during creating the missing trash_dir or deleting existing files in trash. +// @param max_trash_db_ratio: If the trash size constitutes for more than this +// fraction of the total DB size we will start deleting new files passed to +// DeleteScheduler immediately +// @param bytes_max_delete_chunk: if a file to delete is larger than delete +// chunk, ftruncate the file by this size each time, rather than dropping the +// whole file. 0 means to always delete the whole file. If the file has more +// than one linked names, the file will be deleted as a whole. Either way, +// `rate_bytes_per_sec` will be appreciated. NOTE that with this option, +// files already renamed as a trash may be partial, so users should not +// directly recover them without checking. +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr fs, + std::shared_ptr info_log = nullptr, + const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +// Same as above, but takes a pointer to a legacy Env object, instead of +// Env and FileSystem objects +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr info_log = nullptr, + std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h new file mode 100644 index 000000000..4b8642480 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_reader.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// SstFileReader is used to read sst files that are generated by DB or +// SstFileWriter. +class SstFileReader { + public: + SstFileReader(const Options& options); + + ~SstFileReader(); + + // Prepares to read from the file located at "file_path". + Status Open(const std::string& file_path); + + // Returns a new iterator over the table contents. + // Most read options provide the same control as we read from DB. + // If "snapshot" is nullptr, the iterator returns only the latest keys. + Iterator* NewIterator(const ReadOptions& options); + + std::shared_ptr GetTableProperties() const; + + // Verifies whether there is corruption in this table. + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + + private: + struct Rep; + std::unique_ptr rep_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h new file mode 100644 index 000000000..c85f097a5 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_writer.h @@ -0,0 +1,174 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) +#elif _WIN32 +#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) +#endif + +namespace ROCKSDB_NAMESPACE { + +class Comparator; + +// ExternalSstFileInfo include information about sst files created +// using SstFileWriter. +struct ExternalSstFileInfo { + ExternalSstFileInfo() + : file_path(""), + smallest_key(""), + largest_key(""), + smallest_range_del_key(""), + largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), + sequence_number(0), + file_size(0), + num_entries(0), + num_range_del_entries(0), + version(0) {} + + ExternalSstFileInfo(const std::string& _file_path, + const std::string& _smallest_key, + const std::string& _largest_key, + SequenceNumber _sequence_number, uint64_t _file_size, + int32_t _num_entries, int32_t _version) + : file_path(_file_path), + smallest_key(_smallest_key), + largest_key(_largest_key), + smallest_range_del_key(""), + largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), + sequence_number(_sequence_number), + file_size(_file_size), + num_entries(_num_entries), + num_range_del_entries(0), + version(_version) {} + + std::string file_path; // external sst file path + std::string smallest_key; // smallest user key in file + std::string largest_key; // largest user key in file + std::string + smallest_range_del_key; // smallest range deletion user key in file + std::string largest_range_del_key; // largest range deletion user key in file + std::string file_checksum; // sst file checksum; + std::string file_checksum_func_name; // The name of file checksum function + SequenceNumber sequence_number; // sequence number of all keys in file + uint64_t file_size; // file size in bytes + uint64_t num_entries; // number of entries in file + uint64_t num_range_del_entries; // number of range deletion entries in file + int32_t version; // file version +}; + +// SstFileWriter is used to create sst files that can be added to database later +// All keys in files generated by SstFileWriter will have sequence number = 0. +class SstFileWriter { + public: + // User can pass `column_family` to specify that the generated file will + // be ingested into this column_family, note that passing nullptr means that + // the column_family is unknown. + // If invalidate_page_cache is set to true, SstFileWriter will give the OS a + // hint that this file pages is not needed every time we write 1MB to the + // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be + // passed. + // The `skip_filters` option is DEPRECATED and could be removed in the + // future. Use `BlockBasedTableOptions::filter_policy` to control filter + // generation. + SstFileWriter(const EnvOptions& env_options, const Options& options, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false) + : SstFileWriter(env_options, options, options.comparator, column_family, + invalidate_page_cache, io_priority, skip_filters) {} + + // Deprecated API + SstFileWriter(const EnvOptions& env_options, const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false); + + ~SstFileWriter(); + + // Prepare SstFileWriter to write into file located at "file_path". + Status Open(const std::string& file_path); + + // Add a Put key with value to currently opened file (deprecated) + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); + + // Add a Put key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Put(const Slice& user_key, const Slice& value); + + // Add a Put (key with timestamp, value) to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + + // Add a Merge key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Merge(const Slice& user_key, const Slice& value); + + // Add a deletion key to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Delete(const Slice& user_key); + + // Add a deletion key with timestamp to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Delete(const Slice& user_key, const Slice& timestamp); + + // Add a range deletion tombstone to currently opened file + // REQUIRES: comparator is *not* timestamp-aware. + Status DeleteRange(const Slice& begin_key, const Slice& end_key); + + // Add a range deletion tombstone to currently opened file. + // REQUIRES: begin_key and end_key are user keys without timestamp. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status DeleteRange(const Slice& begin_key, const Slice& end_key, + const Slice& timestamp); + + // Finalize writing to sst file and close file. + // + // An optional ExternalSstFileInfo pointer can be passed to the function + // which will be populated with information about the created sst file. + Status Finish(ExternalSstFileInfo* file_info = nullptr); + + // Return the current file size. + uint64_t FileSize(); + + private: + void InvalidatePageCache(bool closing); + struct Rep; + std::unique_ptr rep_; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_partitioner.h b/src/rocksdb/include/rocksdb/sst_partitioner.h new file mode 100644 index 000000000..3af8e9492 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_partitioner.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +enum PartitionerResult : char { + // Partitioner does not require to create new file + kNotRequired = 0x0, + // Partitioner is requesting forcefully to create new file + kRequired = 0x1 + // Additional constants can be added +}; + +struct PartitionerRequest { + PartitionerRequest(const Slice& prev_user_key_, + const Slice& current_user_key_, + uint64_t current_output_file_size_) + : prev_user_key(&prev_user_key_), + current_user_key(¤t_user_key_), + current_output_file_size(current_output_file_size_) {} + const Slice* prev_user_key; + const Slice* current_user_key; + uint64_t current_output_file_size; +}; + +/* + * A SstPartitioner is a generic pluggable way of defining the partition + * of SST files. Compaction job will split the SST files on partition boundary + * to lower the write amplification during SST file promote to higher level. + */ +class SstPartitioner { + public: + virtual ~SstPartitioner() {} + + // Return the name of this partitioner. + virtual const char* Name() const = 0; + + // It is called for all keys in compaction. When partitioner want to create + // new SST file it needs to return true. It means compaction job will finish + // current SST file where last key is "prev_user_key" parameter and start new + // SST file where first key is "current_user_key". Returns decision if + // partition boundary was detected and compaction should create new file. + virtual PartitionerResult ShouldPartition( + const PartitionerRequest& request) = 0; + + // Called with smallest and largest keys in SST file when compaction try to do + // trivial move. Returns true is partitioner allows to do trivial move. + virtual bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) = 0; + + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + // Output level for this compaction + int output_level; + // Smallest key for compaction + Slice smallest_user_key; + // Largest key for compaction + Slice largest_user_key; + }; +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SstPartitionerFactory : public Customizable { + public: + ~SstPartitionerFactory() override {} + static const char* Type() { return "SstPartitionerFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + virtual std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& context) const = 0; + + // Returns a name that identifies this partitioner factory. + const char* Name() const override = 0; +}; + +/* + * Fixed key prefix partitioner. It splits the output SST files when prefix + * defined by size changes. + */ +class SstPartitionerFixedPrefix : public SstPartitioner { + public: + explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {} + + virtual ~SstPartitionerFixedPrefix() override {} + + const char* Name() const override { return "SstPartitionerFixedPrefix"; } + + PartitionerResult ShouldPartition(const PartitionerRequest& request) override; + + bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) override; + + private: + size_t len_; +}; + +/* + * Factory for fixed prefix partitioner. + */ +class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory { + public: + explicit SstPartitionerFixedPrefixFactory(size_t len); + + ~SstPartitionerFixedPrefixFactory() override {} + + static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override; + + private: + size_t len_; +}; + +extern std::shared_ptr +NewSstPartitionerFixedPrefixFactory(size_t prefix_len); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h new file mode 100644 index 000000000..42a938f30 --- /dev/null +++ b/src/rocksdb/include/rocksdb/statistics.h @@ -0,0 +1,707 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +/** + * Keep adding tickers here. + * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking + * over its old value. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. + * 3. Add a corresponding enum value to TickerType.java in the java API + * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType + * and toCppTickers + */ +enum Tickers : uint32_t { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS = 0, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of index blocks added to block cache. + BLOCK_CACHE_INDEX_ADD, + // # of bytes of index blocks inserted into cache + BLOCK_CACHE_INDEX_BYTES_INSERT, + // # of bytes of index block erased from cache + BLOCK_CACHE_INDEX_BYTES_EVICT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of filter blocks added to block cache. + BLOCK_CACHE_FILTER_ADD, + // # of bytes of bloom filter blocks inserted into cache + BLOCK_CACHE_FILTER_BYTES_INSERT, + // # of bytes of bloom filter block erased from cache + BLOCK_CACHE_FILTER_BYTES_EVICT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of data blocks added to block cache. + BLOCK_CACHE_DATA_ADD, + // # of bytes of data blocks inserted into cache + BLOCK_CACHE_DATA_BYTES_INSERT, + // # of bytes read from cache. + BLOCK_CACHE_BYTES_READ, + // # of bytes written into cache. + BLOCK_CACHE_BYTES_WRITE, + + // # of times bloom filter has avoided file reads, i.e., negatives. + BLOOM_FILTER_USEFUL, + // # of times bloom FullFilter has not avoided the reads. + BLOOM_FILTER_FULL_POSITIVE, + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + BLOOM_FILTER_FULL_TRUE_POSITIVE, + + BLOOM_FILTER_MICROS, + + // # persistent cache hit + PERSISTENT_CACHE_HIT, + // # persistent cache miss + PERSISTENT_CACHE_MISS, + + // # total simulation block cache hits + SIM_BLOCK_CACHE_HIT, + // # total simulation block cache misses + SIM_BLOCK_CACHE_MISS, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. + MEMTABLE_MISS, + + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 4 reasons currently. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + // Also includes keys dropped for range del. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. + // Deletions obsoleted before bottom level due to file gap optimization. + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + // If a compaction was canceled in sfm to prevent ENOSPC + COMPACTION_CANCELLED, + + // Number of keys written to the database via the Put and Write call's + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // The number of uncompressed bytes issued by DB::Put(), DB::Delete(), + // DB::Merge(), and DB::Write(). + BYTES_WRITTEN, + // The number of uncompressed bytes read from DB::Get(). It could be + // either from memtables, cache, or table files. + // For the number of logical bytes read from DB::MultiGet(), + // please use NUMBER_MULTIGET_BYTES_READ. + BYTES_READ, + // The number of calls to seek/next/prev + NUMBER_DB_SEEK, + NUMBER_DB_NEXT, + NUMBER_DB_PREV, + // The number of calls to seek/next/prev that returned data + NUMBER_DB_SEEK_FOUND, + NUMBER_DB_NEXT_FOUND, + NUMBER_DB_PREV_FOUND, + // The number of uncompressed bytes read from an iterator. + // Includes size of key and value. + ITER_BYTES_READ, + NO_FILE_CLOSES, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // DEPRECATED Time system had to wait to do LO-L1 compactions + STALL_L0_SLOWDOWN_MICROS, + // DEPRECATED Time system had to wait to move memtable to L1. + STALL_MEMTABLE_COMPACTION_MICROS, + // DEPRECATED write throttle because of too many files in L0 + STALL_L0_NUM_FILES_MICROS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, + // The wait time for db mutex. + // Disabled by default. To enable it set stats level to kAll + DB_MUTEX_WAIT_MICROS, + RATE_LIMIT_DELAY_MILLIS, + // DEPRECATED number of iterators currently open + NO_ITERATORS, + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + // Number of deletes records that were not required to be + // written to storage because key does not exist + NUMBER_FILTERED_DELETES, + NUMBER_MERGE_FAILURES, + + // number of times bloom was checked before creating iterator on a + // file, and the number of times the check was useful in avoiding + // iterator creation (and thus likely IOPs). + BLOOM_FILTER_PREFIX_CHECKED, + BLOOM_FILTER_PREFIX_USEFUL, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpdatesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + // Number of blocks added to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD, + // Number of failures when adding blocks to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. + WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, // Equivalent to writes done for others + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + FLUSH_WRITE_BYTES, // Bytes written during flush + + // Compaction read and write statistics broken down by CompactionReason + COMPACT_READ_BYTES_MARKED, + COMPACT_READ_BYTES_PERIODIC, + COMPACT_READ_BYTES_TTL, + COMPACT_WRITE_BYTES_MARKED, + COMPACT_WRITE_BYTES_PERIODIC, + COMPACT_WRITE_BYTES_TTL, + + // Number of table's properties loaded directly from file, without creating + // table reader object. + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + NUMBER_SUPERVERSION_ACQUIRES, + NUMBER_SUPERVERSION_RELEASES, + NUMBER_SUPERVERSION_CLEANUPS, + + // # of compressions/decompressions executed + NUMBER_BLOCK_COMPRESSED, + NUMBER_BLOCK_DECOMPRESSED, + + NUMBER_BLOCK_NOT_COMPRESSED, + MERGE_OPERATION_TOTAL_TIME, + FILTER_OPERATION_TOTAL_TIME, + + // Row cache. + ROW_CACHE_HIT, + ROW_CACHE_MISS, + + // Read amplification statistics. + // Read amplification can be calculated using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled + READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used. + READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks. + + // Number of refill intervals where rate limiter's bytes are fully consumed. + NUMBER_RATE_LIMITER_DRAINS, + + // Number of internal keys skipped by Iterator + NUMBER_ITER_SKIP, + + // BlobDB specific stats + // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_PUT, + // # of Write to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_WRITE, + // # of Get to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_GET, + // # of MultiGet to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_MULTIGET, + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only + // applicable to legacy BlobDB. + BLOB_DB_NUM_SEEK, + // # of Next to BlobDB iterator. Only applicable to legacy BlobDB. + BLOB_DB_NUM_NEXT, + // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB. + BLOB_DB_NUM_PREV, + // # of keys written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_KEYS_WRITTEN, + // # of keys read from BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_NUM_KEYS_READ, + // # of bytes (key + value) written to BlobDB. Only applicable to legacy + // BlobDB. + BLOB_DB_BYTES_WRITTEN, + // # of bytes (keys + value) read from BlobDB. Only applicable to legacy + // BlobDB. + BLOB_DB_BYTES_READ, + // # of keys written by BlobDB as non-TTL inlined value. Only applicable to + // legacy BlobDB. + BLOB_DB_WRITE_INLINED, + // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_INLINED_TTL, + // # of keys written by BlobDB as non-TTL blob value. Only applicable to + // legacy BlobDB. + BLOB_DB_WRITE_BLOB, + // # of keys written by BlobDB as TTL blob value. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_BLOB_TTL, + // # of bytes written to blob file. + BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + // # of bytes read from blob file. + BLOB_DB_BLOB_FILE_BYTES_READ, + // # of times a blob files being synced. + BLOB_DB_BLOB_FILE_SYNCED, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of expiration. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of expiration. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of corresponding file deleted. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of corresponding file deleted. Only applicable to legacy BlobDB. + BLOB_DB_BLOB_INDEX_EVICTED_SIZE, + // # of blob files that were obsoleted by garbage collection. Only applicable + // to legacy BlobDB. + BLOB_DB_GC_NUM_FILES, + // # of blob files generated by garbage collection. Only applicable to legacy + // BlobDB. + BLOB_DB_GC_NUM_NEW_FILES, + // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB. + BLOB_DB_GC_FAILURES, + // # of keys dropped by BlobDB garbage collection because they had been + // overwritten. DEPRECATED. + BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, + // # of keys dropped by BlobDB garbage collection because of expiration. + // DEPRECATED. + BLOB_DB_GC_NUM_KEYS_EXPIRED, + // # of keys relocated to new blob file by garbage collection. + BLOB_DB_GC_NUM_KEYS_RELOCATED, + // # of bytes dropped by BlobDB garbage collection because they had been + // overwritten. DEPRECATED. + BLOB_DB_GC_BYTES_OVERWRITTEN, + // # of bytes dropped by BlobDB garbage collection because of expiration. + // DEPRECATED. + BLOB_DB_GC_BYTES_EXPIRED, + // # of bytes relocated to new blob file by garbage collection. + BLOB_DB_GC_BYTES_RELOCATED, + // # of blob files evicted because of BlobDB is full. Only applicable to + // legacy BlobDB. + BLOB_DB_FIFO_NUM_FILES_EVICTED, + // # of keys in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. + BLOB_DB_FIFO_NUM_KEYS_EVICTED, + // # of bytes in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. + BLOB_DB_FIFO_BYTES_EVICTED, + + // These counters indicate a performance issue in WritePrepared transactions. + // We should not seem them ticking them much. + // # of times prepare_mutex_ is acquired in the fast path. + TXN_PREPARE_MUTEX_OVERHEAD, + // # of times old_commit_map_mutex_ is acquired in the fast path. + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + // # of times we checked a batch for duplicate keys. + TXN_DUPLICATE_KEY_OVERHEAD, + // # of times snapshot_mutex_ is acquired in the fast path. + TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, + + // Number of keys actually found in MultiGet calls (vs number requested by + // caller) + // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller + NUMBER_MULTIGET_KEYS_FOUND, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted + + BLOCK_CACHE_COMPRESSION_DICT_MISS, + BLOCK_CACHE_COMPRESSION_DICT_HIT, + BLOCK_CACHE_COMPRESSION_DICT_ADD, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + + // # of blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD + BLOCK_CACHE_ADD_REDUNDANT, + // # of index blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD + BLOCK_CACHE_INDEX_ADD_REDUNDANT, + // # of filter blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD + BLOCK_CACHE_FILTER_ADD_REDUNDANT, + // # of data blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD + BLOCK_CACHE_DATA_ADD_REDUNDANT, + // # of dict blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT + // <= BLOCK_CACHE_COMPRESSION_DICT_ADD + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + + // # of files marked as trash by sst file manager and will be deleted + // later by background thread. + FILES_MARKED_TRASH, + // # of files deleted immediately by sst file manger through delete scheduler. + FILES_DELETED_IMMEDIATELY, + + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + + // Secondary cache statistics + SECONDARY_CACHE_HITS, + + // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + VERIFY_CHECKSUM_READ_BYTES, + + // Bytes read/written while creating backups + BACKUP_READ_BYTES, + BACKUP_WRITE_BYTES, + + // Remote compaction read/write statistics + REMOTE_COMPACT_READ_BYTES, + REMOTE_COMPACT_WRITE_BYTES, + + // Tiered storage related statistics + HOT_FILE_READ_BYTES, + WARM_FILE_READ_BYTES, + COLD_FILE_READ_BYTES, + HOT_FILE_READ_COUNT, + WARM_FILE_READ_COUNT, + COLD_FILE_READ_COUNT, + + // Last level and non-last level read statistics + LAST_LEVEL_READ_BYTES, + LAST_LEVEL_READ_COUNT, + NON_LAST_LEVEL_READ_BYTES, + NON_LAST_LEVEL_READ_COUNT, + + BLOCK_CHECKSUM_COMPUTE_COUNT, + MULTIGET_COROUTINE_COUNT, + + // Integrated BlobDB specific stats + // # of times cache miss when accessing blob from blob cache. + BLOB_DB_CACHE_MISS, + // # of times cache hit when accessing blob from blob cache. + BLOB_DB_CACHE_HIT, + // # of data blocks added to blob cache. + BLOB_DB_CACHE_ADD, + // # of failures when adding blobs to blob cache. + BLOB_DB_CACHE_ADD_FAILURES, + // # of bytes read from blob cache. + BLOB_DB_CACHE_BYTES_READ, + // # of bytes written into blob cache. + BLOB_DB_CACHE_BYTES_WRITE, + + // Time spent in the ReadAsync file system call + READ_ASYNC_MICROS, + // Number of errors returned to the async read callback + ASYNC_READ_ERROR_COUNT, + + TICKER_ENUM_MAX +}; + +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +extern const std::vector> TickersNameMap; + +/** + * Keep adding histogram's here. + * Any histogram should have value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + * Add a corresponding enum value to HistogramType.java in the java API + */ +enum Histograms : uint32_t { + DB_GET = 0, + DB_WRITE, + COMPACTION_TIME, + COMPACTION_CPU_TIME, + SUBCOMPACTION_SETUP_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + DB_SEEK, + WRITE_STALL, + SST_READ_MICROS, + // The number of subcompactions actually scheduled during a compaction + NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + + // number of bytes compressed/decompressed + // number of bytes is when uncompressed; i.e. before/after respectively + BYTES_COMPRESSED, + BYTES_DECOMPRESSED, + COMPRESSION_TIMES_NANOS, + DECOMPRESSION_TIMES_NANOS, + // Number of merge operands passed to the merge operator in user read + // requests. + READ_NUM_MERGE_OPERANDS, + + // BlobDB specific stats + // Size of keys written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_KEY_SIZE, + // Size of values written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_VALUE_SIZE, + // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_MICROS, + // BlobDB Get latency. Only applicable to legacy BlobDB. + BLOB_DB_GET_MICROS, + // BlobDB MultiGet latency. Only applicable to legacy BlobDB. + BLOB_DB_MULTIGET_MICROS, + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to + // legacy BlobDB. + BLOB_DB_SEEK_MICROS, + // BlobDB Next latency. Only applicable to legacy BlobDB. + BLOB_DB_NEXT_MICROS, + // BlobDB Prev latency. Only applicable to legacy BlobDB. + BLOB_DB_PREV_MICROS, + // Blob file write latency. + BLOB_DB_BLOB_FILE_WRITE_MICROS, + // Blob file read latency. + BLOB_DB_BLOB_FILE_READ_MICROS, + // Blob file sync latency. + BLOB_DB_BLOB_FILE_SYNC_MICROS, + // BlobDB garbage collection time. DEPRECATED. + BLOB_DB_GC_MICROS, + // BlobDB compression time. + BLOB_DB_COMPRESSION_MICROS, + // BlobDB decompression time. + BLOB_DB_DECOMPRESSION_MICROS, + // Time spent flushing memtable to disk + FLUSH_TIME, + SST_BATCH_SIZE, + + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of data blocks read from file system per level. + // Obsolete + NUM_DATA_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + + // Stats related to asynchronous read requests. + ASYNC_READ_BYTES, + POLL_WAIT_MICROS, + + // Number of prefetched bytes discarded by RocksDB. + PREFETCHED_BYTES_DISCARDED, + + // Number of IOs issued in parallel in a MultiGet batch + MULTIGET_IO_BATCH_SIZE, + + // Number of levels requiring IO for MultiGet + NUM_LEVEL_READ_PER_MULTIGET, + + // Wait time for aborting async read in FilePrefetchBuffer destructor + ASYNC_PREFETCH_ABORT_MICROS, + + HISTOGRAM_ENUM_MAX, +}; + +extern const std::vector> HistogramsNameMap; + +struct HistogramData { + double median; + double percentile95; + double percentile99; + double average; + double standard_deviation; + // zero-initialize new members since old Statistics::histogramData() + // implementations won't write them. + double max = 0.0; + uint64_t count = 0; + uint64_t sum = 0; + double min = 0.0; +}; + +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); +enum StatsLevel : uint8_t { + // Disable all metrics + kDisableAll, + // Disable tickers + kExceptTickers = kDisableAll, + // Disable timer stats, and skip histogram stats + kExceptHistogramOrTimers, + // Skip timer stats + kExceptTimers, + // Collect all stats except time inside mutex lock AND time spent on + // compression. + kExceptDetailedTimers, + // Collect all stats except the counters requiring to get time inside the + // mutex lock. + kExceptTimeForMutex, + // Collect all stats, including measuring duration of mutex operations. + // If getting time is expensive on the platform to run, it can + // reduce scalability to more threads, especially for writes. + kAll, +}; + +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Statistics : public Customizable { + public: + ~Statistics() override {} + static const char* Type() { return "Statistics"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + // Default name of empty, for backwards compatibility. Derived classes should + // override this method. + // This default implementation will likely be removed in a future release + const char* Name() const override { return ""; } + virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; + virtual void histogramData(uint32_t type, + HistogramData* const data) const = 0; + virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; } + virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0; + virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0; + virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0; + virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) { + if (get_stats_level() <= StatsLevel::kExceptTimers) { + return; + } + recordInHistogram(histogramType, time); + } + // The function is here only for backward compatibility reason. + // Users implementing their own Statistics class should override + // recordInHistogram() instead and leave measureTime() as it is. + virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) { + // This is not supposed to be called. + assert(false); + } + virtual void recordInHistogram(uint32_t histogramType, uint64_t time) { + // measureTime() is the old and inaccurate function name. + // To keep backward compatible. If users implement their own + // statistics, which overrides measureTime() but doesn't override + // this function. We forward to measureTime(). + measureTime(histogramType, time); + } + + // Resets all ticker and histogram stats + virtual Status Reset() { return Status::NotSupported("Not implemented"); } + +#ifndef ROCKSDB_LITE + using Customizable::ToString; +#endif // ROCKSDB_LITE + // String representation of the statistic object. Must be thread-safe. + virtual std::string ToString() const { + // Do nothing by default + return std::string("ToString(): not implemented"); + } + + virtual bool getTickerMap(std::map*) const { + // Do nothing by default + return false; + } + + // Override this function to disable particular histogram collection + virtual bool HistEnabledForType(uint32_t type) const { + return type < HISTOGRAM_ENUM_MAX; + } + void set_stats_level(StatsLevel sl) { + stats_level_.store(sl, std::memory_order_relaxed); + } + StatsLevel get_stats_level() const { + return stats_level_.load(std::memory_order_relaxed); + } + + private: + std::atomic stats_level_{kExceptDetailedTimers}; +}; + +// Create a concrete DBStatistics object +std::shared_ptr CreateDBStatistics(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h new file mode 100644 index 000000000..57e469295 --- /dev/null +++ b/src/rocksdb/include/rocksdb/stats_history.h @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; + +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. +// Example: +// std::unique_ptr stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } +class StatsHistoryIterator { + public: + StatsHistoryIterator() {} + virtual ~StatsHistoryIterator() {} + + virtual bool Valid() const = 0; + + // Moves to the next stats history record. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Return the time stamp (in seconds) when stats history is recorded. + // REQUIRES: Valid() + virtual uint64_t GetStatsTime() const = 0; + + // DEPRECATED (was never used) + virtual int GetFormatVersion() const { return -1; } + + // Return the current stats history as an std::map which specifies the + // mapping from stats name to stats value . The underlying storage + // for the returned map is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual const std::map& GetStatsMap() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h new file mode 100644 index 000000000..1ab3dc4cb --- /dev/null +++ b/src/rocksdb/include/rocksdb/status.h @@ -0,0 +1,570 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#pragma once + +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include +#include +#endif + +#include +#include + +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include "port/stack_trace.h" +#endif + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Status { + public: + // Create a success status. + Status() + : code_(kOk), + subcode_(kNone), + sev_(kNoError), + retryable_(false), + data_loss_(false), + scope_(0), + state_(nullptr) {} + ~Status() { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!checked_) { + fprintf(stderr, "Failed to check Status %p\n", this); + port::PrintStack(); + std::abort(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + + // Copy the specified status. + Status(const Status& s); + Status& operator=(const Status& s); + Status(Status&& s) noexcept; + Status& operator=(Status&& s) noexcept; + bool operator==(const Status& rhs) const; + bool operator!=(const Status& rhs) const; + + // In case of intentionally swallowing an error, user must explicitly call + // this function. That way we are easily able to search the code to find where + // error swallowing occurs. + inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + + enum Code : unsigned char { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7, + kShutdownInProgress = 8, + kTimedOut = 9, + kAborted = 10, + kBusy = 11, + kExpired = 12, + kTryAgain = 13, + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode + }; + + Code code() const { + MarkChecked(); + return code_; + } + + enum SubCode : unsigned char { + kNone = 0, + kMutexTimeout = 1, + kLockTimeout = 2, + kLockLimit = 3, + kNoSpace = 4, + kDeadlock = 5, + kStaleFile = 6, + kMemoryLimit = 7, + kSpaceLimit = 8, + kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, + kOverwritten = 12, + kTxnNotPrepared = 13, + kIOFenced = 14, + kMaxSubCode + }; + + SubCode subcode() const { + MarkChecked(); + return subcode_; + } + + enum Severity : unsigned char { + kNoError = 0, + kSoftError = 1, + kHardError = 2, + kFatalError = 3, + kUnrecoverableError = 4, + kMaxSeverity + }; + + Status(const Status& s, Severity sev); + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + + Severity severity() const { + MarkChecked(); + return sev_; + } + + // Returns a C style string indicating the message of the Status + const char* getState() const { + MarkChecked(); + return state_.get(); + } + + // Return a success status. + static Status OK() { return Status(); } + + // Successful, though an existing something was overwritten + // Note: using variants of OK status for program logic is discouraged, + // but it can be useful for communicating statistical information without + // changing public APIs. + static Status OkOverwritten() { return Status(kOk, kOverwritten); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + + // Fast path for not found without malloc; + static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + + static Status NotFound(SubCode sc, const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kNotFound, sc, msg, msg2); + } + + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status Corruption(SubCode msg = kNone) { + return Status(kCorruption, msg); + } + + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status NotSupported(SubCode msg = kNone) { + return Status(kNotSupported, msg); + } + + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status InvalidArgument(SubCode msg = kNone) { + return Status(kInvalidArgument, msg); + } + + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); } + + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status MergeInProgress(SubCode msg = kNone) { + return Status(kMergeInProgress, msg); + } + + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + static Status Incomplete(SubCode msg = kNone) { + return Status(kIncomplete, msg); + } + + static Status ShutdownInProgress(SubCode msg = kNone) { + return Status(kShutdownInProgress, msg); + } + static Status ShutdownInProgress(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kShutdownInProgress, msg, msg2); + } + static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } + + static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); } + static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kBusy, msg, msg2); + } + + static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } + + static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); } + static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kExpired, msg, msg2); + } + + static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); } + static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTryAgain, msg, msg2); + } + + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + + static Status NoSpace() { return Status(kIOError, kNoSpace); } + static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kNoSpace, msg, msg2); + } + + static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); } + static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, kMemoryLimit, msg, msg2); + } + + static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); } + static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kSpaceLimit, msg, msg2); + } + + static Status PathNotFound() { return Status(kIOError, kPathNotFound); } + static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kPathNotFound, msg, msg2); + } + + static Status TxnNotPrepared() { + return Status(kInvalidArgument, kTxnNotPrepared); + } + static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { + MarkChecked(); + return code() == kOk; + } + + // Returns true iff the status indicates success *with* something + // overwritten + bool IsOkOverwritten() const { + MarkChecked(); + return code() == kOk && subcode() == kOverwritten; + } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { + MarkChecked(); + return code() == kNotFound; + } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { + MarkChecked(); + return code() == kCorruption; + } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { + MarkChecked(); + return code() == kNotSupported; + } + + // Returns true iff the status indicates an InvalidArgument error. + bool IsInvalidArgument() const { + MarkChecked(); + return code() == kInvalidArgument; + } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { + MarkChecked(); + return code() == kIOError; + } + + // Returns true iff the status indicates an MergeInProgress. + bool IsMergeInProgress() const { + MarkChecked(); + return code() == kMergeInProgress; + } + + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { + MarkChecked(); + return code() == kIncomplete; + } + + // Returns true iff the status indicates Shutdown In progress + bool IsShutdownInProgress() const { + MarkChecked(); + return code() == kShutdownInProgress; + } + + bool IsTimedOut() const { + MarkChecked(); + return code() == kTimedOut; + } + + bool IsAborted() const { + MarkChecked(); + return code() == kAborted; + } + + bool IsLockLimit() const { + MarkChecked(); + return code() == kAborted && subcode() == kLockLimit; + } + + // Returns true iff the status indicates that a resource is Busy and + // temporarily could not be acquired. + bool IsBusy() const { + MarkChecked(); + return code() == kBusy; + } + + bool IsDeadlock() const { + MarkChecked(); + return code() == kBusy && subcode() == kDeadlock; + } + + // Returns true iff the status indicated that the operation has Expired. + bool IsExpired() const { + MarkChecked(); + return code() == kExpired; + } + + // Returns true iff the status indicates a TryAgain error. + // This usually means that the operation failed, but may succeed if + // re-attempted. + bool IsTryAgain() const { + MarkChecked(); + return code() == kTryAgain; + } + + // Returns true iff the status indicates the proposed compaction is too large + bool IsCompactionTooLarge() const { + MarkChecked(); + return code() == kCompactionTooLarge; + } + + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { + MarkChecked(); + return code() == kColumnFamilyDropped; + } + + // Returns true iff the status indicates a NoSpace error + // This is caused by an I/O error returning the specific "out of space" + // error condition. Stricto sensu, an NoSpace error is an I/O error + // with a specific subcode, enabling users to take the appropriate action + // if needed + bool IsNoSpace() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kNoSpace); + } + + // Returns true iff the status indicates a memory limit error. There may be + // cases where we limit the memory used in certain operations (eg. the size + // of a write batch) in order to avoid out of memory exceptions. + bool IsMemoryLimit() const { + MarkChecked(); + return (code() == kAborted) && (subcode() == kMemoryLimit); + } + + // Returns true iff the status indicates a PathNotFound error + // This is caused by an I/O error returning the specific "no such file or + // directory" error condition. A PathNotFound error is an I/O error with + // a specific subcode, enabling users to take appropriate action if necessary + bool IsPathNotFound() const { + MarkChecked(); + return (code() == kIOError || code() == kNotFound) && + (subcode() == kPathNotFound); + } + + // Returns true iff the status indicates manual compaction paused. This + // is caused by a call to PauseManualCompaction + bool IsManualCompactionPaused() const { + MarkChecked(); + return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); + } + + // Returns true iff the status indicates a TxnNotPrepared error. + bool IsTxnNotPrepared() const { + MarkChecked(); + return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); + } + + // Returns true iff the status indicates a IOFenced error. + bool IsIOFenced() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kIOFenced); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + protected: + Code code_; + SubCode subcode_; + Severity sev_; + bool retryable_; + bool data_loss_; + unsigned char scope_; + // A nullptr state_ (which is at least the case for OK) means the extra + // message is empty. + std::unique_ptr state_; +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + mutable bool checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + + explicit Status(Code _code, SubCode _subcode = kNone) + : code_(_code), + subcode_(_subcode), + sev_(kNoError), + retryable_(false), + data_loss_(false), + scope_(0) {} + + explicit Status(Code _code, SubCode _subcode, bool retryable, bool data_loss, + unsigned char scope) + : code_(_code), + subcode_(_subcode), + sev_(kNoError), + retryable_(retryable), + data_loss_(data_loss), + scope_(scope) {} + + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); + Status(Code _code, const Slice& msg, const Slice& msg2) + : Status(_code, kNone, msg, msg2) {} + + static std::unique_ptr CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } +}; + +inline Status::Status(const Status& s) + : code_(s.code_), + subcode_(s.subcode_), + sev_(s.sev_), + retryable_(s.retryable_), + data_loss_(s.data_loss_), + scope_(s.scope_) { + s.MarkChecked(); + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline Status::Status(const Status& s, Severity sev) + : code_(s.code_), + subcode_(s.subcode_), + sev_(sev), + retryable_(s.retryable_), + data_loss_(s.data_loss_), + scope_(s.scope_) { + s.MarkChecked(); + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline Status& Status::operator=(const Status& s) { + if (this != &s) { + s.MarkChecked(); + MustCheck(); + code_ = s.code_; + subcode_ = s.subcode_; + sev_ = s.sev_; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); + } + return *this; +} + +inline Status::Status(Status&& s) noexcept : Status() { + s.MarkChecked(); + *this = std::move(s); +} + +inline Status& Status::operator=(Status&& s) noexcept { + if (this != &s) { + s.MarkChecked(); + MustCheck(); + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + sev_ = std::move(s.sev_); + s.sev_ = kNoError; + retryable_ = std::move(s.retryable_); + s.retryable_ = false; + data_loss_ = std::move(s.data_loss_); + s.data_loss_ = false; + scope_ = std::move(s.scope_); + s.scope_ = 0; + state_ = std::move(s.state_); + } + return *this; +} + +inline bool Status::operator==(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); + return (code_ == rhs.code_); +} + +inline bool Status::operator!=(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); + return !(*this == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/system_clock.h b/src/rocksdb/include/rocksdb/system_clock.h new file mode 100644 index 000000000..486183d60 --- /dev/null +++ b/src/rocksdb/include/rocksdb/system_clock.h @@ -0,0 +1,116 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock : public Customizable { + public: + ~SystemClock() override {} + + static const char* Type() { return "SystemClock"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // The name of this system clock + virtual const char* Name() const override = 0; + + // The name/nickname for the Default SystemClock. This name can be used + // to determine if the clock is the default one. + static const char* kDefaultName() { return "DefaultClock"; } + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. + virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. + // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. +class SystemClockWrapper : public SystemClock { + public: + explicit SystemClockWrapper(const std::shared_ptr& t); + + uint64_t NowMicros() override { return target_->NowMicros(); } + + uint64_t NowNanos() override { return target_->NowNanos(); } + + uint64_t CPUMicros() override { return target_->CPUMicros(); } + + uint64_t CPUNanos() override { return target_->CPUNanos(); } + + virtual void SleepForMicroseconds(int micros) override { + return target_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + const Customizable* Inner() const override { return target_.get(); } + + protected: + std::shared_ptr target_; +}; + +} // end namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h new file mode 100644 index 000000000..3a2bf2629 --- /dev/null +++ b/src/rocksdb/include/rocksdb/table.h @@ -0,0 +1,940 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. +// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples + +#pragma once + +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// -- Block-based Table +class Cache; +class FilterPolicy; +class FlushBlockPolicyFactory; +class PersistentCache; +class RandomAccessFile; +struct TableReaderOptions; +struct TableBuilderOptions; +class TableBuilder; +class TableFactory; +class TableReader; +class WritableFileWriter; +struct ConfigOptions; +struct EnvOptions; + +// Types of checksums to use for checking integrity of logical blocks within +// files. All checksums currently use 32 bits of checking power (1 in 4B +// chance of failing to detect random corruption). +enum ChecksumType : char { + kNoChecksum = 0x0, + kCRC32c = 0x1, + kxxHash = 0x2, + kxxHash64 = 0x3, + kXXH3 = 0x4, // Supported since RocksDB 6.27 +}; + +// `PinningTier` is used to specify which tier of block-based tables should +// be affected by a block cache pinning setting (see +// `MetadataCacheOptions` below). +enum class PinningTier { + // For compatibility, this value specifies to fallback to the behavior + // indicated by the deprecated options, + // `pin_l0_filter_and_index_blocks_in_cache` and + // `pin_top_level_index_and_filter`. + kFallback, + + // This tier contains no block-based tables. + kNone, + + // This tier contains block-based tables that may have originated from a + // memtable flush. In particular, it includes tables from L0 that are smaller + // than 1.5 times the current `write_buffer_size`. Note these criteria imply + // it can include intra-L0 compaction outputs and ingested files, as long as + // they are not abnormally large compared to flushed files in L0. + kFlushedAndSimilar, + + // This tier contains all block-based tables. + kAll, +}; + +// `MetadataCacheOptions` contains members indicating the desired caching +// behavior for the different categories of metadata blocks. +struct MetadataCacheOptions { + // The tier of block-based tables whose top-level index into metadata + // partitions will be pinned. Currently indexes and filters may be + // partitioned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise any top-level index into metadata partitions would be + // held in table reader memory, outside the block cache. + PinningTier top_level_index_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose metadata partitions will be pinned. + // Currently indexes and filters may be partitioned. + PinningTier partition_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose unpartitioned metadata blocks will be + // pinned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise the unpartitioned meta-blocks would be held in table + // reader memory, outside the block cache. + PinningTier unpartitioned_pinning = PinningTier::kFallback; +}; + +struct CacheEntryRoleOptions { + enum class Decision { + kEnabled, + kDisabled, + kFallback, + }; + Decision charged = Decision::kFallback; + bool operator==(const CacheEntryRoleOptions& other) const { + return charged == other.charged; + } +}; + +struct CacheUsageOptions { + CacheEntryRoleOptions options; + std::map options_overrides; +}; + +// For advanced user only +struct BlockBasedTableOptions { + static const char* kName() { return "BlockTableOptions"; }; + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. + // + // TODO(ajkr) we need to update names of variables controlling meta-block + // caching as they should now apply to range tombstone and compression + // dictionary meta-blocks, in addition to index and filter meta-blocks. + // + // Whether to put index/filter blocks in the block cache. When false, + // each "table reader" object will pre-load index/filter blocks during + // table initialization. Index and filter partition blocks always use + // block cache regardless of this option. + bool cache_index_and_filter_blocks = false; + + // If cache_index_and_filter_blocks is enabled, cache index and filter + // blocks with high priority. If set to true, depending on implementation of + // block cache, index, filter, and other metadata blocks may be less likely + // to be evicted than data blocks. + bool cache_index_and_filter_blocks_with_high_priority = true; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // If cache_index_and_filter_blocks is true and the below is true, then + // the top-level index of partitioned filter and index blocks are stored in + // the cache, but a reference is held in the "table reader" object so the + // blocks are pinned and only evicted from cache when the table reader is + // freed. This is not limited to l0 in LSM tree. + bool pin_top_level_index_and_filter = true; + + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch = 0x00, + + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + kHashSearch = 0x01, + + // A two-level index implementation. Both levels are binary search indexes. + // Second level index blocks ("partitions") use block cache even when + // cache_index_and_filter_blocks=false. + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. + // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + kBinarySearchWithFirstKey = 0x03, + }; + + IndexType index_type = kBinarySearch; + + // The index type that will be used for the data block. + enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index + }; + + DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + + // #entries/#buckets. It is valid only when data_block_hash_index_type is + // kDataBlockBinaryAndHash. + double data_block_hash_table_util_ratio = 0.75; + + // Option hash_index_allow_collision is now deleted. + // It will behave as if hash_index_allow_collision=true. + + // Use the specified checksum type. Newly created table files will be + // protected with this checksum type. Old table files will still be readable, + // even though they have different checksum type. + ChecksumType checksum = kXXH3; + + // Disable block cache. If this is set to true, + // then no block cache should be used, and the block_cache should + // point to a nullptr object. + bool no_block_cache = false; + + // If non-NULL use the specified cache for blocks. + // If NULL, rocksdb will automatically create and use an 8MB internal cache. + std::shared_ptr block_cache = nullptr; + + // If non-NULL use the specified cache for pages read from device + // IF NULL, no page cache is used + std::shared_ptr persistent_cache = nullptr; + + // DEPRECATED: This feature is planned for removal in a future release. + // Use SecondaryCache instead. + // + // If non-NULL use the specified cache for compressed blocks. + // If NULL, rocksdb will not use a compressed block cache. + // Note: though it looks similar to `block_cache`, RocksDB doesn't put the + // same type of object there. + std::shared_ptr block_cache_compressed = nullptr; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + uint64_t block_size = 4 * 1024; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + int block_size_deviation = 10; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. The minimum value allowed is 1. Any smaller + // value will be silently overwritten with 1. + int block_restart_interval = 16; + + // Same as block_restart_interval but used for the index block. + int index_block_restart_interval = 1; + + // Block size for partitioned metadata. Currently applied to indexes when + // kTwoLevelIndexSearch is used and to filters when partition_filters is used. + // Note: Since in the current implementation the filters and index partitions + // are aligned, an index/filter block is created when either index or filter + // block size reaches the specified limit. + // Note: this limit is currently applied to only index blocks; a filter + // partition is cut right after an index block is cut + // TODO(myabandeh): remove the note above when filter partitions are cut + // separately + uint64_t metadata_block_size = 4096; + + // `cache_usage_options` allows users to specify the default + // options (`cache_usage_options.options`) and the overriding + // options (`cache_usage_options.options_overrides`) + // for different `CacheEntryRole` under various features related to cache + // usage. + // + // For a certain `CacheEntryRole role` and a certain feature `f` of + // `CacheEntryRoleOptions`: + // 1. If `options_overrides` has an entry for `role` and + // `options_overrides[role].f != kFallback`, we use + // `options_overrides[role].f` + // 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f` + // 3. Otherwise, we follow the compatible existing behavior for `f` (see + // each feature's comment for more) + // + // `cache_usage_options` currently supports specifying options for the + // following features: + // + // 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`) + // Memory charging is a feature of accounting memory usage of specific area + // (represented by `CacheEntryRole`) toward usage in block cache (if + // available), by updating a dynamical charge to the block cache loosely based + // on the actual memory usage of that area. + // + // (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer + // (i) If kEnabled: + // Charge memory usage of the buffered data used as training samples for + // dictionary compression. + // If such memory usage exceeds the avaible space left in the block cache + // at some point (i.e, causing a cache full under + // `LRUCacheOptions::strict_capacity_limit` = true), the data will then be + // unbuffered. + // (ii) If kDisabled: + // Does not charge the memory usage mentioned above. + // (iii) Compatible existing behavior: + // Same as kEnabled. + // + // (b) CacheEntryRole::kFilterConstruction + // (i) If kEnabled: + // Charge memory usage of Bloom Filter + // (format_version >= 5) and Ribbon Filter construction. + // If additional temporary memory of Ribbon Filter exceeds the avaible + // space left in the block cache at some point (i.e, causing a cache full + // under `LRUCacheOptions::strict_capacity_limit` = true), + // construction will fall back to Bloom Filter. + // (ii) If kDisabled: + // Does not charge the memory usage mentioned above. + // (iii) Compatible existing behavior: + // Same as kDisabled. + // + // (c) CacheEntryRole::kBlockBasedTableReader + // (i) If kEnabled: + // Charge memory usage of table properties + + // index block/filter block/uncompression dictionary (when stored in table + // reader i.e, BlockBasedTableOptions::cache_index_and_filter_blocks == + // false) + some internal data structures during table reader creation. + // If such a table reader exceeds + // the avaible space left in the block cache at some point (i.e, causing + // a cache full under `LRUCacheOptions::strict_capacity_limit` = true), + // creation will fail with Status::MemoryLimit(). + // (ii) If kDisabled: + // Does not charge the memory usage mentioned above. + // (iii) Compatible existing behavior: + // Same as kDisabled. + // + // (d) CacheEntryRole::kFileMetadata + // (i) If kEnabled: + // Charge memory usage of file metadata. RocksDB holds one file metadata + // structure in-memory per on-disk table file. + // If such file metadata's + // memory exceeds the avaible space left in the block cache at some point + // (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` = + // true), creation will fail with Status::MemoryLimit(). + // (ii) If kDisabled: + // Does not charge the memory usage mentioned above. + // (iii) Compatible existing behavior: + // Same as kDisabled. + // + // (e) Other CacheEntryRole + // Not supported. + // `Status::kNotSupported` will be returned if + // `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}. + // + // + // 2. More to come ... + // + CacheUsageOptions cache_usage_options; + + // Note: currently this option requires kTwoLevelIndexSearch to be set as + // well. + // TODO(myabandeh): remove the note above once the limitation is lifted + // Use partitioned full filters for each SST file. This option is + // incompatible with block-based filters. Filter partition blocks use + // block cache even when cache_index_and_filter_blocks=false. + bool partition_filters = false; + + // Option to generate Bloom/Ribbon filters that minimize memory + // internal fragmentation. + // + // When false, malloc_usable_size is not available, or format_version < 5, + // filters are generated without regard to internal fragmentation when + // loaded into memory (historical behavior). When true (and + // malloc_usable_size is available and format_version >= 5), then + // filters are generated to "round up" and "round down" their sizes to + // minimize internal fragmentation when loaded into memory, assuming the + // reading DB has the same memory allocation characteristics as the + // generating DB. This option does not break forward or backward + // compatibility. + // + // While individual filters will vary in bits/key and false positive rate + // when setting is true, the implementation attempts to maintain a weighted + // average FP rate for filters consistent with this option set to false. + // + // With Jemalloc for example, this setting is expected to save about 10% of + // the memory footprint and block cache charge of filters, while increasing + // disk usage of filters by about 1-2% due to encoding efficiency losses + // with variance in bits/key. + // + // NOTE: Because some memory counted by block cache might be unmapped pages + // within internal fragmentation, this option can increase observed RSS + // memory usage. With cache_index_and_filter_blocks=true, this option makes + // the block cache better at using space it is allowed. (These issues + // should not arise with partitioned filters.) + // + // NOTE: Do not set to true if you do not trust malloc_usable_size. With + // this option, RocksDB might access an allocated memory object beyond its + // original size if malloc_usable_size says it is safe to do so. While this + // can be considered bad practice, it should not produce undefined behavior + // unless malloc_usable_size is buggy or broken. + bool optimize_filters_for_memory = false; + + // Use delta encoding to compress keys in blocks. + // ReadOptions::pin_data requires this option to be disabled. + // + // Default: true + bool use_delta_encoding = true; + + // If non-nullptr, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + std::shared_ptr filter_policy = nullptr; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + bool whole_key_filtering = true; + + // If true, detect corruption during Bloom Filter (format_version >= 5) + // and Ribbon Filter construction. + // + // This is an extra check that is only + // useful in detecting software bugs or CPU+memory malfunction. + // Turning on this feature increases filter construction time by 30%. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{detect_filter_construct_corruption=true;}"}}); + // + // TODO: optimize this performance + bool detect_filter_construct_corruption = false; + + // Verify that decompressing the compressed block gives back the input. This + // is a verification mode that we use to detect bugs in compression + // algorithms. + bool verify_compression = false; + + // If used, For every data block we load into memory, we will create a bitmap + // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap + // will be used to figure out the percentage we actually read of the blocks. + // + // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and + // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the + // read amplification using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // value => memory usage (percentage of loaded blocks memory) + // 1 => 12.50 % + // 2 => 06.25 % + // 4 => 03.12 % + // 8 => 01.56 % + // 16 => 00.78 % + // + // Note: This number must be a power of 2, if not it will be sanitized + // to be the next lowest power of 2, for example a value of 7 will be + // treated as 4, a value of 19 will be treated as 16. + // + // Default: 0 (disabled) + uint32_t read_amp_bytes_per_bit = 0; + + // We currently have these versions: + // 0 -- This version can be read by really old RocksDB's. Doesn't support + // changing checksum type (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we + // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you + // don't plan to run RocksDB before version 3.10, you should probably use + // this. + // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we + // encode the keys in index blocks. If you don't plan to run RocksDB before + // version 5.15, you should probably use this. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned + // filters use a generally faster and more accurate Bloom filter + // implementation, with a different schema. + uint32_t format_version = 5; + + // Store index blocks on disk in compressed format. Changing this option to + // false will avoid the overhead of decompression if index blocks are evicted + // and read back + bool enable_index_compression = true; + + // Align data blocks on lesser of page size and block size + bool block_align = false; + + // This enum allows trading off increased index size for improved iterator + // seek performance in some situations, particularly when block cache is + // disabled (ReadOptions::fill_cache = false) and direct IO is + // enabled (DBOptions::use_direct_reads = true). + // The default mode is the best tradeoff for most use cases. + // This option only affects newly written tables. + // + // The index contains a key separating each pair of consecutive blocks. + // Let A be the highest key in one block, B the lowest key in the next block, + // and I the index entry separating these two blocks: + // [ ... A] I [B ...] + // I is allowed to be anywhere in [A, B). + // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + // first block, then immediately fall through to the second block. + // However, if I=A, this can't happen, and we'll read only the second block. + // In kNoShortening mode, we use I=A. In other modes, we use the shortest + // key in [A, B), which usually significantly reduces index size. + // + // There's a similar story for the last index entry, which is an upper bound + // of the highest key in the file. If it's shortened and therefore + // overestimated, iterator is likely to unnecessarily read the last data block + // from each file on each seek. + enum class IndexShorteningMode : char { + // Use full keys. + kNoShortening, + // Shorten index keys between blocks, but use full key for the last index + // key, which is the upper bound of the whole file. + kShortenSeparators, + // Shorten both keys between blocks and key after last block. + kShortenSeparatorsAndSuccessor, + }; + + IndexShorteningMode index_shortening = + IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at BlockBasedTableOptions.initial_auto_readahead_size (default: 8KB) + // and doubles on every additional read upto max_auto_readahead_size and + // max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then it will disable + // the implicit auto prefetching. + // If max_auto_readahead_size provided is less + // than initial_auto_readahead_size, then RocksDB will sanitize the + // initial_auto_readahead_size and set it to max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). + size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and + // filter blocks) which are already in memory into block cache at the time of + // flush. On a flush, the block that is in memory (in memtables) get flushed + // to the device. If using Direct IO, additional IO is incurred to read this + // data back into memory again, which is avoided by enabling this option. This + // further helps if the workload exhibits high temporal locality, where most + // of the reads go to recently written data. This also helps in case of + // Distributed FileSystem. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{prepopulate_block_cache=kFlushOnly;}"}})); + enum class PrepopulateBlockCache : char { + // Disable prepopulate block cache. + kDisable, + // Prepopulate blocks during flush only. + kFlushOnly, + }; + + PrepopulateBlockCache prepopulate_block_cache = + PrepopulateBlockCache::kDisable; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead size + // starts at initial_auto_readahead_size and doubles on every additional read + // upto BlockBasedTableOptions.max_auto_readahead_size. + // max_auto_readahead_size can also be configured. + // + // Scenarios: + // - If initial_auto_readahead_size is set 0 then it will disabled the + // implicit auto prefetching irrespective of max_auto_readahead_size. + // - If max_auto_readahead_size is set 0, it will disable the internal + // prefetching irrespective of initial_auto_readahead_size. + // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB + // will sanitize the value of initial_auto_readahead_size to + // max_auto_readahead_size and readahead_size will be + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 8 * 1024 as it will prefetch + // the blocks. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{initial_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 8 KB (8 * 1024). + size_t initial_auto_readahead_size = 8 * 1024; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size and reads are + // sequential. + // num_file_reads_for_auto_readahead indicates after how many + // sequential reads internal auto prefetching should be start. + // + // For example, if value is 2 then after reading 2 sequential data blocks on + // third data block prefetching will start. + // If set 0, it will start prefetching from the first read. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{num_file_reads_for_auto_readahead=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 2 + uint64_t num_file_reads_for_auto_readahead = 2; +}; + +// Table Properties that are specific to block-based table properties. +struct BlockBasedTablePropertyNames { + // value of this properties is a fixed int32 number. + static const std::string kIndexType; + // value is "1" for true and "0" for false. + static const std::string kWholeKeyFiltering; + // value is "1" for true and "0" for false. + static const std::string kPrefixFiltering; +}; + +// Create default block based table factory. +extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +#ifndef ROCKSDB_LITE + +enum EncodingType : char { + // Always write full keys without any special encoding. + kPlain, + // Find opportunity to write the same prefix once for multiple rows. + // In some cases, when a key follows a previous key with the same prefix, + // instead of writing out the full key, it just writes out the size of the + // shared prefix, as well as other bytes, to save some bytes. + // + // When using this option, the user is required to use the same prefix + // extractor to make sure the same prefix will be extracted from the same key. + // The Name() value of the prefix extractor will be stored in the file. When + // reopening the file, the name of the options.prefix_extractor given will be + // bitwise compared to the prefix extractors stored in the file. An error + // will be returned if the two don't match. + kPrefix, +}; + +// Table Properties that are specific to plain table properties. +struct PlainTablePropertyNames { + static const std::string kEncodingType; + static const std::string kBloomVersion; + static const std::string kNumBloomBlocks; +}; + +const uint32_t kPlainTableVariableLength = 0; + +struct PlainTableOptions { + static const char* kName() { return "PlainTableOptions"; }; + // @user_key_len: plain table has optimization for fix-sized keys, which can + // be specified via user_key_len. Alternatively, you can pass + // `kPlainTableVariableLength` if your keys have variable + // lengths. + uint32_t user_key_len = kPlainTableVariableLength; + + // @bloom_bits_per_key: the number of bits used for bloom filer per prefix. + // You may disable it by passing a zero. + int bloom_bits_per_key = 10; + + // @hash_table_ratio: the desired utilization of the hash table used for + // prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the + // hash table + double hash_table_ratio = 0.75; + + // @index_sparseness: inside each prefix, need to build one index record for + // how many keys for binary search inside each hash bucket. + // For encoding type kPrefix, the value will be used when + // writing to determine an interval to rewrite the full + // key. It will also be used as a suggestion and satisfied + // when possible. + size_t index_sparseness = 16; + + // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. + // Otherwise from huge page TLB. The user needs to + // reserve huge pages for it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + size_t huge_page_tlb_size = 0; + + // @encoding_type: how to encode the keys. See enum EncodingType above for + // the choices. The value will determine how to encode keys + // when writing to a new SST file. This value will be stored + // inside the SST file which will be used when reading from + // the file, which makes it possible for users to choose + // different encoding type when reopening a DB. Files with + // different encoding types can co-exist in the same DB and + // can be read. + EncodingType encoding_type = kPlain; + + // @full_scan_mode: mode for reading the whole file one record by one without + // using the index. + bool full_scan_mode = false; + + // @store_index_in_file: compute plain table index and bloom filter during + // file building and store it in file. When reading + // file, index will be mapped instead of recomputation. + bool store_index_in_file = false; +}; + +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extractor properly to make +// it work. Look-up will starts with prefix hash lookup for key prefix. Inside +// the hash bucket found, a binary search is executed for hash conflicts. +// Finally, a linear search is used. + +extern TableFactory* NewPlainTableFactory( + const PlainTableOptions& options = PlainTableOptions()); + +struct CuckooTablePropertyNames { + // The key that is used to fill empty buckets. + static const std::string kEmptyKey; + // Fixed length of value. + static const std::string kValueLength; + // Number of hash functions used in Cuckoo Hash. + static const std::string kNumHashFunc; + // It denotes the number of buckets in a Cuckoo Block. Given a key and a + // particular hash function, a Cuckoo Block is a set of consecutive buckets, + // where starting bucket id is given by the hash function on the key. In case + // of a collision during inserting the key, the builder tries to insert the + // key in other locations of the cuckoo block before using the next hash + // function. This reduces cache miss during read operation in case of + // collision. + static const std::string kCuckooBlockSize; + // Size of the hash table. Use this number to compute the modulo of hash + // function. The actual number of buckets will be kMaxHashTableSize + + // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to + // accommodate the Cuckoo Block from end of hash table, due to cache friendly + // implementation. + static const std::string kHashTableSize; + // Denotes if the key sorted in the file is Internal Key (if false) + // or User Key only (if true). + static const std::string kIsLastLevel; + // Indicate if using identity function for the first hash function. + static const std::string kIdentityAsFirstHash; + // Indicate if using module or bit and to calculate hash value + static const std::string kUseModuleHash; + // Fixed user key length + static const std::string kUserKeyLength; +}; + +struct CuckooTableOptions { + static const char* kName() { return "CuckooTableOptions"; }; + + // Determines the utilization of hash tables. Smaller values + // result in larger hash tables with fewer collisions. + double hash_table_ratio = 0.9; + // A property used by builder to determine the depth to go to + // to search for a path to displace elements in case of + // collision. See Builder.MakeSpaceForKey method. Higher + // values result in more efficient hash tables with fewer + // lookups but take more time to build. + uint32_t max_search_depth = 100; + // In case of collision while inserting, the builder + // attempts to insert in the next cuckoo_block_size + // locations before skipping over to the next Cuckoo hash + // function. This makes lookups more cache friendly in case + // of collisions. + uint32_t cuckoo_block_size = 5; + // If this option is enabled, user key is treated as uint64_t and its value + // is used as hash value directly. This option changes builder's behavior. + // Reader ignore this option and behave according to what specified in table + // property. + bool identity_as_first_hash = false; + // If this option is set to true, module is used during hash calculation. + // This often yields better space efficiency at the cost of performance. + // If this option is set to false, # of entries in table is constrained to be + // power of two, and bit and is used to calculate hash, which is faster in + // general. + bool use_module_hash = true; +}; + +// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing +extern TableFactory* NewCuckooTableFactory( + const CuckooTableOptions& table_options = CuckooTableOptions()); + +#endif // ROCKSDB_LITE + +class RandomAccessFileReader; + +// A base class for table factories. +class TableFactory : public Customizable { + public: + virtual ~TableFactory() override {} + + static const char* kBlockCacheOpts() { return "BlockCache"; }; + static const char* kBlockBasedTableName() { return "BlockBasedTable"; }; + static const char* kPlainTableName() { return "PlainTable"; } + static const char* kCuckooTableName() { return "CuckooTable"; }; + + // Creates and configures a new TableFactory from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* factory); + + static const char* Type() { return "TableFactory"; } + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // NewTableReader() is called in three places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (2) SstFileDumper (for SST Dump) opens the table and dump the table + // contents using the iterator of the table. + // (3) DBImpl::IngestExternalFile() calls this function to read the contents + // of the sst file it's attempting to add + // + // table_reader_options is a TableReaderOptions which contain all the + // needed parameters and configuration to open the table. + // file is a file handler to handle the file for the table. + // file_size is the physical file size of the file. + // table_reader is the output table reader. + virtual Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache = true) const { + ReadOptions ro; + return NewTableReader(ro, table_reader_options, std::move(file), file_size, + table_reader, prefetch_index_and_filter_in_cache); + } + + // Overload of the above function that allows the caller to pass in a + // ReadOptions + virtual Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // Multiple configured can be accessed from there, including and not limited + // to compression options. file is a handle of a writable file. + // It is the caller's responsibility to keep the file open and close the file + // after closing the table builder. compression_type is the compression type + // to use in this table. + virtual TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const = 0; + + // Return is delete range supported + virtual bool IsDeleteRangeSupported() const { return false; } +}; + +#ifndef ROCKSDB_LITE +// Create a special table factory that can open either of the supported +// table formats, based on setting inside the SST files. It should be used to +// convert a DB from one table format to another. +// @table_factory_to_write: the table factory used when writing to new files. +// @block_based_table_factory: block based table factory to use. If NULL, use +// a default one. +// @plain_table_factory: plain table factory to use. If NULL, use a default one. +// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default +// one. +extern TableFactory* NewAdaptiveTableFactory( + std::shared_ptr table_factory_to_write = nullptr, + std::shared_ptr block_based_table_factory = nullptr, + std::shared_ptr plain_table_factory = nullptr, + std::shared_ptr cuckoo_table_factory = nullptr); + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h new file mode 100644 index 000000000..cbe87fa3a --- /dev/null +++ b/src/rocksdb/include/rocksdb/table_properties.h @@ -0,0 +1,327 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// -- Table Properties +// Other than basic table properties, each table may also have the user +// collected properties. +// The value of the user-collected properties are encoded as raw bytes -- +// users have to interpret these values by themselves. +// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do +// something similar to: +// +// UserCollectedProperties props = ...; +// for (auto pos = props.lower_bound(prefix); +// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0; +// ++pos) { +// ... +// } +using UserCollectedProperties = std::map; + +// table properties' human-readable names in the property block. +struct TablePropertiesNames { + static const std::string kDbId; + static const std::string kDbSessionId; + static const std::string kDbHostId; + static const std::string kOriginalFileNumber; + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kIndexPartitions; + static const std::string kTopLevelIndexSize; + static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kNumFilterEntries; + static const std::string kDeletedKeys; + static const std::string kMergeOperands; + static const std::string kNumRangeDeletions; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; + static const std::string kFilterPolicy; + static const std::string kColumnFamilyName; + static const std::string kColumnFamilyId; + static const std::string kComparator; + static const std::string kMergeOperator; + static const std::string kPrefixExtractorName; + static const std::string kPropertyCollectors; + static const std::string kCompression; + static const std::string kCompressionOptions; + static const std::string kCreationTime; + static const std::string kOldestKeyTime; + static const std::string kFileCreationTime; + static const std::string kSlowCompressionEstimatedDataSize; + static const std::string kFastCompressionEstimatedDataSize; + static const std::string kSequenceNumberTimeMapping; +}; + +// `TablePropertiesCollector` provides the mechanism for users to collect +// their own properties that they are interested in. This class is essentially +// a collection of callback functions that will be invoked during table +// building. It is constructed with TablePropertiesCollectorFactory. The methods +// don't need to be thread-safe, as we will create exactly one +// TablePropertiesCollector object per table and then call it sequentially. +// +// Statuses from these callbacks are currently logged when not OK, but +// otherwise ignored by RocksDB. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class TablePropertiesCollector { + public: + virtual ~TablePropertiesCollector() {} + + // DEPRECATE User defined collector should implement AddUserKey(), though + // this old function still works for backward compatible reason. + // Add() will be called when a new key/value pair is inserted into the table. + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) { + return Status::InvalidArgument( + "TablePropertiesCollector::Add() deprecated."); + } + + // AddUserKey() will be called when a new key/value pair is inserted into the + // table. + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status AddUserKey(const Slice& key, const Slice& value, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) { + // For backwards-compatibility. + return Add(key, value); + } + + // Called after each new block is cut + virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) { + // Nothing to do here. Callback registers can override. + return; + } + + // Finish() will be called when a table has already been built and is ready + // for writing the properties block. + // @params properties User will add their collected statistics to + // `properties`. + virtual Status Finish(UserCollectedProperties* properties) = 0; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual UserCollectedProperties GetReadableProperties() const = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; + + // EXPERIMENTAL Return whether the output file should be further compacted + virtual bool NeedCompact() const { return false; } +}; + +// Constructs TablePropertiesCollector. Internals create a new +// TablePropertiesCollector for each new table +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class TablePropertiesCollectorFactory : public Customizable { + public: + struct Context { + uint32_t column_family_id; + // The level at creating the SST file (i.e, table), of which the + // properties are being collected. + int level_at_creation = kUnknownLevelAtCreation; + static const uint32_t kUnknownColumnFamily; + static const int kUnknownLevelAtCreation = -1; + }; + + ~TablePropertiesCollectorFactory() override {} + static const char* Type() { return "TablePropertiesCollectorFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + // has to be thread-safe + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) = 0; + + // The name of the properties collector can be used for debugging purpose. + const char* Name() const override = 0; + + // Can be overridden by sub-classes to return the Name, followed by + // configuration info that will // be logged to the info log when the + // DB is opened + virtual std::string ToString() const { return Name(); } +}; + +// TableProperties contains a bunch of read-only properties of its associated +// table. +struct TableProperties { + public: + // the file number at creation time, or 0 for unknown. When known, + // combining with db_session_id must uniquely identify an SST file. + uint64_t orig_file_number = 0; + // the total size of all data blocks. + uint64_t data_size = 0; + // the size of index block. + uint64_t index_size = 0; + // Total number of index partitions if kTwoLevelIndexSearch is used + uint64_t index_partitions = 0; + // Size of the top-level index if kTwoLevelIndexSearch is used + uint64_t top_level_index_size = 0; + // Whether the index key is user key. Otherwise it includes 8 byte of sequence + // number added by internal key format. + uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; + // the size of filter block. + uint64_t filter_size = 0; + // total raw (uncompressed, undelineated) key size + uint64_t raw_key_size = 0; + // total raw (uncompressed, undelineated) value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; + // the number of deletions in the table + uint64_t num_deletions = 0; + // the number of merge operands in the table + uint64_t num_merge_operands = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; + // ID of column family for this SST file, corresponding to the CF identified + // by column_family_name. + uint64_t column_family_id = ROCKSDB_NAMESPACE:: + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + // Timestamp of the latest key. 0 means unknown. + // TODO(sagar0): Should be changed to latest_key_time ... but don't know the + // full implications of backward compatibility. Hence retaining for now. + uint64_t creation_time = 0; + + // Timestamp of the earliest key. 0 means unknown. + uint64_t oldest_key_time = 0; + // Actual SST file creation time. 0 means unknown. + uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; + // Offset of the value of the property "external sst file global seqno" in the + // file if the property exists. + // 0 means not exists. + uint64_t external_sst_file_global_seqno_offset = 0; + + // DB identity + // db_id is an identifier generated the first time the DB is created + // If DB identity is unset or unassigned, `db_id` will be an empty string. + std::string db_id; + + // DB session identity + // db_session_id is an identifier that gets reset every time the DB is opened + // If DB session identity is unset or unassigned, `db_session_id` will be an + // empty string. + std::string db_session_id; + + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; + + // Name of the column family with which this SST file is associated. + // If column family is unknown, `column_family_name` will be an empty string. + std::string column_family_name; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // The name of the comparator used in this table. + std::string comparator_name; + + // The name of the merge operator used in this table. + // If no merge operator is used, `merge_operator_name` will be "nullptr". + std::string merge_operator_name; + + // The name of the prefix extractor used in this table + // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr". + std::string prefix_extractor_name; + + // The names of the property collectors factories used in this table + // separated by commas + // {collector_name[1]},{collector_name[2]},{collector_name[3]} .. + std::string property_collectors_names; + + // The compression algo used to compress the SST files. + std::string compression_name; + + // Compression options used to compress the SST files. + std::string compression_options; + + // Sequence number to time mapping, delta encoded. + std::string seqno_to_time_mapping; + + // user collected properties + UserCollectedProperties user_collected_properties; + UserCollectedProperties readable_properties; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; + + // Aggregate the numerical member variables of the specified + // TableProperties. + void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. + std::map GetAggregatablePropertiesAsMap() const; + + // Return the approximated memory usage of this TableProperties object, + // including memory used by the string properties and UserCollectedProperties + std::size_t ApproximateMemoryUsage() const; +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +// +// DEPRECATED: these properties now belong as TableProperties members. Please +// use TableProperties::num_deletions and TableProperties::num_merge_operands, +// respectively. +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); +extern uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/table_reader_caller.h b/src/rocksdb/include/rocksdb/table_reader_caller.h new file mode 100644 index 000000000..10ec08130 --- /dev/null +++ b/src/rocksdb/include/rocksdb/table_reader_caller.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// A list of callers for a table reader. It is used to trace the caller that +// accesses on a block. This is only used for block cache tracing and analysis. +// A user may use kUncategorized if the caller is not interesting for analysis +// or the table reader is called in the test environment, e.g., unit test, table +// reader benchmark, etc. +enum TableReaderCaller : char { + kUserGet = 1, + kUserMultiGet = 2, + kUserIterator = 3, + kUserApproximateSize = 4, + kUserVerifyChecksum = 5, + kSSTDumpTool = 6, + kExternalSSTIngestion = 7, + kRepair = 8, + kPrefetch = 9, + kCompaction = 10, + // A compaction job may refill the block cache with blocks in the new SST + // files if paranoid_file_checks is true. + kCompactionRefill = 11, + // After building a table, it may load all its blocks into the block cache if + // paranoid_file_checks is true. + kFlush = 12, + // sst_file_reader. + kSSTFileReader = 13, + // A list of callers that are either not interesting for analysis or are + // calling from a test environment, e.g., unit test, benchmark, etc. + kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h new file mode 100644 index 000000000..1b5f8c046 --- /dev/null +++ b/src/rocksdb/include/rocksdb/thread_status.h @@ -0,0 +1,189 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines the structures for exposing run-time status of any +// rocksdb-related thread. Such run-time status can be obtained via +// GetThreadList() API. +// +// Note that all thread-status features are still under-development, and +// thus APIs and class definitions might subject to change at this point. +// Will remove this comment once the APIs have been finalized. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) +#define ROCKSDB_USING_THREAD_STATUS +#endif + +namespace ROCKSDB_NAMESPACE { + +// TODO(yhchiang): remove this function once c++14 is available +// as std::max will be able to cover this. +// Current MS compiler does not support constexpr +template +struct constexpr_max { + static const int result = (A > B) ? A : B; +}; + +// A structure that describes the current status of a thread. +// The status of active threads can be fetched using +// ROCKSDB_NAMESPACE::GetThreadList(). +struct ThreadStatus { + // The type of a thread. + enum ThreadType : int { + HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool + NUM_THREAD_TYPES + }; + + // The type used to refer to a thread operation. + // A thread operation describes high-level action of a thread. + // Examples include compaction and flush. + enum OperationType : int { + OP_UNKNOWN = 0, + OP_COMPACTION, + OP_FLUSH, + NUM_OP_TYPES + }; + + enum OperationStage : int { + STAGE_UNKNOWN = 0, + STAGE_FLUSH_RUN, + STAGE_FLUSH_WRITE_L0, + STAGE_COMPACTION_PREPARE, + STAGE_COMPACTION_RUN, + STAGE_COMPACTION_PROCESS_KV, + STAGE_COMPACTION_INSTALL, + STAGE_COMPACTION_SYNC_FILE, + STAGE_PICK_MEMTABLES_TO_FLUSH, + STAGE_MEMTABLE_ROLLBACK, + STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + NUM_OP_STAGES + }; + + enum CompactionPropertyType : int { + COMPACTION_JOB_ID = 0, + COMPACTION_INPUT_OUTPUT_LEVEL, + COMPACTION_PROP_FLAGS, + COMPACTION_TOTAL_INPUT_BYTES, + COMPACTION_BYTES_READ, + COMPACTION_BYTES_WRITTEN, + NUM_COMPACTION_PROPERTIES + }; + + enum FlushPropertyType : int { + FLUSH_JOB_ID = 0, + FLUSH_BYTES_MEMTABLES, + FLUSH_BYTES_WRITTEN, + NUM_FLUSH_PROPERTIES + }; + + // The maximum number of properties of an operation. + // This number should be set to the biggest NUM_XXX_PROPERTIES. + static const int kNumOperationProperties = + constexpr_max::result; + + // The type used to refer to a thread state. + // A state describes lower-level action of a thread + // such as reading / writing a file or waiting for a mutex. + enum StateType : int { + STATE_UNKNOWN = 0, + STATE_MUTEX_WAIT = 1, + NUM_STATE_TYPES + }; + + ThreadStatus(const uint64_t _id, const ThreadType _thread_type, + const std::string& _db_name, const std::string& _cf_name, + const OperationType _operation_type, + const uint64_t _op_elapsed_micros, + const OperationStage _operation_stage, + const uint64_t _op_props[], const StateType _state_type) + : thread_id(_id), + thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + operation_type(_operation_type), + op_elapsed_micros(_op_elapsed_micros), + operation_stage(_operation_stage), + state_type(_state_type) { + for (int i = 0; i < kNumOperationProperties; ++i) { + op_properties[i] = _op_props[i]; + } + } + + // An unique ID for the thread. + const uint64_t thread_id; + + // The type of the thread, it could be HIGH_PRIORITY, + // LOW_PRIORITY, and USER + const ThreadType thread_type; + + // The name of the DB instance where the thread is currently + // involved with. It would be set to empty string if the thread + // does not involve in any DB operation. + const std::string db_name; + + // The name of the column family where the thread is currently + // It would be set to empty string if the thread does not involve + // in any column family. + const std::string cf_name; + + // The operation (high-level action) that the current thread is involved. + const OperationType operation_type; + + // The elapsed time of the current thread operation in microseconds. + const uint64_t op_elapsed_micros; + + // An integer showing the current stage where the thread is involved + // in the current operation. + const OperationStage operation_stage; + + // A list of properties that describe some details about the current + // operation. Same field in op_properties[] might have different + // meanings for different operations. + uint64_t op_properties[kNumOperationProperties]; + + // The state (lower-level action) that the current thread is involved. + const StateType state_type; + + // The followings are a set of utility functions for interpreting + // the information of ThreadStatus + + static std::string GetThreadTypeName(ThreadType thread_type); + + // Obtain the name of an operation given its type. + static const std::string& GetOperationName(OperationType op_type); + + static const std::string MicrosToString(uint64_t op_elapsed_time); + + // Obtain a human-readable string describing the specified operation stage. + static const std::string& GetOperationStageName(OperationStage stage); + + // Obtain the name of the "i"th operation property of the + // specified operation. + static const std::string& GetOperationPropertyName(OperationType op_type, + int i); + + // Translate the "i"th property of the specified operation given + // a property value. + static std::map InterpretOperationProperties( + OperationType op_type, const uint64_t* op_properties); + + // Obtain the name of a state given its type. + static const std::string& GetStateName(StateType state_type); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h new file mode 100644 index 000000000..f1cc55752 --- /dev/null +++ b/src/rocksdb/include/rocksdb/threadpool.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +/* + * ThreadPool is a component that will spawn N background threads that will + * be used to execute scheduled work, The number of background threads could + * be modified by calling SetBackgroundThreads(). + * */ +class ThreadPool { + public: + virtual ~ThreadPool() {} + + // Wait for all threads to finish. + // Discard those threads that did not start + // executing + virtual void JoinAllThreads() = 0; + + // Set the number of background threads that will be executing the + // scheduled jobs. + virtual void SetBackgroundThreads(int num) = 0; + virtual int GetBackgroundThreads() = 0; + + // Get the number of jobs scheduled in the ThreadPool queue. + virtual unsigned int GetQueueLen() const = 0; + + // Waits for all jobs to complete those + // that already started running and those that did not + // start yet. This ensures that everything that was thrown + // on the TP runs even though + // we may not have specified enough threads for the amount + // of jobs + virtual void WaitForJobsAndJoinAllThreads() = 0; + + // Submit a fire and forget jobs + // This allows to submit the same job multiple times + virtual void SubmitJob(const std::function&) = 0; + // This moves the function in for efficiency + virtual void SubmitJob(std::function&&) = 0; + + // Reserve available background threads. This function does not ensure + // so many threads can be reserved, instead it will return the number of + // threads that can be reserved against the desired one. In other words, + // the number of available threads could be less than the input. + virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; } + + // Release a specific number of reserved threads + virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; } +}; + +// NewThreadPool() is a function that could be used to create a ThreadPool +// with `num_threads` background threads. +extern ThreadPool* NewThreadPool(int num_threads); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h new file mode 100644 index 000000000..335e091dc --- /dev/null +++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + virtual ~TraceWriter() = default; + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay operations. +class TraceReader { + public: + virtual ~TraceReader() = default; + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; + + // Seek back to the trace header. Replayer can call this method to restart + // replaying. Note this method may fail if the reader is already closed. + virtual Status Reset() = 0; +}; + +// Factory methods to write/read traces to/from a file. +// The implementations may not be thread-safe. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/trace_record.h b/src/rocksdb/include/rocksdb/trace_record.h new file mode 100644 index 000000000..c00f5cafb --- /dev/null +++ b/src/rocksdb/include/rocksdb/trace_record.h @@ -0,0 +1,248 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class DB; + +// Supported trace record types. +enum TraceType : char { + kTraceNone = 0, + kTraceBegin = 1, + kTraceEnd = 2, + // Query level tracing related trace types. + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + // Block cache tracing related trace types. + kBlockTraceIndexBlock = 7, + // TODO: split out kinds of filter blocks? + kBlockTraceFilterBlock = 8, + kBlockTraceDataBlock = 9, + kBlockTraceUncompressionDictBlock = 10, + kBlockTraceRangeDeletionBlock = 11, + // IO tracing related trace type. + kIOTracer = 12, + // Query level tracing related trace type. + kTraceMultiGet = 13, + // All trace types should be added before kTraceMax + kTraceMax, +}; + +class GetQueryTraceRecord; +class IteratorSeekQueryTraceRecord; +class MultiGetQueryTraceRecord; +class TraceRecordResult; +class WriteQueryTraceRecord; + +// Base class for all types of trace records. +class TraceRecord { + public: + explicit TraceRecord(uint64_t timestamp); + + virtual ~TraceRecord() = default; + + // Type of the trace record. + virtual TraceType GetTraceType() const = 0; + + // Timestamp (in microseconds) of this trace. + virtual uint64_t GetTimestamp() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + }; + + // Accept the handler and report the corresponding result in `result`. + virtual Status Accept(Handler* handler, + std::unique_ptr* result) = 0; + + // Create a handler for the exeution of TraceRecord. + static Handler* NewExecutionHandler( + DB* db, const std::vector& handles); + + private: + uint64_t timestamp_; +}; + +// Base class for all query types of trace records. +class QueryTraceRecord : public TraceRecord { + public: + explicit QueryTraceRecord(uint64_t timestamp); +}; + +// Trace record for DB::Write() operation. +class WriteQueryTraceRecord : public QueryTraceRecord { + public: + WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp); + + WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp); + + virtual ~WriteQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceWrite; } + + // rep string for the WriteBatch. + virtual Slice GetWriteBatchRep() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + PinnableSlice rep_; +}; + +// Trace record for DB::Get() operation +class GetQueryTraceRecord : public QueryTraceRecord { + public: + GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key, + uint64_t timestamp); + + GetQueryTraceRecord(uint32_t column_family_id, const std::string& key, + uint64_t timestamp); + + virtual ~GetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceGet; } + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to get. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Base class for all Iterator related operations. +class IteratorQueryTraceRecord : public QueryTraceRecord { + public: + explicit IteratorQueryTraceRecord(uint64_t timestamp); + + IteratorQueryTraceRecord(PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorQueryTraceRecord(const std::string& lower_bound, + const std::string& upper_bound, uint64_t timestamp); + + virtual ~IteratorQueryTraceRecord() override; + + // Get the iterator's lower/upper bound. They may be used in ReadOptions to + // create an Iterator instance. + virtual Slice GetLowerBound() const; + virtual Slice GetUpperBound() const; + + private: + PinnableSlice lower_; + PinnableSlice upper_; +}; + +// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation. +class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord { + public: + // Currently we only support Seek() and SeekForPrev(). + enum SeekType { + kSeek = kTraceIteratorSeek, + kSeekForPrev = kTraceIteratorSeekForPrev + }; + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, + const std::string& lower_bound, + const std::string& upper_bound, + uint64_t timestamp); + + virtual ~IteratorSeekQueryTraceRecord() override; + + // Trace type matches the seek type. + TraceType GetTraceType() const override; + + // Type of seek, Seek or SeekForPrev. + virtual SeekType GetSeekType() const; + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to seek to. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + SeekType type_; + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Trace record for DB::MultiGet() operation. +class MultiGetQueryTraceRecord : public QueryTraceRecord { + public: + MultiGetQueryTraceRecord(std::vector column_family_ids, + std::vector&& keys, + uint64_t timestamp); + + MultiGetQueryTraceRecord(std::vector column_family_ids, + const std::vector& keys, + uint64_t timestamp); + + virtual ~MultiGetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceMultiGet; } + + // Column familiy IDs. + virtual std::vector GetColumnFamilyIDs() const; + + // Keys to get. + virtual std::vector GetKeys() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + std::vector cf_ids_; + std::vector keys_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/trace_record_result.h b/src/rocksdb/include/rocksdb/trace_record_result.h new file mode 100644 index 000000000..0cd0004a6 --- /dev/null +++ b/src/rocksdb/include/rocksdb/trace_record_result.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +class IteratorTraceExecutionResult; +class MultiValuesTraceExecutionResult; +class SingleValueTraceExecutionResult; +class StatusOnlyTraceExecutionResult; + +// Base class for the results of all types of trace records. +// Theses classes can be used to report the execution result of +// TraceRecord::Handler::Handle() or TraceRecord::Accept(). +class TraceRecordResult { + public: + explicit TraceRecordResult(TraceType trace_type); + + virtual ~TraceRecordResult() = default; + + // Trace type of the corresponding TraceRecord. + virtual TraceType GetTraceType() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0; + + virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0; + + virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0; + + virtual Status Handle(const IteratorTraceExecutionResult& result) = 0; + }; + + // Accept the handler. + virtual Status Accept(Handler* handler) = 0; + + private: + TraceType trace_type_; +}; + +// Base class for the results from the trace record execution handler (created +// by TraceRecord::NewExecutionHandler()). +// +// The actual execution status or returned values may be hidden from +// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a +// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may +// return Status::NotFound() but TraceRecord::Handler::Handle() or +// TraceRecord::Accept() will still return Status::OK(). The actual status from +// DB::Get() and the returned value string may be saved in a +// SingleValueTraceExecutionResult. +class TraceExecutionResult : public TraceRecordResult { + public: + TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + // Execution start/end timestamps and request latency in microseconds. + virtual uint64_t GetStartTimestamp() const; + virtual uint64_t GetEndTimestamp() const; + inline uint64_t GetLatency() const { + return GetEndTimestamp() - GetStartTimestamp(); + } + + private: + uint64_t ts_start_; + uint64_t ts_end_; +}; + +// Result for operations that only return a single Status. +// Example operation: DB::Write() +class StatusOnlyTraceExecutionResult : public TraceExecutionResult { + public: + StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~StatusOnlyTraceExecutionResult() override = default; + + // Return value of DB::Write(), etc. + virtual const Status& GetStatus() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; +}; + +// Result for operations that return a Status and a value. +// Example operation: DB::Get() +class SingleValueTraceExecutionResult : public TraceExecutionResult { + public: + SingleValueTraceExecutionResult(Status status, const std::string& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + SingleValueTraceExecutionResult(Status status, std::string&& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~SingleValueTraceExecutionResult() override; + + // Return status of DB::Get(). + virtual const Status& GetStatus() const; + + // Value for the searched key. + virtual const std::string& GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; + std::string value_; +}; + +// Result for operations that return multiple Status(es) and values as vectors. +// Example operation: DB::MultiGet() +class MultiValuesTraceExecutionResult : public TraceExecutionResult { + public: + MultiValuesTraceExecutionResult(std::vector multi_status, + std::vector values, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~MultiValuesTraceExecutionResult() override; + + // Returned Status(es) of DB::MultiGet(). + virtual const std::vector& GetMultiStatus() const; + + // Returned values for the searched keys. + virtual const std::vector& GetValues() const; + + virtual Status Accept(Handler* handler) override; + + private: + std::vector multi_status_; + std::vector values_; +}; + +// Result for Iterator operations. +// Example operations: Iterator::Seek(), Iterator::SeekForPrev() +class IteratorTraceExecutionResult : public TraceExecutionResult { + public: + IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key, + PinnableSlice&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + IteratorTraceExecutionResult(bool valid, Status status, + const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + virtual ~IteratorTraceExecutionResult() override; + + // Return if the Iterator is valid. + virtual bool GetValid() const; + + // Return the status of the Iterator. + virtual const Status& GetStatus() const; + + // Key of the current iterating entry, empty if GetValid() is false. + virtual Slice GetKey() const; + + // Value of the current iterating entry, empty if GetValid() is false. + virtual Slice GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + bool valid_; + Status status_; + PinnableSlice key_; + PinnableSlice value_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h new file mode 100644 index 000000000..e13ad8f80 --- /dev/null +++ b/src/rocksdb/include/rocksdb/transaction_log.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFile; +using VectorLogPtr = std::vector>; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +}; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr writeBatchPtr; + + // Add empty __ctor and __dtor for the rule of five + // However, preserve the original semantics and prohibit copying + // as the std::unique_ptr member does not copy. + BatchResult() {} + + ~BatchResult() {} + + BatchResult(const BatchResult&) = delete; + + BatchResult& operator=(const BatchResult&) = delete; + + BatchResult(BatchResult&& bResult) + : sequence(std::move(bResult.sequence)), + writeBatchPtr(std::move(bResult.writeBatchPtr)) {} + + BatchResult& operator=(BatchResult&& bResult) { + sequence = std::move(bResult.sequence); + writeBatchPtr = std::move(bResult.writeBatchPtr); + return *this; + } +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. + virtual Status status() = 0; + + // If valid return's the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; + + // The read options for TransactionLogIterator. + struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums_; + + ReadOptions() : verify_checksums_(true) {} + + explicit ReadOptions(bool verify_checksums) + : verify_checksums_(verify_checksums) {} + }; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h new file mode 100644 index 000000000..6fb53d846 --- /dev/null +++ b/src/rocksdb/include/rocksdb/types.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +// Define all public custom types here. + +using ColumnFamilyId = uint32_t; + +// Represents a sequence number in a WAL file. +using SequenceNumber = uint64_t; + +const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed + +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +enum class BlobFileCreationReason { + kFlush, + kCompaction, + kRecovery, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + +// User-oriented representation of internal key types. +// Ordering of this enum entries should not change. +enum EntryType { + kEntryPut, + kEntryDelete, + kEntrySingleDelete, + kEntryMerge, + kEntryRangeDeletion, + kEntryBlobIndex, + kEntryDeleteWithTimestamp, + kEntryWideColumnEntity, + kEntryOther, +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/unique_id.h b/src/rocksdb/include/rocksdb/unique_id.h new file mode 100644 index 000000000..eb0c77826 --- /dev/null +++ b/src/rocksdb/include/rocksdb/unique_id.h @@ -0,0 +1,55 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// Computes a stable, universally unique 128-bit (16 binary char) identifier +// for an SST file from TableProperties. This is supported for table (SST) +// files created with RocksDB 6.24 and later. NotSupported will be returned +// for other cases. The first 16 bytes (128 bits) is of sufficient quality +// for almost all applications, and shorter prefixes are usable as a +// hash of the full unique id. +// +// Note: .c_str() is not compatible with binary char strings, so using +// .c_str() on the result will often result in information loss and very +// poor uniqueness probability. +// +// More detail: the value is *guaranteed* unique for SST files +// generated in the same process (even different DBs, RocksDB >= 6.26), +// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26) +// so that the "all zeros" value can be used reliably for a null ID. +// These IDs are more than sufficient for SST uniqueness within each of +// many DBs or hosts. For an extreme example assuming random IDs, consider +// 10^9 hosts each with 10^9 live SST files being replaced at 10^6/second. +// Such a service would need to run for 10 million years to see an ID +// collision among live SST files on any host. +// +// And assuming one generates many SST files in the lifetime of each process, +// the probability of ID collisions is much "better than random"; see +// https://github.com/pdillinger/unique_id +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id); + +// Computes a 192-bit (24 binary char) stable, universally unique ID +// with an extra 64 bits of uniqueness compared to the standard ID. It is only +// appropriate to use this ID instead of the 128-bit ID if ID collisions +// between files among any hosts in a vast fleet is a problem, such as a shared +// global namespace for SST file backups. Under this criteria, the extreme +// example above would expect a global file ID collision every 4 days with +// 128-bit IDs (using some worst-case assumptions about process lifetime). +// It's 10^17 years with 192-bit IDs. +Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id); + +// Converts a binary string (unique id) to hexadecimal, with each 64 bits +// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B +// Also works on unique id prefix. +std::string UniqueIdToHumanString(const std::string &id); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h new file mode 100644 index 000000000..0b0a85e1c --- /dev/null +++ b/src/rocksdb/include/rocksdb/universal_compaction.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Algorithm used to make a compaction request stop picking new files +// into a single compaction run +// +enum CompactionStopStyle { + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file +}; + +class CompactionOptionsUniversal { + public: + // Percentage flexibility while comparing file size. If the candidate file(s) + // size is 1% smaller than the next file's size, then include next file into + // this candidate set. // Default: 1 + unsigned int size_ratio; + + // The minimum number of files in a single compaction run. Default: 2 + unsigned int min_merge_width; + + // The maximum number of files in a single compaction run. Default: UINT_MAX + unsigned int max_merge_width; + + // The size amplification is defined as the amount (in percentage) of + // additional storage needed to store a single byte of data in the database. + // For example, a size amplification of 2% means that a database that + // contains 100 bytes of user-data may occupy up to 102 bytes of + // physical storage. By this definition, a fully compacted database has + // a size amplification of 0%. Rocksdb uses the following heuristic + // to calculate size amplification: it assumes that all files excluding + // the earliest file contribute to the size amplification. + // Default: 200, which means that a 100 byte database could require up to + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + // Default: -1 + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Option to optimize the universal multi level compaction by enabling + // trivial move for non overlapping files. + // Default: false + bool allow_trivial_move; + + // EXPERIMENTAL + // If true, try to limit compaction size under max_compaction_bytes. + // This might cause higher write amplification, but can prevent some + // problem caused by large compactions. + // Default: false + bool incremental; + + // Default set of parameters + CompactionOptionsUniversal() + : size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize), + allow_trivial_move(false), + incremental(false) {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/agg_merge.h b/src/rocksdb/include/rocksdb/utilities/agg_merge.h new file mode 100644 index 000000000..4e21082db --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/agg_merge.h @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// The feature is still in development so the encoding format is subject +// to change. +// +// Aggregation Merge Operator is a merge operator that allows users to +// aggregate merge operands of different keys with different registered +// aggregation functions. The aggregation can also change for the same +// key if the functions store the data in the same format. +// The target application highly overlaps with merge operator in general +// but we try to provide a better interface so that users are more likely +// to use pre-implemented plug-in functions and connect with existing +// third-party aggregation functions (such as those from SQL engines). +// In this case, the need for users to write customized C++ plug-in code +// is reduced. +// If the idea proves to useful, we might consider to move it to be +// a core functionality of RocksDB, and reduce the support of merge +// operators. +// +// Users can implement aggregation functions by implementing abstract +// class Aggregator, and register it using AddAggregator(). +// The merge operator can be retrieved from GetAggMergeOperator() and +// it is a singleton. +// +// Users can push values to be updated with a merge operand encoded with +// registered function name and payload using EncodeAggFuncAndPayload(), +// and the merge operator will invoke the aggregation function. +// An example: +// +// // Assume class ExampleSumAggregator is implemented to do simple sum. +// AddAggregator("sum", std::make_unique()); +// std::shared_ptr mp_guard = CreateAggMergeOperator(); +// options.merge_operator = mp_guard.get(); +// ...... // Creating DB +// +// +// std::string encoded_value; +// s = EncodeAggFuncAndPayload(kUnamedFuncName, "200", encoded_value); +// assert(s.ok()); +// db->Put(WriteOptions(), "foo", encoded_value); +// s = EncodeAggFuncAndPayload("sum", "200", encoded_value); +// assert(s.ok()); +// db->Merge(WriteOptions(), "foo", encoded_value); +// s = EncodeAggFuncAndPayload("sum", "200", encoded_value); +// assert(s.ok()); +// db->Merge(WriteOptions(), "foo", encoded_value); +// +// std::string value; +// Status s = db->Get(ReadOptions, "foo", &value); +// assert(s.ok()); +// Slice func, aggregated_value; +// assert(ExtractAggFuncAndValue(value, func, aggregated_value)); +// assert(func == "sum"); +// assert(aggregated_value == "600"); +// +// +// DB::Put() can also be used to add a payloadin the same way as Merge(). +// +// kUnamedFuncName can be used as a placeholder function name. This will +// be aggregated with merge operands inserted later based on function +// name given there. +// +// If the aggregation function is not registered or there is an error +// returned by aggregation function, the result will be encoded with a fake +// aggregation function kErrorFuncName, with each merge operands to be encoded +// into a list that can be extracted using ExtractList(); +// +// If users add a merge operand using a different aggregation function from +// the previous one, the merge operands for the previous one is aggregated +// and the payload part of the result is treated as the first payload of +// the items for the new aggregation function. For example, users can +// Merge("plus, 1"), merge("plus 2"), merge("minus 3") and the aggregation +// result would be "minus 0". +// + +// A class used to aggregate data per key/value. The plug-in function is +// implemented and registered using AddAggregator(). And then use it +// with merge operator created using CreateAggMergeOperator(). +class Aggregator { + public: + virtual ~Aggregator() {} + // The input list is in reverse insertion order, with values[0] to be + // the one inserted last and values.back() to be the one inserted first. + // The oldest one might be from Get(). + // Return whether aggregation succeeded. False for aggregation error. + virtual bool Aggregate(const std::vector& values, + std::string& result) const = 0; + + // True if a partial aggregation should be invoked. Some aggregators + // might opt to skip partial aggregation if possible. + virtual bool DoPartialAggregate() const { return true; } +}; + +// The function adds aggregation plugin by function name. It is used +// by all the aggregation operator created using CreateAggMergeOperator(). +// It's currently not thread safe to run concurrently with the aggregation +// merge operator. It is recommended that all the aggregation function +// is added before calling CreateAggMergeOperator(). +Status AddAggregator(const std::string& function_name, + std::unique_ptr&& agg); + +// Get the singleton instance of merge operator for aggregation. +// Always the same one is returned with a shared_ptr is hold as a +// static variable by the function. +// This is done so because options.merge_operator is shared_ptr. +std::shared_ptr GetAggMergeOperator(); + +// Encode aggregation function and payload that can be consumed by aggregation +// merge operator. +Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload, + std::string& output); +// Helper function to extract aggregation function name and payload. +// Return false if it fails to decode. +bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value); + +// Extract encoded list. This can be used to extract error merge operands when +// the returned function name is kErrorFuncName. +bool ExtractList(const Slice& encoded_list, std::vector& decoded_list); + +// Special function name that allows it to be merged to subsequent type. +extern const std::string kUnnamedFuncName; + +// Special error function name reserved for merging or aggregation error. +extern const std::string kErrorFuncName; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/backup_engine.h b/src/rocksdb/include/rocksdb/utilities/backup_engine.h new file mode 100644 index 000000000..f28ad9618 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/backup_engine.h @@ -0,0 +1,631 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup and + // restore even on a machine crash/reboot. Backup and restore processes are + // slower with sync enabled. If sync == false, we can only guarantee that + // other previously synced backups and restores are not modified while + // creating a new one. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "backup_rate_limiter" + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "restore_rate_limiter" + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. + // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. + // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + // Major schema version to use when writing backup meta files + // 1 (default) - compatible with very old versions of RocksDB. + // 2 - can be read by RocksDB versions >= 6.19.0. Minimum schema version for + // * (Experimental) saving and restoring file temperature metadata + int schema_version = 1; + + // (Experimental - subject to change or removal) When taking a backup and + // saving file temperature info (minimum schema_version is 2), there are + // two potential sources of truth for the placement of files into temperature + // tiers: (a) the current file temperature reported by the FileSystem or + // (b) the expected file temperature recorded in DB manifest. When this + // option is false (default), (b) overrides (a) if both are not UNKNOWN. + // When true, (a) overrides (b) if both are not UNKNOWN. Regardless of this + // setting, a known temperature overrides UNKNOWN. + bool current_temperatures_override_manifest = false; + + void Dump(Logger* logger) const; + + explicit BackupEngineOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX, + ShareFilesNaming _share_files_with_checksum_naming = + static_cast(kUseDbSessionId | kFlagIncludeFileSize)) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(true), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open), + share_files_with_checksum_naming(_share_files_with_checksum_naming) { + assert(share_table_files || !share_files_with_checksum); + assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); + } +}; + +inline BackupEngineOptions::ShareFilesNaming operator&( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert(r == BackupEngineOptions::kMaskNoNamingFlags || + (r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l & r); +} + +inline BackupEngineOptions::ShareFilesNaming operator|( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l | r); +} + +struct CreateBackupOptions { + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + bool flush_before_backup = false; + + // Callback for reporting progress, based on callback_trigger_interval_size. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + std::function progress_callback = []() {}; + + // If false, background_thread_cpu_priority is ignored. + // Otherwise, the cpu priority can be decreased, + // if you try to increase the priority, the priority will not change. + // The initial priority of the threads is CpuPriority::kNormal, + // so you can decrease to priorities lower than kNormal. + bool decrease_background_thread_cpu_priority = false; + CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupEngineOptions::backup_log_files = false for + // persisting in-memory databases. + // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +using BackupID = uint32_t; + +using BackupFileInfo = FileStorageInfo; + +struct BackupInfo { + BackupID backup_id = 0U; + // Creation time, according to GetCurrentTime + int64_t timestamp = 0; + + // Total size in bytes (based on file payloads, not including filesystem + // overheads or backup meta file) + uint64_t size = 0U; + + // Number of backed up files, some of which might be shared with other + // backups. Does not include backup meta file. + uint32_t number_files = 0U; + + // Backup API user metadata + std::string app_metadata; + + // Backup file details, if requested with include_file_details=true + std::vector file_details; + + // DB "name" (a directory in the backup_env) for opening this backup as a + // read-only DB. This should also be used as the DBOptions::wal_dir, such + // as by default setting wal_dir="". See also env_for_open. + // This field is only set if include_file_details=true + std::string name_for_open; + + // An Env(+FileSystem) for opening this backup as a read-only DB, with + // DB::OpenForReadOnly or similar. This field is only set if + // include_file_details=true. (The FileSystem in this Env takes care + // of making shared backup files openable from the `name_for_open` DB + // directory.) See also name_for_open. + // + // This Env might or might not be shared with other backups. To work + // around DBOptions::env being a raw pointer, this is a shared_ptr so + // that keeping either this BackupInfo, the BackupEngine, or a copy of + // this shared_ptr alive is sufficient to keep the Env alive for use by + // a read-only DB. + std::shared_ptr env_for_open; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// Read-only functions of a BackupEngine. (Restore writes to another directory +// not the backup directory.) See BackupEngine comments for details on +// safe concurrent operations. +class BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnlyBase() {} + + // Returns info about the latest good backup in backup_info, or NotFound + // no good backup exists. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetLatestBackupInfo( + BackupInfo* backup_info, bool include_file_details = false) const = 0; + + // Returns info about a specific backup in backup_info, or NotFound + // or Corruption status if the requested backup id does not exist or is + // known corrupt. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about non-corrupt backups in backup_infos. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual void GetBackupInfo(std::vector* backup_infos, + bool include_file_details = false) const = 0; + + // Returns info about corrupt backups in corrupt_backups. + // WARNING: Any write to the BackupEngine could trigger automatic + // GarbageCollect(), which could delete files that would be needed to + // manually recover a corrupt backup or to preserve an unrecognized (e.g. + // incompatible future version) backup. + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const = 0; + + // Restore to specified db_dir and wal_dir from backup_id. + virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id + virtual IOStatus RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromLatestBackup(options, db_dir, wal_dir); + } + + // If verify_with_checksum is true, this function + // inspects the current checksums and file sizes of backup files to see if + // they match our expectation. + // + // If verify_with_checksum is false, this function + // checks that each file exists and that the size of the file matches our + // expectation. It does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes (and current checksum) against the number of bytes written to + // them (and the checksum calculated) during creation. + // Otherwise, it compares the files' current sizes (and checksums) against + // their sizes (and checksums) when the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const = 0; +}; + +// Append-only functions of a BackupEngine. See BackupEngine comment for +// details on distinction between Append and Write operations and safe +// concurrent operations. +class BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngineAppendOnlyBase() {} + + // same as CreateNewBackup, but stores extra application metadata. + virtual IOStatus CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, + const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0; + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackupWithMetadata(options, db, app_metadata); + } + + // Captures the state of the database by creating a new (latest) backup. + // On success (OK status), the BackupID of the new backup is saved to + // *new_backup_id when not nullptr. + // NOTE: db_paths and cf_paths are not supported for creating backups, + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. + virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db, + BackupID* new_backup_id = nullptr) { + return CreateNewBackupWithMetadata(options, db, "", new_backup_id); + } + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackup( + DB* db, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackup(options, db); + } + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). This does not delete corrupt backups but can delete files that + // would be needed to manually recover a corrupt backup or to preserve an + // unrecognized (e.g. incompatible future version) backup. + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + // NOTE2: For concurrency and interference purposes (see BackupEngine + // comment), GarbageCollect (GC) is like other Append operations, even + // though it seems different. Although GC can delete physical data, it does + // not delete any logical data read by Read operations. GC can interfere + // with Append or Write operations in another BackupEngine on the same + // backup_dir, because temporary files will be treated as obsolete and + // deleted. + virtual IOStatus GarbageCollect() = 0; +}; + +// A backup engine for organizing and managing backups. +// This class is not user-extensible. +// +// This class declaration adds "Write" operations in addition to the +// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase. +// +// # Concurrency between threads on the same BackupEngine* object +// +// As of version 6.20, BackupEngine* operations are generally thread-safe, +// using a read-write lock, though single-thread operation is still +// recommended to avoid TOCTOU bugs. Specifically, particular kinds of +// concurrent operations behave like this: +// +// op1\op2| Read | Append | Write +// -------|-------|--------|-------- +// Read | conc | block | block +// Append | block | block | block +// Write | block | block | block +// +// conc = operations safely proceed concurrently +// block = one of the operations safely blocks until the other completes. +// There is generally no guarantee as to which completes first. +// +// StopBackup is the only operation that affects an ongoing operation. +// +// # Interleaving operations between BackupEngine* objects open on the +// same backup_dir +// +// It is recommended only to have one BackupEngine* object open for a given +// backup_dir, but it is possible to mix / interleave some operations +// (regardless of whether they are concurrent) with these caveats: +// +// op1\op2| Open | Read | Append | Write +// -------|--------|--------|--------|-------- +// Open | conc | conc | atomic | unspec +// Read | conc | conc | old | unspec +// Append | atomic | old | unspec | unspec +// Write | unspec | unspec | unspec | unspec +// +// Special case: Open with destroy_old_data=true is really a Write +// +// conc = operations safely proceed, concurrently when applicable +// atomic = operations are effectively atomic; if a concurrent Append +// operation has not completed at some key point during Open, the +// opened BackupEngine* will never see the result of the Append op. +// old = Read operations do not include any state changes from other +// BackupEngine* objects; they return the state at their Open time. +// unspec = Behavior is unspecified, including possibly trashing the +// backup_dir, but is "memory safe" (no C++ undefined behavior) +// +class BackupEngine : public BackupEngineReadOnlyBase, + public BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngine() {} + + // BackupEngineOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngine** backup_engine_ptr); + + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngine** backup_engine_ptr) { + return BackupEngine::Open(options, db_env, backup_engine_ptr); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual IOStatus DeleteBackup(BackupID backup_id) = 0; +}; + +// A variant of BackupEngine that only allows "Read" operations. See +// BackupEngine comment for details. This class is not user-extensible. +class BackupEngineReadOnly : public BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnly() {} + + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngineReadOnly** backup_engine_ptr); + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngineReadOnly** backup_engine_ptr) { + return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h new file mode 100644 index 000000000..fde03db7e --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h @@ -0,0 +1,142 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +// The classes and functions in this header file is used for dumping out the +// blocks in a block cache, storing or transfering the blocks to another +// destination host, and load these blocks to the secondary cache at destination +// host. +// NOTE that: The classes, functions, and data structures are EXPERIMENTAL! They +// my be changed in the future when the development continues. + +// The major and minor version number of the data format to be stored/trandfered +// via CacheDumpWriter and read out via CacheDumpReader +static const int kCacheDumpMajorVersion = 0; +static const int kCacheDumpMinorVersion = 1; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to write or transfer the data that is created by +// CacheDumper. We pack one block with its block type, dump time, block key in +// the block cache, block len, block crc32c checksum and block itself as a unit +// and it is stored via WritePacket. Before we call WritePacket, we must call +// WriteMetadata once, which stores the sequence number, block unit checksum, +// and block unit size. +// We provide file based CacheDumpWriter to store the metadata and its package +// sequentially in a file as the defualt implementation. Users can implement +// their own CacheDumpWriter to store/transfer the data. For example, user can +// create a subclass which transfer the metadata and package on the fly. +class CacheDumpWriter { + public: + virtual ~CacheDumpWriter() = default; + + // Called ONCE before the calls to WritePacket + virtual IOStatus WriteMetadata(const Slice& metadata) = 0; + virtual IOStatus WritePacket(const Slice& data) = 0; + virtual IOStatus Close() = 0; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to read or receive the data that is stored +// or transfered by CacheDumpWriter. Note that, ReadMetadata must be called +// once before we call a ReadPacket. +class CacheDumpReader { + public: + virtual ~CacheDumpReader() = default; + // Called ONCE before the calls to ReadPacket + virtual IOStatus ReadMetadata(std::string* metadata) = 0; + // Sets data to empty string on EOF + virtual IOStatus ReadPacket(std::string* data) = 0; + // (Close not needed) +}; + +// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any +// dump or load process related control variables can be added here. +struct CacheDumpOptions { + SystemClock* clock; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This the class to dump out the block in the block cache, store/transfer them +// via CacheDumpWriter. In order to dump out the blocks belonging to a certain +// DB or a list of DB (block cache can be shared by many DB), user needs to call +// SetDumpFilter to specify a list of DB to filter out the blocks that do not +// belong to those DB. +// A typical use case is: when we migrate a DB instance from host A to host B. +// We need to reopen the DB at host B after all the files are copied to host B. +// At this moment, the block cache at host B does not have any block from this +// migrated DB. Therefore, the read performance can be low due to cache warm up. +// By using CacheDumper before we shut down the DB at host A and using +// CacheDumpedLoader at host B before we reopen the DB, we can warmup the cache +// ahead. This function can be used in other use cases also. +class CacheDumper { + public: + virtual ~CacheDumper() = default; + // Only dump the blocks in the block cache that belong to the DBs in this list + virtual Status SetDumpFilter(std::vector db_list) { + (void)db_list; + return Status::NotSupported("SetDumpFilter is not supported"); + } + // The main function to dump out all the blocks that satisfy the filter + // condition from block cache to a certain CacheDumpWriter in one shot. This + // process may take some time. + virtual IOStatus DumpCacheEntriesToWriter() { + return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported"); + } +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is the class to load the dumped blocks to the destination cache. For now +// we only load the blocks to the SecondaryCache. In the future, we may plan to +// support loading to the block cache. +class CacheDumpedLoader { + public: + virtual ~CacheDumpedLoader() = default; + virtual IOStatus RestoreCacheEntriesToSecondaryCache() { + return IOStatus::NotSupported( + "RestoreCacheEntriesToSecondaryCache is not supported"); + } +}; + +// Get the writer which stores all the metadata and data sequentially to a file +IOStatus NewToFileCacheDumpWriter(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* writer); + +// Get the reader which read out the metadata and data sequentially from a file +IOStatus NewFromFileCacheDumpReader(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* reader); + +// Get the default cache dumper +Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer, + std::unique_ptr* cache_dumper); + +// Get the default cache dump loader +Status NewDefaultCacheDumpedLoader( + const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& toptions, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader, + std::unique_ptr* cache_dump_loader); + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h new file mode 100644 index 000000000..ecf920616 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A checkpoint is an openable snapshot of a database at a point in time. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; +class ColumnFamilyHandle; +struct LiveFileMetaData; +struct ExportImportFilesMetaData; + +class Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable snapshots + static Status Create(DB* db, Checkpoint** checkpoint_ptr); + + // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an + // absolute path. The specified directory should not exist, since it will be + // created by the API. + // When a checkpoint is created, + // (1) SST and blob files are hard linked if the output directory is on the + // same filesystem as the database, and copied otherwise. + // (2) other required files (like MANIFEST) are always copied. + // log_size_for_flush: if the total log file size is equal or larger than + // this value, then a flush is triggered for all the column families. The + // default value is 0, which means flush is always triggered. If you move + // away from the default, the checkpoint may not contain up-to-date data + // if WAL writing is not always enabled. + // Flush will always trigger if it is 2PC. + // sequence_number_ptr: if it is not nullptr, the value it points to will be + // set to a sequence number guaranteed to be part of the DB, not necessarily + // the latest. The default value of this parameter is nullptr. + // NOTE: db_paths and cf_paths are not supported for creating checkpoints + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. + virtual Status CreateCheckpoint(const std::string& checkpoint_dir, + uint64_t log_size_for_flush = 0, + uint64_t* sequence_number_ptr = nullptr); + + // Exports all live SST files of a specified Column Family onto export_dir, + // returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + + virtual ~Checkpoint() {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h new file mode 100644 index 000000000..f61afd69e --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/convenience.h @@ -0,0 +1,10 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +// This file was moved to rocksdb/convenience.h" + +#include "rocksdb/convenience.h" diff --git a/src/rocksdb/include/rocksdb/utilities/customizable_util.h b/src/rocksdb/include/rocksdb/utilities/customizable_util.h new file mode 100644 index 000000000..62240763b --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/customizable_util.h @@ -0,0 +1,377 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// The methods in this file are used to instantiate new Customizable +// instances of objects. These methods are most typically used by +// the "CreateFromString" method of a customizable class. +// If not developing a new Type of customizable class, you probably +// do not need the methods in this file. +// +// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects +// for more information on how to develop and use customizable objects + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +// The FactoryFunc functions are used to create a new customizable object +// without going through the ObjectRegistry. This methodology is especially +// useful in LITE mode, where there is no ObjectRegistry. The methods take +// in an ID of the object to create and a pointer to store the created object. +// If the factory successfully recognized the input ID, the method should return +// success; otherwise false should be returned. On success, the object +// parameter contains the new object. +template +using SharedFactoryFunc = + std::function*)>; + +template +using UniqueFactoryFunc = + std::function*)>; + +template +using StaticFactoryFunc = std::function; + +// Creates a new shared customizable instance object based on the +// input parameters using the object registry. +// +// The id parameter specifies the instance class of the object to create. +// The opt_map parameter specifies the configuration of the new instance. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. +template +static Status NewSharedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new managed customizable instance object based on the +// input parameters using the object registry. Unlike "shared" objects, +// managed objects are limited to a single instance per ID. +// +// The id parameter specifies the instance class of the object to create. +// If an object with this id exists in the registry, the existing object +// will be returned. If the object does not exist, a new one will be created. +// +// The opt_map parameter specifies the configuration of the new instance. +// If the object already exists, the existing object is returned "as is" and +// this parameter is ignored. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the object. This string +// will be used by the object registry to locate the appropriate object to +// create or return. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The managed instance. +template +static Status NewManagedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + Status status; + if (!id.empty()) { +#ifndef ROCKSDB_LITE + status = config_options.registry->GetOrCreateManagedObject( + id, result, [config_options, opt_map](T* object) { + return object->ConfigureFromMap(config_options, opt_map); + }); +#else + (void)result; + (void)opt_map; + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } + } else { + status = Status::NotSupported("Cannot reset object "); + } + return status; +} + +// Creates a new shared Customizable object based on the input parameters. +// This method parses the input value to determine the type of instance to +// create. If there is an existing instance (in result) and it is the same ID +// as the object being created, the existing configuration is stored and used as +// the default for the new object. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadSharedObject(const ConfigOptions& config_options, + const std::string& value, + const SharedFactoryFunc& func, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewSharedObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new shared Customizable object based on the input parameters. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The "id" field from the value (either the whole field or "id=XX") is used +// to determine the type/id of the object to return. For a given id, there +// the same instance of the object will be returned from this method (as opposed +// to LoadSharedObject which would create different objects for the same id. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadManagedObject(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, nullptr, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else { + return NewManagedObject(config_options, id, opt_map, result); + } +} + +// Creates a new unique pointer customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. +template +static Status NewUniqueObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::unique_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewUniqueObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new unique customizable instance object based on the input +// parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadUniqueObject(const ConfigOptions& config_options, + const std::string& value, + const UniqueFactoryFunc& func, + std::unique_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewUniqueObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. +template +static Status NewStaticObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, T** result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = + Customizable::ConfigureNewObject(config_options, *result, opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + *result = nullptr; + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadStaticObject(const ConfigOptions& config_options, + const std::string& value, + const StaticFactoryFunc& func, T** result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, *result, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewStaticObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, *result, opt_map); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h new file mode 100644 index 000000000..d57e7473a --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Database with TTL support. +// +// USE-CASES: +// This API should be used to open the db when key-values inserted are +// meant to be removed from the db in a non-strict 'ttl' amount of time +// Therefore, this guarantees that key-values inserted will remain in the +// db for >= ttl amount of time and the db will make efforts to remove the +// key-values as soon as possible after ttl seconds of their insertion. +// +// BEHAVIOUR: +// TTL is accepted in seconds +// (int32_t)Timestamp(creation) is suffixed to values in Put internally +// Expired TTL values deleted in compaction only:(Timestamp+ttl=5 +// read_only=true opens in the usual read-only mode. Compactions will not be +// triggered(neither manual nor automatic), so no expired entries removed +// +// CONSTRAINTS: +// Not specifying/passing or non-positive TTL behaves like TTL = infinity +// +// !!!WARNING!!!: +// Calling DB::Open directly to re-open a db created by this API will get +// corrupt values(timestamp suffixed) and no ttl effect will be there +// during the second Open, so use this API consistently to open the db +// Be careful when passing ttl with a small positive value because the +// whole database may be deleted in a small amount of time + +class DBWithTTL : public StackableDB { + public: + virtual Status CreateColumnFamilyWithTtl( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle, int ttl) = 0; + + static Status Open(const Options& options, const std::string& dbname, + DBWithTTL** dbptr, int32_t ttl = 0, + bool read_only = false); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + DBWithTTL** dbptr, const std::vector& ttls, + bool read_only = false); + + virtual void SetTtl(int32_t ttl) = 0; + + virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0; + + protected: + explicit DBWithTTL(DB* db) : StackableDB(db) {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h new file mode 100644 index 000000000..0e0526557 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/debug.h @@ -0,0 +1,48 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/db.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// Data associated with a particular version of a key. A database may internally +// store multiple versions of a same user key due to snapshots, compaction not +// happening yet, etc. +struct KeyVersion { + KeyVersion() : user_key(""), value(""), sequence(0), type(0) {} + + KeyVersion(const std::string& _user_key, const std::string& _value, + SequenceNumber _sequence, int _type) + : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {} + + std::string user_key; + std::string value; + SequenceNumber sequence; + int type; + std::string GetTypeName() const; +}; + +// Returns listing of all versions of keys in the provided user key range. +// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or +// `max_num_ikeys` has been reached. Since all those keys returned will be +// copied to memory, if the range covers too many keys, the memory usage +// may be huge. `max_num_ikeys` can be used to cap the memory usage. +// The result is inserted into the provided vector, `key_versions`. +Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, + std::vector* key_versions); + +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h new file mode 100644 index 000000000..ffde5effa --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h @@ -0,0 +1,181 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2015, Red Hat, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// MirrorEnv is an Env implementation that mirrors all file-related +// operations to two backing Env's (provided at construction time). +// Writes are mirrored. For read operations, we do the read from both +// backends and assert that the results match. +// +// This is useful when implementing a new Env and ensuring that the +// semantics and behavior are correct (in that they match that of an +// existing, stable Env, like the default POSIX one). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +class SequentialFileMirror; +class RandomAccessFileMirror; +class WritableFileMirror; + +class EnvMirror : public EnvWrapper { + Env *a_, *b_; + bool free_a_, free_b_; + + public: + EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false) + : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {} + ~EnvMirror() { + if (free_a_) delete a_; + if (free_b_) delete b_; + } + + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) override; + virtual Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { + std::unique_ptr br; + Status as = a_->NewDirectory(name, result); + Status bs = b_->NewDirectory(name, &br); + assert(as == bs); + return as; + } + Status FileExists(const std::string& f) override { + Status as = a_->FileExists(f); + Status bs = b_->FileExists(f); + assert(as == bs); + return as; + } +#if defined(_MSC_VER) +#pragma warning(push) +// logical operation on address of string constant +#pragma warning(disable : 4130) +#endif + Status GetChildren(const std::string& dir, + std::vector* r) override { + std::vector ar, br; + Status as = a_->GetChildren(dir, &ar); + Status bs = b_->GetChildren(dir, &br); + assert(as == bs); + std::sort(ar.begin(), ar.end()); + std::sort(br.begin(), br.end()); + if (!as.ok() || ar != br) { + assert(0 == "getchildren results don't match"); + } + *r = ar; + return as; + } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + Status DeleteFile(const std::string& f) override { + Status as = a_->DeleteFile(f); + Status bs = b_->DeleteFile(f); + assert(as == bs); + return as; + } + Status CreateDir(const std::string& d) override { + Status as = a_->CreateDir(d); + Status bs = b_->CreateDir(d); + assert(as == bs); + return as; + } + Status CreateDirIfMissing(const std::string& d) override { + Status as = a_->CreateDirIfMissing(d); + Status bs = b_->CreateDirIfMissing(d); + assert(as == bs); + return as; + } + Status DeleteDir(const std::string& d) override { + Status as = a_->DeleteDir(d); + Status bs = b_->DeleteDir(d); + assert(as == bs); + return as; + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + uint64_t asize, bsize; + Status as = a_->GetFileSize(f, &asize); + Status bs = b_->GetFileSize(f, &bsize); + assert(as == bs); + assert(!as.ok() || asize == bsize); + *s = asize; + return as; + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + uint64_t amtime, bmtime; + Status as = a_->GetFileModificationTime(fname, &amtime); + Status bs = b_->GetFileModificationTime(fname, &bmtime); + assert(as == bs); + assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000); + *file_mtime = amtime; + return as; + } + + Status RenameFile(const std::string& s, const std::string& t) override { + Status as = a_->RenameFile(s, t); + Status bs = b_->RenameFile(s, t); + assert(as == bs); + return as; + } + + Status LinkFile(const std::string& s, const std::string& t) override { + Status as = a_->LinkFile(s, t); + Status bs = b_->LinkFile(s, t); + assert(as == bs); + return as; + } + + class FileLockMirror : public FileLock { + public: + FileLock *a_, *b_; + FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {} + }; + + Status LockFile(const std::string& f, FileLock** l) override { + FileLock *al, *bl; + Status as = a_->LockFile(f, &al); + Status bs = b_->LockFile(f, &bl); + assert(as == bs); + if (as.ok()) *l = new FileLockMirror(al, bl); + return as; + } + + Status UnlockFile(FileLock* l) override { + FileLockMirror* ml = static_cast(l); + Status as = a_->UnlockFile(ml->a_); + Status bs = b_->UnlockFile(ml->b_); + assert(as == bs); + delete ml; + return as; + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h new file mode 100644 index 000000000..824f8a3df --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// This function can be used to list the Information logs, +// given the db pointer. +Status GetInfoLogList(DB* db, std::vector* info_log_list); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h new file mode 100644 index 000000000..007638192 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h @@ -0,0 +1,318 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/ldb_tool.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/ldb_cmd_execute_result.h" + +namespace ROCKSDB_NAMESPACE { + +class LDBCommand { + public: + // Command-line arguments + static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; + static const std::string ARG_DB; + static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; + static const std::string ARG_HEX; + static const std::string ARG_KEY_HEX; + static const std::string ARG_VALUE_HEX; + static const std::string ARG_CF_NAME; + static const std::string ARG_TTL; + static const std::string ARG_TTL_START; + static const std::string ARG_TTL_END; + static const std::string ARG_TIMESTAMP; + static const std::string ARG_TRY_LOAD_OPTIONS; + static const std::string ARG_IGNORE_UNKNOWN_OPTIONS; + static const std::string ARG_FROM; + static const std::string ARG_TO; + static const std::string ARG_MAX_KEYS; + static const std::string ARG_BLOOM_BITS; + static const std::string ARG_FIX_PREFIX_LEN; + static const std::string ARG_COMPRESSION_TYPE; + static const std::string ARG_COMPRESSION_MAX_DICT_BYTES; + static const std::string ARG_BLOCK_SIZE; + static const std::string ARG_AUTO_COMPACTION; + static const std::string ARG_DB_WRITE_BUFFER_SIZE; + static const std::string ARG_WRITE_BUFFER_SIZE; + static const std::string ARG_FILE_SIZE; + static const std::string ARG_CREATE_IF_MISSING; + static const std::string ARG_NO_VALUE; + static const std::string ARG_DISABLE_CONSISTENCY_CHECKS; + static const std::string ARG_ENABLE_BLOB_FILES; + static const std::string ARG_MIN_BLOB_SIZE; + static const std::string ARG_BLOB_FILE_SIZE; + static const std::string ARG_BLOB_COMPRESSION_TYPE; + static const std::string ARG_ENABLE_BLOB_GARBAGE_COLLECTION; + static const std::string ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF; + static const std::string ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD; + static const std::string ARG_BLOB_COMPACTION_READAHEAD_SIZE; + static const std::string ARG_BLOB_FILE_STARTING_LEVEL; + static const std::string ARG_PREPOPULATE_BLOB_CACHE; + static const std::string ARG_DECODE_BLOB_INDEX; + static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS; + + struct ParsedParams { + std::string cmd; + std::vector cmd_params; + std::map option_map; + std::vector flags; + }; + + static LDBCommand* SelectCommand(const ParsedParams& parsed_parms); + + static LDBCommand* InitFromCmdLineArgs( + const std::vector& args, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families, + const std::function& selector = + SelectCommand); + + static LDBCommand* InitFromCmdLineArgs( + int argc, char const* const* argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families); + + bool ValidateCmdLineOptions(); + + virtual void PrepareOptions(); + + virtual void OverrideBaseOptions(); + + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); + + virtual void SetDBOptions(Options options) { options_ = options; } + + virtual void SetColumnFamilies( + const std::vector* column_families) { + if (column_families != nullptr) { + column_families_ = *column_families; + } else { + column_families_.clear(); + } + } + + void SetLDBOptions(const LDBOptions& ldb_options) { + ldb_options_ = ldb_options; + } + + const std::map& TEST_GetOptionMap() { + return option_map_; + } + + const std::vector& TEST_GetFlags() { return flags_; } + + virtual bool NoDBOpen() { return false; } + + virtual ~LDBCommand() { CloseDB(); } + + /* Run the command, and return the execute result. */ + void Run(); + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { return exec_state_; } + + void ClearPreviousRunState() { exec_state_.Reset(); } + + // Consider using Slice::DecodeHex directly instead if you don't need the + // 0x prefix + static std::string HexToString(const std::string& str); + + // Consider using Slice::ToString(true) directly instead if + // you don't need the 0x prefix + static std::string StringToHex(const std::string& str); + + static const char* DELIM; + + protected: + LDBCommandExecuteResult exec_state_; + std::string env_uri_; + std::string fs_uri_; + std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb wll leave the source directory completely intact. + std::string secondary_path_; + std::string column_family_name_; + DB* db_; + DBWithTTL* db_ttl_; + std::map cf_handles_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + // If true, try to construct options from DB's option files. + bool try_load_options_; + + // The value passed to options.force_consistency_checks. + bool force_consistency_checks_; + + bool enable_blob_files_; + + bool enable_blob_garbage_collection_; + + bool create_if_missing_; + + /** + * Map of options passed on the command-line. + */ + const std::map option_map_; + + /** + * Flags passed on the command-line. + */ + const std::vector flags_; + + /** List of command-line options valid for this command */ + const std::vector valid_cmd_line_options_; + + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr env_guard_; + + bool ParseKeyValue(const std::string& line, std::string* key, + std::string* value, bool is_key_hex, bool is_value_hex); + + LDBCommand(const std::map& options, + const std::vector& flags, bool is_read_only, + const std::vector& valid_cmd_line_options); + + void OpenDB(); + + void CloseDB(); + + ColumnFamilyHandle* GetCfHandle(); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex); + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const std::vector& flags, + const std::string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static std::string HelpRangeCmdArgs(); + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + static std::vector BuildCmdLineOptions( + std::vector options); + + bool ParseIntOption(const std::map& options, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state); + + bool ParseDoubleOption(const std::map& options, + const std::string& option, double& value, + LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const std::map& options, + const std::string& option, std::string* value); + + bool ParseCompressionTypeOption( + const std::map& options, + const std::string& option, CompressionType& value, + LDBCommandExecuteResult& exec_state); + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map& options, + const std::string& option, bool default_val); + + Options options_; + std::vector column_families_; + ConfigOptions config_options_; + LDBOptions ldb_options_; + + private: + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const std::map& options, + const std::vector& flags); + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. + */ + bool IsValueHex(const std::map& options, + const std::vector& flags); + + bool IsTryLoadOptions(const std::map& options, + const std::vector& flags); + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(std::string val); +}; + +class LDBCommandRunner { + public: + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name, + bool to_stderr = true); + + // Returns the status code to return. 0 is no error. + static int RunCommand( + int argc, char const* const* argv, Options options, + const LDBOptions& ldb_options, + const std::vector* column_families); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h new file mode 100644 index 000000000..57bac3346 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +#ifdef FAILED +#undef FAILED +#endif + +namespace ROCKSDB_NAMESPACE { + +class LDBCommandExecuteResult { + public: + enum State { + EXEC_NOT_STARTED = 0, + EXEC_SUCCEED = 1, + EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} + + LDBCommandExecuteResult(State state, std::string& msg) + : state_(state), message_(msg) {} + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { return state_ == EXEC_SUCCEED; } + + bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; } + + bool IsFailed() { return state_ == EXEC_FAILED; } + + static LDBCommandExecuteResult Succeed(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult Failed(std::string msg) { + return LDBCommandExecuteResult(EXEC_FAILED, msg); + } + + private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h new file mode 100644 index 000000000..7e4a6faa4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +struct Options; +class Snapshot; + +// Options to control the behavior of a database (passed to +// DB::Open). A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. +struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + + // Create a LevelDBOptions object with default values for all fields. + LevelDBOptions(); +}; + +// Converts a LevelDBOptions object into a RocksDB Options object. +Options ConvertOptions(const LevelDBOptions& leveldb_options); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h new file mode 100644 index 000000000..f617da02b --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h @@ -0,0 +1,43 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifdef LUA + +// lua headers +extern "C" { +#include +#include +#include +} + +namespace ROCKSDB_NAMESPACE { +namespace lua { +// A class that used to define custom C Library that is callable +// from Lua script +class RocksLuaCustomLibrary { + public: + virtual ~RocksLuaCustomLibrary() {} + // The name of the C library. This name will also be used as the table + // (namespace) in Lua that contains the C library. + virtual const char* Name() const = 0; + + // Returns a "static const struct luaL_Reg[]", which includes a list of + // C functions. Note that the last entry of this static array must be + // {nullptr, nullptr} as required by Lua. + // + // More details about how to implement Lua C libraries can be found + // in the official Lua document http://www.lua.org/pil/26.2.html + virtual const struct luaL_Reg* Lib() const = 0; + + // A function that will be called right after the library has been created + // and pushed on the top of the lua_State. This custom setup function + // allows developers to put additional table or constant values inside + // the same table / namespace. + virtual void CustomSetup(lua_State* /*L*/) const {} +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h new file mode 100644 index 000000000..3427b65ef --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h @@ -0,0 +1,55 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +// lua headers +extern "C" { +#include +#include +#include +} + +#ifdef LUA +#include +#include + +#include "rocksdb/utilities/lua/rocks_lua_custom_library.h" + +namespace ROCKSDB_NAMESPACE { +namespace lua { +class LuaStateWrapper { + public: + explicit LuaStateWrapper(const std::string& lua_script) { + lua_state_ = luaL_newstate(); + Init(lua_script, {}); + } + LuaStateWrapper( + const std::string& lua_script, + const std::vector>& libraries) { + lua_state_ = luaL_newstate(); + Init(lua_script, libraries); + } + lua_State* GetLuaState() const { return lua_state_; } + ~LuaStateWrapper() { lua_close(lua_state_); } + + private: + void Init( + const std::string& lua_script, + const std::vector>& libraries) { + if (lua_state_) { + luaL_openlibs(lua_state_); + for (const auto& library : libraries) { + luaL_openlib(lua_state_, library->Name(), library->Lib(), 0); + library->CustomSetup(lua_state_); + } + luaL_dostring(lua_state_, lua_script.c_str()); + } + } + + lua_State* lua_state_; +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h new file mode 100644 index 000000000..4f1606b51 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the current memory usage of the specified DB instances. +class MemoryUtil { + public: + enum UsageType : int { + // Memory usage of all the mem-tables. + kMemTableTotal = 0, + // Memory usage of those un-flushed mem-tables. + kMemTableUnFlushed = 1, + // Memory usage of all the table readers. + kTableReadersTotal = 2, + // Memory usage by Cache. + kCacheTotal = 3, + kNumUsageTypes = 4 + }; + + // Returns the approximate memory usage of different types in the input + // list of DBs and Cache set. For instance, in the output map + // usage_by_type, usage_by_type[kMemTableTotal] will store the memory + // usage of all the mem-tables from all the input rocksdb instances. + // + // Note that for memory usage inside Cache class, we will + // only report the usage of the input "cache_set" without + // including those Cache usage inside the input list "dbs" + // of DBs. + static Status GetApproximateMemoryUsageByType( + const std::vector& dbs, + const std::unordered_set cache_set, + std::map* usage_by_type); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h new file mode 100644 index 000000000..3bafb837c --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h @@ -0,0 +1,585 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Customizable; +class Logger; +class ObjectLibrary; + +// Returns a new T when called with a string. Populates the std::unique_ptr +// argument if granting ownership to caller. +template +using FactoryFunc = + std::function*, std::string*)>; + +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + +template +using ConfigureFunc = std::function; + +class ObjectLibrary { + private: + // Base class for an Entry in the Registry. + class Entry { + public: + virtual ~Entry() {} + virtual bool Matches(const std::string& target) const = 0; + virtual const char* Name() const = 0; + }; + + public: + // Class for matching target strings to a pattern. + // Entries consist of a name that starts the pattern and attributes + // The following attributes can be added to the entry: + // -Suffix: Comparable to name(suffix) + // -Separator: Comparable to name(separator).+ or name(separator).* + // -Number: Comparable to name(separator).[0-9]+ + // -AltName: Comparable to (name|alt) + // -Optional: Comparable to name(separator)? + // Multiple separators can be combined and cause multiple matches. + // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#") + // is roughly equivalent to "(A|B)@.+#.+" + // + // Note that though this class does provide some regex-style matching, + // it is not a full regex parser and has some key differences: + // - Separators are matched left-most. For example, an entry + // Name("Hello").AddSeparator(" ").AddSuffix("!") would match + // "Hello world!", but not "Hello world!!" + // - No backtracking is necessary, enabling reliably efficient matching + class PatternEntry : public Entry { + private: + enum Quantifier { + kMatchZeroOrMore, // [suffix].* + kMatchAtLeastOne, // [suffix].+ + kMatchExact, // [suffix] + kMatchInteger, // [suffix][0-9]+ + kMatchDecimal, // [suffix][0-9]+[.][0-9]+ + }; + + public: + // Short-cut for creating an entry that matches to a + // Customizable::IndividualId + static PatternEntry AsIndividualId(const std::string& name) { + PatternEntry entry(name, true); + entry.AddSeparator("@"); + entry.AddSeparator("#"); + return entry; + } + + // Creates a new PatternEntry for "name". If optional is true, + // Matches will also return true if name==target + explicit PatternEntry(const std::string& name, bool optional = true) + : name_(name), optional_(optional), slength_(0) { + nlength_ = name_.size(); + } + + // Adds a suffix (exact match of separator with no trailing characters) to + // the separator + PatternEntry& AddSuffix(const std::string& suffix) { + separators_.emplace_back(suffix, kMatchExact); + slength_ += suffix.size(); + return *this; + } + + // Adds a separator (exact match of separator with trailing characters) to + // the entry + // If at_least_one is true, the separator must be followed by at least + // one character (e.g. separator.+). + // If at_least_one is false, the separator may be followed by zero or + // more characters (e.g. separator.*). + PatternEntry& AddSeparator(const std::string& separator, + bool at_least_one = true) { + slength_ += separator.size(); + if (at_least_one) { + separators_.emplace_back(separator, kMatchAtLeastOne); + ++slength_; + } else { + separators_.emplace_back(separator, kMatchZeroOrMore); + } + return *this; + } + + // Adds a separator (exact match of separator with trailing numbers) to the + // entry + PatternEntry& AddNumber(const std::string& separator, bool is_int = true) { + separators_.emplace_back(separator, + (is_int) ? kMatchInteger : kMatchDecimal); + slength_ += separator.size() + 1; + return *this; + } + + // Sets another name that this entry will match, similar to (name|alt) + PatternEntry& AnotherName(const std::string& alt) { + names_.emplace_back(alt); + return *this; + } + + // Sets whether the separators are required -- similar to name(separator)? + // If optional is true, then name(separator)? would match + // If optional is false, then the separators must also match + PatternEntry& SetOptional(bool optional) { + optional_ = optional; + return *this; + } + + // Checks to see if the target matches this entry + bool Matches(const std::string& target) const override; + const char* Name() const override { return name_.c_str(); } + + private: + size_t MatchSeparatorAt(size_t start, Quantifier mode, + const std::string& target, size_t tlen, + const std::string& pattern) const; + + bool MatchesTarget(const std::string& name, size_t nlen, + const std::string& target, size_t ylen) const; + std::string name_; // The base name for this entry + size_t nlength_; // The length of name_ + std::vector names_; // Alternative names for this entry + bool optional_; // Whether matching of separators is required + size_t slength_; // The minimum required length to match the separators + std::vector> + separators_; // What to match + }; // End class Entry + + private: + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(Entry* e, FactoryFunc f) + : entry_(e), factory_(std::move(f)) {} + bool Matches(const std::string& target) const override { + return entry_->Matches(target); + } + const char* Name() const override { return entry_->Name(); } + + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } + const FactoryFunc& GetFactory() const { return factory_; } + + private: + std::unique_ptr entry_; // What to match for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } + + // Finds the factory function for the input target. + // @see PatternEntry for the matching rules to target + // @return If matched, the FactoryFunc for this target, else nullptr + template + FactoryFunc FindFactory(const std::string& target) const { + std::unique_lock lock(mu_); + auto factories = factories_.find(T::Type()); + if (factories != factories_.end()) { + for (const auto& e : factories->second) { + if (e->Matches(target)) { + const auto* fe = + static_cast*>(e.get()); + return fe->GetFactory(); + } + } + } + return nullptr; + } + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(size_t* num_types) const; + + // Returns the number of factories registered for this library + // for the input type. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(const std::string& type) const; + + // Returns the registered factory names for the input type + // names is updated to include the names for the type + void GetFactoryNames(const std::string& type, + std::vector* names) const; + + void GetFactoryTypes(std::unordered_set* types) const; + + void Dump(Logger* logger) const; + + // Registers the factory with the library for the name. + // If name==target, the factory may be used to create a new object. + template + const FactoryFunc& AddFactory(const std::string& name, + const FactoryFunc& func) { + std::unique_ptr entry( + new FactoryEntry(new PatternEntry(name), func)); + AddFactoryEntry(T::Type(), std::move(entry)); + return func; + } + + // Registers the factory with the library for the entry. + // If the entry matches the target, the factory may be used to create a new + // object. + // @see PatternEntry for the matching rules. + // NOTE: This function replaces the old ObjectLibrary::Register() + template + const FactoryFunc& AddFactory(const PatternEntry& entry, + const FactoryFunc& func) { + std::unique_ptr factory( + new FactoryEntry(new PatternEntry(entry), func)); + AddFactoryEntry(T::Type(), std::move(factory)); + return func; + } + + // Invokes the registrar function with the supplied arg for this library. + int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); + + private: + void AddFactoryEntry(const char* type, std::unique_ptr&& entry) { + std::unique_lock lock(mu_); + auto& factories = factories_[type]; + factories.emplace_back(std::move(entry)); + } + + // Protects the entry map + mutable std::mutex mu_; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> + factories_; + + // The name for this library + std::string id_; +}; + +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + explicit ObjectRegistry(const std::shared_ptr& library); + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + AddLibrary(library); + return library; + } + + void AddLibrary(const std::shared_ptr& library) { + std::unique_lock lock(library_mutex_); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); + } + + // Finds the factory for target and instantiates a new T. + // Returns NotSupported if no factory is found + // Returns InvalidArgument if a factory is found but the factory failed. + template + Status NewObject(const std::string& target, T** object, + std::unique_ptr* guard) { + assert(guard != nullptr); + guard->reset(); + auto factory = FindFactory(target); + if (factory != nullptr) { + std::string errmsg; + *object = factory(target, guard, &errmsg); + if (*object != nullptr) { + return Status::OK(); + } else if (errmsg.empty()) { + return Status::InvalidArgument( + std::string("Could not load ") + T::Type(), target); + } else { + return Status::InvalidArgument(errmsg, target); + } + } else { + return Status::NotSupported(std::string("Could not load ") + T::Type(), + target); + } + } + // Creates a new unique T using the input factory functions. + // Returns OK if a new unique T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + T* ptr = nullptr; + std::unique_ptr guard; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new shared T using the input factory functions. + // Returns OK if a new shared T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::unique_ptr guard; + T* ptr = nullptr; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotSupported if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::unique_ptr guard; + T* ptr = nullptr; + Status s = NewObject(target, &ptr, &guard); + if (!s.ok()) { + return s; + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); + } + } + + // Sets the object for the given id/type to be the input object + // If the registry does not contain this id/type, the object is added and OK + // is returned. If the registry contains a different object, an error is + // returned. If the registry contains the input object, OK is returned. + template + Status SetManagedObject(const std::shared_ptr& object) { + assert(object != nullptr); + return SetManagedObject(object->GetId(), object); + } + + template + Status SetManagedObject(const std::string& id, + const std::shared_ptr& object) { + const auto c = std::static_pointer_cast(object); + return SetManagedObject(T::Type(), id, c); + } + + // Returns the object for the given id, if one exists. + // If the object is not found in the registry, a nullptr is returned + template + std::shared_ptr GetManagedObject(const std::string& id) const { + auto c = GetManagedObject(T::Type(), id); + return std::static_pointer_cast(c); + } + + // Returns the set of managed objects found in the registry matching + // the input type and ID. + // If the input id is not empty, then only objects of that class + // (IsInstanceOf(id)) will be returned (for example, only return LRUCache + // objects) If the input id is empty, then all objects of that type (all Cache + // objects) + template + Status ListManagedObjects(const std::string& id, + std::vector>* results) const { + std::vector> customizables; + results->clear(); + Status s = ListManagedObjects(T::Type(), id, &customizables); + if (s.ok()) { + for (const auto& c : customizables) { + results->push_back(std::static_pointer_cast(c)); + } + } + return s; + } + + template + Status ListManagedObjects(std::vector>* results) const { + return ListManagedObjects("", results); + } + + // Creates a new ManagedObject in the registry for the id if one does not + // currently exist. If an object with that ID already exists, the current + // object is returned. + // + // The ID is the identifier of the object to be returned/created and returned + // in result + // If a new object is created (using the object factories), the cfunc + // parameter will be invoked to configure the new object. + template + Status GetOrCreateManagedObject(const std::string& id, + std::shared_ptr* result, + const ConfigureFunc& cfunc = nullptr) { + if (parent_ != nullptr) { + auto object = parent_->GetManagedObject(T::Type(), id); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + { + std::unique_lock lock(objects_mutex_); + auto key = ToManagedObjectKey(T::Type(), id); + auto iter = managed_objects_.find(key); + if (iter != managed_objects_.end()) { + auto object = iter->second.lock(); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + std::shared_ptr object; + Status s = NewSharedObject(id, &object); + if (s.ok() && cfunc != nullptr) { + s = cfunc(object.get()); + } + if (s.ok()) { + auto c = std::static_pointer_cast(object); + if (id != c->Name()) { + // If the ID is not the base name of the class, add the new + // object under the input ID + managed_objects_[key] = c; + } + if (id != c->GetId() && c->GetId() != c->Name()) { + // If the input and current ID do not match, and the + // current ID is not the base bame, add the new object under + // its new ID + key = ToManagedObjectKey(T::Type(), c->GetId()); + managed_objects_[key] = c; + } + *result = object; + } + return s; + } + } + + // Returns the number of factories registered for this library + // for the input type. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(const std::string& type) const; + + // Returns the names of registered factories for the input type. + // names is updated to include the names for the type + void GetFactoryNames(const std::string& type, + std::vector* names) const; + + void GetFactoryTypes(std::unordered_set* types) const; + + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + // Invokes the input function to retrieve the properties for this plugin. + int RegisterPlugin(const std::string& name, const RegistrarFunc& func); + + private: + static std::string ToManagedObjectKey(const std::string& type, + const std::string& id) { + return type + "://" + id; + } + + // Returns the Customizable managed object associated with the key (Type/ID). + // If not found, nullptr is returned. + std::shared_ptr GetManagedObject(const std::string& type, + const std::string& id) const; + Status ListManagedObjects( + const std::string& type, const std::string& pattern, + std::vector>* results) const; + // Sets the managed object associated with the key (Type/ID) to c. + // If the named managed object does not exist, the object is added and OK is + // returned If the object exists and is the same as c, OK is returned + // Otherwise, an error status is returned. + Status SetManagedObject(const std::string& type, const std::string& id, + const std::shared_ptr& c); + + // Searches (from back to front) the libraries looking for the + // factory that matches this name. + // Returns the factory if it is found, and nullptr otherwise + template + const FactoryFunc FindFactory(const std::string& name) const { + { + std::unique_lock lock(library_mutex_); + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); + ++iter) { + const auto factory = iter->get()->FindFactory(name); + if (factory != nullptr) { + return factory; + } + } + } + if (parent_ == nullptr) { + return nullptr; + } else { + return parent_->FindFactory(name); + } + } + + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. + std::vector> libraries_; + std::vector plugins_; + static std::unordered_map builtins_; + std::map> managed_objects_; + std::shared_ptr parent_; + mutable std::mutex objects_mutex_; // Mutex for managed objects + mutable std::mutex library_mutex_; // Mutex for managed libraries +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h new file mode 100644 index 000000000..c070e49a3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +class Transaction; + +// Database with Transaction support. +// +// See optimistic_transaction.h and examples/transaction_example.cc + +// Options to use when starting an Optimistic Transaction +struct OptimisticTransactionOptions { + // Setting set_snapshot=true is the same as calling SetSnapshot(). + bool set_snapshot = false; + + // Should be set if the DB has a non-default comparator. + // See comment in WriteBatchWithIndex constructor. + const Comparator* cmp = BytewiseComparator(); +}; + +enum class OccValidationPolicy { + // Validate serially at commit stage, AFTER entering the write-group. + // Isolation validation is processed single-threaded(since in the + // write-group). + // May suffer from high mutex contention, as per this link: + // https://github.com/facebook/rocksdb/issues/4402 + kValidateSerial = 0, + // Validate parallelly before commit stage, BEFORE entering the write-group to + // reduce mutex contention. Each txn acquires locks for its write-set + // records in some well-defined order. + kValidateParallel = 1 +}; + +struct OptimisticTransactionDBOptions { + OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel; + + // works only if validate_policy == OccValidationPolicy::kValidateParallel + uint32_t occ_lock_buckets = (1 << 20); +}; + +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` +class OptimisticTransactionDB : public StackableDB { + public: + // Open an OptimisticTransactionDB similar to DB::Open(). + static Status Open(const Options& options, const std::string& dbname, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const OptimisticTransactionDBOptions& occ_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr); + + virtual ~OptimisticTransactionDB() {} + + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. + // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + OptimisticTransactionDB(const OptimisticTransactionDB&) = delete; + void operator=(const OptimisticTransactionDB&) = delete; + + protected: + // To Create an OptimisticTransactionDB, call Open() + explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {} +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h new file mode 100644 index 000000000..a73324a9e --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// Try to migrate DB created with old_opts to be use new_opts. +// Multiple column families is not supported. +// It is best-effort. No guarantee to succeed. +// A full compaction may be executed. +// If the target options use FIFO compaction, the FIFO condition might be +// sacrificed: for data migrated, data inserted later might be dropped +// earlier. This is to gurantee FIFO compaction won't drop all the +// migrated data to fit max_table_files_size. +Status OptionChangeMigration(std::string dbname, const Options& old_opts, + const Options& new_opts); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/options_type.h b/src/rocksdb/include/rocksdb/utilities/options_type.h new file mode 100644 index 000000000..cd340ed59 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/options_type.h @@ -0,0 +1,1221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// The OptionTypeInfo and related classes provide a framework for +// configuring and validating RocksDB classes via the Options framework. +// This file is part of the public API to allow developers who wish to +// write their own extensions and plugins to take use the Options +// framework in their custom implementations. +// +// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects +// for more information on how to develop and use custom extensions + +#pragma once + +#include +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class OptionTypeInfo; +struct ColumnFamilyOptions; +struct DBOptions; + +// The underlying "class/type" of the option. +// This enum is used to determine how the option should +// be converted to/from strings and compared. +enum class OptionType { + kBoolean, + kInt, + kInt32T, + kInt64T, + kUInt, + kUInt8T, + kUInt32T, + kUInt64T, + kSizeT, + kString, + kDouble, + kCompactionStyle, + kCompactionPri, + kCompressionType, + kCompactionStopStyle, + kChecksumType, + kEncodingType, + kEnv, + kEnum, + kStruct, + kVector, + kConfigurable, + kCustomizable, + kEncodedString, + kTemperature, + kArray, + kUnknown, +}; + +enum class OptionVerificationType { + kNormal, + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kByNameAllowFromNull, // Same as kByName, but it also allows the case + // where the old option is nullptr. + kDeprecated, // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, + // the parser will not include it in serialization + // and verification processes. + kAlias, // This option represents is a name/shortcut for + // another option and should not be written or verified + // independently +}; + +// A set of modifier flags used to alter how an option is evaluated or +// processed. These flags can be combined together (e.g. kMutable | kShared). +// The kCompare flags can be used to control if/when options are compared. +// If kCompareNever is set, two related options would never be compared (always +// equal) If kCompareExact is set, the options will only be compared if the +// sanity mode +// is exact +// kMutable means the option can be changed after it is prepared +// kShared means the option is contained in a std::shared_ptr +// kUnique means the option is contained in a std::uniqued_ptr +// kRawPointer means the option is a raw pointer value. +// kAllowNull means that an option is allowed to be null for verification +// purposes. +// kDontSerialize means this option should not be serialized and included in +// the string representation. +// kDontPrepare means do not call PrepareOptions for this pointer value. +enum class OptionTypeFlags : uint32_t { + kNone = 0x00, // No flags + kCompareDefault = 0x0, + kCompareNever = ConfigOptions::kSanityLevelNone, + kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, + kCompareExact = ConfigOptions::kSanityLevelExactMatch, + + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only +}; + +inline OptionTypeFlags operator|(const OptionTypeFlags& a, + const OptionTypeFlags& b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline OptionTypeFlags operator&(const OptionTypeFlags& a, + const OptionTypeFlags& b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +// Converts an string into its enumerated value. +// @param type_map Mapping between strings and enum values +// @param type The string representation of the enum +// @param value Returns the enum value represented by the string +// @return true if the string was found in the enum map, false otherwise. +template +bool ParseEnum(const std::unordered_map& type_map, + const std::string& type, T* value) { + auto iter = type_map.find(type); + if (iter != type_map.end()) { + *value = iter->second; + return true; + } + return false; +} + +// Converts an enum into its string representation. +// @param type_map Mapping between strings and enum values +// @param type The enum +// @param value Returned as the string representation of the enum +// @return true if the enum was found in the enum map, false otherwise. +template +bool SerializeEnum(const std::unordered_map& type_map, + const T& type, std::string* value) { + for (const auto& pair : type_map) { + if (pair.second == type) { + *value = pair.first; + return true; + } + } + return false; +} + +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result); + +template +Status SerializeArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::array& vec, + std::string* value); + +template +bool ArraysAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::array& array1, + const std::array& array2, std::string* mismatch); + +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result); + +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value); +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch); + +// Function for converting a option string value into its underlying +// representation in "addr" +// On success, Status::OK is returned and addr is set to the parsed form +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the value is parsed +// @param name The name of the options being parsed +// @param value The string representation of the option +// @param addr Pointer to the object +using ParseFunc = std::function; + +// Function for converting an option "addr" into its string representation. +// On success, Status::OK is returned and value is the serialized form. +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the values are serialized +// @param name The name of the options being serialized +// @param addr Pointer to the value being serialized +// @param value The result of the serialization. +using SerializeFunc = std::function; + +// Function for comparing two option values +// If they are not equal, updates "mismatch" with the name of the bad option +// @param opts The ConfigOptions controlling how the values are compared +// @param name The name of the options being compared +// @param addr1 The first address to compare +// @param addr2 The address to compare to +// @param mismatch If the values are not equal, the name of the option that +// first differs +using EqualsFunc = std::function; + +// Function for preparing/initializing an option. +using PrepareFunc = + std::function; + +// Function for validating an option. +using ValidateFunc = std::function; + +// A struct for storing constant option information such as option name, +// option type, and offset. +class OptionTypeInfo { + public: + // A simple "normal", non-mutable Type "type" at offset + OptionTypeInfo(int offset, OptionType type) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(OptionVerificationType::kNormal), + flags_(OptionTypeFlags::kNone) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(serialize_func), + equals_func_(equals_func), + type_(type), + verification_(verification), + flags_(flags) {} + + // Creates an OptionTypeInfo for an enum type. Enums use an additional + // map to convert the enums to/from their string representation. + // To create an OptionTypeInfo that is an Enum, one should: + // - Create a static map of string values to the corresponding enum value + // - Call this method passing the static map in as a parameter. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. + // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + template + static OptionTypeInfo Enum( + int offset, const std::unordered_map* const map, + OptionTypeFlags flags = OptionTypeFlags::kNone) { + OptionTypeInfo info(offset, OptionType::kEnum, + OptionVerificationType::kNormal, flags); + info.SetParseFunc( + // Uses the map argument to convert the input string into + // its corresponding enum value. If value is found in the map, + // addr is updated to the corresponding map entry. + // @return OK if the value is found in the map + // @return InvalidArgument if the value is not found in the map + [map](const ConfigOptions&, const std::string& name, + const std::string& value, void* addr) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (ParseEnum(*map, value, static_cast(addr))) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }); + info.SetSerializeFunc( + // Uses the map argument to convert the input enum into + // its corresponding string value. If enum value is found in the map, + // value is updated to the corresponding string value in the map. + // @return OK if the enum is found in the map + // @return InvalidArgument if the enum is not found in the map + [map](const ConfigOptions&, const std::string& name, const void* addr, + std::string* value) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (SerializeEnum(*map, (*static_cast(addr)), + value)) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }); + info.SetEqualsFunc( + // Casts addr1 and addr2 to the enum type and returns true if + // they are equal, false otherwise. + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); + }); + return info; + } // End OptionTypeInfo::Enum + + // Creates an OptionTypeInfo for a Struct type. Structs have a + // map of string-OptionTypeInfo associated with them that describes how + // to process the object for parsing, serializing, and matching. + // Structs also have a struct_name, which is the name of the object + // as registered in the parent map. + // When processing a struct, the option name can be specified as: + // - Meaning to process the entire struct. + // - Meaning to process the single field + // - Process the single fields + // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions + // are all examples of Struct options. + // + // To create an OptionTypeInfo that is a Struct, one should: + // - Create a static map of string-OptionTypeInfo corresponding to the + // properties of the object that can be set via the options. + // - Call this method passing the name and map in as parameters. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. + // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags) { + OptionTypeInfo info(offset, OptionType::kStruct, verification, flags); + info.SetParseFunc( + // Parses the struct and updates the fields at addr + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + return ParseStruct(opts, struct_name, struct_map, name, value, addr); + }); + info.SetSerializeFunc( + // Serializes the struct options into value + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + return SerializeStruct(opts, struct_name, struct_map, name, addr, + value); + }); + info.SetEqualsFunc( + // Compares the struct fields of addr1 and addr2 for equality + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + return StructsAreEqual(opts, struct_name, struct_map, name, addr1, + addr2, mismatch); + }); + return info; + } + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) { + OptionTypeInfo info( + Struct(struct_name, struct_map, offset, verification, flags)); + return info.SetParseFunc(parse_func); + } + + template + static OptionTypeInfo Array(int _offset, OptionVerificationType _verification, + OptionTypeFlags _flags, + const OptionTypeInfo& elem_info, + char separator = ':') { + OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags); + info.SetParseFunc([elem_info, separator]( + const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto result = static_cast*>(addr); + return ParseArray(opts, elem_info, separator, name, value, + result); + }); + info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts, + const std::string& name, + const void* addr, + std::string* value) { + const auto& array = *(static_cast*>(addr)); + return SerializeArray(opts, elem_info, separator, name, array, + value); + }); + info.SetEqualsFunc([elem_info](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto& array1 = *(static_cast*>(addr1)); + const auto& array2 = *(static_cast*>(addr2)); + return ArraysAreEqual(opts, elem_info, name, array1, array2, + mismatch); + }); + return info; + } + + template + static OptionTypeInfo Vector(int _offset, + OptionVerificationType _verification, + OptionTypeFlags _flags, + const OptionTypeInfo& elem_info, + char separator = ':') { + OptionTypeInfo info(_offset, OptionType::kVector, _verification, _flags); + info.SetParseFunc([elem_info, separator]( + const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto result = static_cast*>(addr); + return ParseVector(opts, elem_info, separator, name, value, result); + }); + info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts, + const std::string& name, + const void* addr, + std::string* value) { + const auto& vec = *(static_cast*>(addr)); + return SerializeVector(opts, elem_info, separator, name, vec, value); + }); + info.SetEqualsFunc([elem_info](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto& vec1 = *(static_cast*>(addr1)); + const auto& vec2 = *(static_cast*>(addr2)); + return VectorsAreEqual(opts, elem_info, name, vec1, vec2, mismatch); + }); + return info; + } + + // Create a new std::shared_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::shared_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + OptionTypeInfo info(offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared); + return info.SetParseFunc([](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + auto* shared = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + shared->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, shared); + } + }); + } + + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + OptionTypeInfo info(AsCustomSharedPtr(offset, ovt, flags)); + info.SetSerializeFunc(serialize_func); + info.SetEqualsFunc(equals_func); + return info; + } + + // Create a new std::unique_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::unique_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + OptionTypeInfo info(offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kUnique); + return info.SetParseFunc([](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + auto* unique = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + unique->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, unique); + } + }); + } + + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + OptionTypeInfo info(AsCustomUniquePtr(offset, ovt, flags)); + info.SetSerializeFunc(serialize_func); + info.SetEqualsFunc(equals_func); + return info; + } + + // Create a new Customizable* OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // T object. + // + // @param _offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags) { + OptionTypeInfo info(offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kRawPointer); + return info.SetParseFunc([](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + auto** pointer = static_cast(addr); + if (name == kIdPropName() && value.empty()) { + *pointer = nullptr; + return Status::OK(); + } else { + return T::CreateFromString(opts, value, pointer); + } + }); + } + + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + OptionTypeInfo info(AsCustomRawPtr(offset, ovt, flags)); + info.SetSerializeFunc(serialize_func); + info.SetEqualsFunc(equals_func); + return info; + } + + OptionTypeInfo& SetParseFunc(const ParseFunc& f) { + parse_func_ = f; + return *this; + } + + OptionTypeInfo& SetSerializeFunc(const SerializeFunc& f) { + serialize_func_ = f; + return *this; + } + OptionTypeInfo& SetEqualsFunc(const EqualsFunc& f) { + equals_func_ = f; + return *this; + } + + OptionTypeInfo& SetPrepareFunc(const PrepareFunc& f) { + prepare_func_ = f; + return *this; + } + + OptionTypeInfo& SetValidateFunc(const ValidateFunc& f) { + validate_func_ = f; + return *this; + } + + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } + + bool IsEditable(const ConfigOptions& opts) const { + if (opts.mutable_options_only) { + return IsMutable(); + } else { + return true; + } + } + bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); } + + bool IsDeprecated() const { + return IsEnabled(OptionVerificationType::kDeprecated); + } + + // Returns true if the option is marked as an Alias. + // Aliases are valid options that are parsed but are not converted to strings + // or compared. + bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); } + + bool IsEnabled(OptionVerificationType ovf) const { + return verification_ == ovf; + } + + // Returns the sanity level for comparing the option. + // If the options should not be compared, returns None + // If the option has a compare flag, returns it. + // Otherwise, returns "exact" + ConfigOptions::SanityLevel GetSanityLevel() const { + if (IsDeprecated() || IsAlias()) { + return ConfigOptions::SanityLevel::kSanityLevelNone; + } else { + auto match = (flags_ & OptionTypeFlags::kCompareExact); + if (match == OptionTypeFlags::kCompareDefault) { + return ConfigOptions::SanityLevel::kSanityLevelExactMatch; + } else { + return (ConfigOptions::SanityLevel)match; + } + } + } + + // Returns true if the option should be serialized. + // Options should be serialized if the are not deprecated, aliases, + // or marked as "Don't Serialize". + bool ShouldSerialize() const { + if (IsDeprecated() || IsAlias()) { + return false; + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return false; + } else { + return true; + } + } + + bool ShouldPrepare() const { + if (IsDeprecated() || IsAlias()) { + return false; + } else if (IsEnabled(OptionTypeFlags::kDontPrepare)) { + return false; + } else { + return (prepare_func_ != nullptr || IsConfigurable()); + } + } + + bool ShouldValidate() const { + if (IsDeprecated() || IsAlias()) { + return false; + } else { + return (validate_func_ != nullptr || IsConfigurable()); + } + } + + // Returns true if the option is allowed to be null. + // Options can be null if the verification type is allow from null + // or if the flags specify allow null. + bool CanBeNull() const { + return (IsEnabled(OptionTypeFlags::kAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowFromNull)); + } + + bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); } + + bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); } + + bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); } + + bool IsByName() const { + return (verification_ == OptionVerificationType::kByName || + verification_ == OptionVerificationType::kByNameAllowNull || + verification_ == OptionVerificationType::kByNameAllowFromNull); + } + + bool IsStruct() const { return (type_ == OptionType::kStruct); } + + bool IsConfigurable() const { + return (type_ == OptionType::kConfigurable || + type_ == OptionType::kCustomizable); + } + + bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } + + inline const void* GetOffset(const void* base) const { + return static_cast(base) + offset_; + } + + inline void* GetOffset(void* base) const { + return static_cast(base) + offset_; + } + + template + const T* GetOffsetAs(const void* base) const { + const void* addr = GetOffset(base); + return static_cast(addr); + } + + template + T* GetOffsetAs(void* base) const { + void* addr = GetOffset(base); + return static_cast(addr); + } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. + template + const T* AsRawPointer(const void* const base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + if (IsUniquePtr()) { + const auto ptr = GetOffsetAs>(base_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + const auto ptr = GetOffsetAs>(base_addr); + return ptr->get(); + } else if (IsRawPtr()) { + const T* const* ptr = GetOffsetAs(base_addr); + return *ptr; + } else { + return GetOffsetAs(base_addr); + } + } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. + template + T* AsRawPointer(void* base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + if (IsUniquePtr()) { + auto ptr = GetOffsetAs>(base_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + auto ptr = GetOffsetAs>(base_addr); + return ptr->get(); + } else if (IsRawPtr()) { + auto ptr = GetOffsetAs(base_addr); + return *ptr; + } else { + return GetOffsetAs(base_addr); + } + } + + // Parses the option in "opt_value" according to the rules of this class + // and updates the value at "opt_ptr". + // On success, Status::OK() is returned. On failure: + // NotFound means the opt_name is not valid for this option + // NotSupported means we do not know how to parse the value for this option + // InvalidArgument means the opt_value is not valid for this option. + Status Parse(const ConfigOptions& config_options, const std::string& opt_name, + const std::string& opt_value, void* const opt_ptr) const; + + // Serializes the option in "opt_addr" according to the rules of this class + // into the value at "opt_value". + Status Serialize(const ConfigOptions& config_options, + const std::string& opt_name, const void* const opt_ptr, + std::string* opt_value) const; + + // Compares the "addr1" and "addr2" values according to the rules of this + // class and returns true if they match. On a failed match, mismatch is the + // name of the option that failed to match. + bool AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, const void* const addr1, + const void* const addr2, std::string* mismatch) const; + + // Used to override the match rules for "ByName" options. + bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr) const; + bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const std::string& that_value) const; + + Status Prepare(const ConfigOptions& config_options, const std::string& name, + void* opt_ptr) const; + Status Validate(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts, + const std::string& name, const void* opt_ptr) const; + + // Parses the input opts_map according to the type_map for the opt_addr + // For each name-value pair in opts_map, find the corresponding name in + // type_map If the name is found: + // - set the corresponding value in opt_addr, returning the status on + // failure; + // If the name is not found: + // - If unused is specified, add the name-value to unused and continue + // - If ingore_unknown_options is false, return NotFound + // Returns OK if all options were either: + // - Successfully set + // - options were not found and ignore_unknown_options=true + // - options were not found and unused was specified + // Note that this method is much less sophisticated than the comparable + // Configurable::Configure methods. For example, on error, there is no + // attempt to return opt_addr to the initial state. Additionally, there + // is no effort to initialize (Configurable::PrepareOptions) the object + // on success. This method should typically only be used for simpler, + // standalone structures and not those that contain shared and embedded + // objects. + static Status ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + static Status ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + + // Parses the input value according to the map for the struct at opt_addr + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. This may + // be the whole struct or a sub-element of it, based on struct_name and + // opt_name. + static Status ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const std::string& value, void* opt_addr); + + // Serializes the values from opt_addr using the rules in type_map. + // Returns the serialized form in result. + // Returns OK on success or non-OK if some option could not be serialized. + static Status SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* value); + + // Serializes the input addr according to the map for the struct to value. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. This may + // be the whole struct or a sub-element of it + static Status SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* opt_addr, std::string* value); + + // Compares the values in this_addr and that_addr using the rules in type_map. + // If the values are equal, returns true + // If the values are not equal, returns false and sets mismatch to the name + // of the first value that did not match. + static bool TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& map, + const void* this_addr, const void* that_addr, std::string* mismatch); + + // Compares the input offsets according to the map for the struct and returns + // true if they are equivalent, false otherwise. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. This may + // be the whole struct or a sub-element of it + static bool StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* this_offset, + const void* that_offset, std::string* mismatch); + + // Finds the entry for the opt_name in the opt_map, returning + // nullptr if not found. + // If found, elem_name will be the name of option to find. + // This may be opt_name, or a substring of opt_name. + // For "simple" options, opt_name will be equal to elem_name. Given the + // opt_name "opt", elem_name will equal "opt". + // For "embedded" options (like structs), elem_name may be opt_name + // or a field within the opt_name. For example, given the struct "struct", + // and opt_name of "struct.field", elem_name will be "field" + static const OptionTypeInfo* Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + std::string* elem_name); + + // Returns the next token marked by the delimiter from "opts" after start in + // token and updates end to point to where that token stops. Delimiters inside + // of braces are ignored. Returns OK if a token is found and an error if the + // input opts string is mis-formatted. + // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points + // to "b" Given "{a=A;b=B}", the token would be "a=A;b=B" + // + // @param opts The string in which to find the next token + // @param delimiter The delimiter between tokens + // @param start The position in opts to start looking for the token + // @param ed Returns the end position in opts of the token + // @param token Returns the token + // @returns OK if a token was found + // @return InvalidArgument if the braces mismatch + // (e.g. "{a={b=c;}" ) -- missing closing brace + // @return InvalidArgument if an expected delimiter is not found + // e.g. "{a=b}c=d;" -- missing delimiter before "c" + static Status NextToken(const std::string& opts, char delimiter, size_t start, + size_t* end, std::string* token); + + constexpr static const char* kIdPropName() { return "id"; } + constexpr static const char* kIdPropSuffix() { return ".id"; } + + private: + int offset_; + + // The optional function to convert a string to its representation + ParseFunc parse_func_; + + // The optional function to convert a value to its string representation + SerializeFunc serialize_func_; + + // The optional function to match two option values + EqualsFunc equals_func_; + + PrepareFunc prepare_func_; + ValidateFunc validate_func_; + OptionType type_; + OptionVerificationType verification_; + OptionTypeFlags flags_; +}; + +// Parses the input value into elements of the result array, which has fixed +// array size. For example, if the value=1:2:3 and elem_info parses integers, +// the result array will be {1,2,3}. Array size is defined in the OptionTypeInfo +// the input value has to match with that. +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this array option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or element number +// doesn't match array size defined in OptionTypeInfo +// or if the token could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result) { + Status status; + + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + size_t i = 0, start = 0, end = 0; + for (; status.ok() && i < kSize && start < value.size() && + end != std::string::npos; + i++, start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + status = elem_info.Parse(copy, name, token, &((*result)[i])); + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + if (!status.ok()) { + return status; + } + // make sure the element number matches the array size + if (i < kSize) { + return Status::InvalidArgument( + "Serialized value has less elements than array size", name); + } + if (start < value.size() && end != std::string::npos) { + return Status::InvalidArgument( + "Serialized value has more elements than array size", name); + } + return status; +} + +// Serializes the fixed size input array into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in array into their serialized form, using elem_info to perform the +// serialization. +// For example, if the array contains the integers 1,2,3 and elem_info +// serializes the output would be 1:2:3 for separator ":". +// @param config_options Controls how the option value is serialized. +// @param elem_info Controls how individual tokens in value are serialized +// @param separator Character separating tokens in value (':' in the above +// example) +// @param name The name associated with this array option +// @param array The input array to serialize +// @param value The output string of serialized options +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status SerializeArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, + const std::array& array, std::string* value) { + std::string result; + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& elem : array) { + std::string elem_str; + Status s = elem_info.Serialize(embedded, name, &elem, &elem_str); + if (!s.ok()) { + return s; + } else if (!elem_str.empty()) { + if (printed++ > 0) { + result += separator; + } + // If the element contains embedded separators, put it inside of brackets + if (elem_str.find(separator) != std::string::npos) { + result += "{" + elem_str + "}"; + } else { + result += elem_str; + } + } + } + if (result.find("=") != std::string::npos) { + *value = "{" + result + "}"; + } else if (printed > 1 && result.at(0) == '{') { + *value = "{" + result + "}"; + } else { + *value = result; + } + return Status::OK(); +} + +// Compares the input arrays array1 and array2 for equality +// Elements of the array are compared one by one using elem_info to perform the +// comparison. +// +// @param config_options Controls how the arrays are compared. +// @param elem_info Controls how individual elements in the arrays are compared +// @param name The name associated with this array option +// @param array1,array2 The arrays to compare. +// @param mismatch If the arrays are not equivalent, mismatch will point to +// the first element of the comparison that did not match. +// @return true If vec1 and vec2 are "equal", false otherwise +template +bool ArraysAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::array& array1, + const std::array& array2, std::string* mismatch) { + assert(array1.size() == kSize); + assert(array2.size() == kSize); + for (size_t i = 0; i < kSize; ++i) { + if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i], + mismatch)) { + return false; + } + } + return true; +} + +// Parses the input value into elements of the result vector. This method +// will break the input value into the individual tokens (based on the +// separator), where each of those tokens will be parsed based on the rules of +// elem_info. The result vector will be populated with elements based on the +// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses +// integers, the result vector will contain the integers 1,2,3,4,5 +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this vector option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result) { + result->clear(); + Status status; + + // Turn off ignore_unknown_objects so we can tell if the returned + // object is valid or not. + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + for (size_t start = 0, end = 0; + status.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + T elem; + status = elem_info.Parse(copy, name, token, &elem); + if (status.ok()) { + result->emplace_back(elem); + } else if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + return status; +} + +// Serializes the input vector into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in vec into their serialized form, using elem_info to perform the +// serialization. +// For example, if the vec contains the integers 1,2,3,4,5 and elem_info +// serializes the output would be 1:2:3:4:5 for separator ":". +// @param config_options Controls how the option value is serialized. +// @param elem_info Controls how individual tokens in value are serialized +// @param separator Character separating tokens in value (':' in the above +// example) +// @param name The name associated with this vector option +// @param vec The input vector to serialize +// @param value The output string of serialized options +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value) { + std::string result; + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& elem : vec) { + std::string elem_str; + Status s = elem_info.Serialize(embedded, name, &elem, &elem_str); + if (!s.ok()) { + return s; + } else if (!elem_str.empty()) { + if (printed++ > 0) { + result += separator; + } + // If the element contains embedded separators, put it inside of brackets + if (elem_str.find(separator) != std::string::npos) { + result += "{" + elem_str + "}"; + } else { + result += elem_str; + } + } + } + if (result.find("=") != std::string::npos) { + *value = "{" + result + "}"; + } else if (printed > 1 && result.at(0) == '{') { + *value = "{" + result + "}"; + } else { + *value = result; + } + return Status::OK(); +} + +// Compares the input vectors vec1 and vec2 for equality +// If the vectors are the same size, elements of the vectors are compared one by +// one using elem_info to perform the comparison. +// +// @param config_options Controls how the vectors are compared. +// @param elem_info Controls how individual elements in the vectors are compared +// @param name The name associated with this vector option +// @param vec1,vec2 The vectors to compare. +// @param mismatch If the vectors are not equivalent, mismatch will point to +// the first +// element of the comparison that did not match. +// @return true If vec1 and vec2 are "equal", false otherwise +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch) { + if (vec1.size() != vec2.size()) { + *mismatch = name; + return false; + } else { + for (size_t i = 0; i < vec1.size(); ++i) { + if (!elem_info.AreEqual( + config_options, name, reinterpret_cast(&vec1[i]), + reinterpret_cast(&vec2[i]), mismatch)) { + return false; + } + } + return true; + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h new file mode 100644 index 000000000..064c087f0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/options_util.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This file contains utility functions for RocksDB Options. +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; +// Constructs the DBOptions and ColumnFamilyDescriptors by loading the +// latest RocksDB options file stored in the specified rocksdb database. +// +// Note that the all the pointer options (except table_factory, which will +// be described in more details below) will be initialized with the default +// values. Developers can further initialize them after this function call. +// Below is an example list of pointer options which will be initialized +// +// * env +// * memtable_factory +// * compaction_filter_factory +// * prefix_extractor +// * comparator +// * merge_operator +// * compaction_filter +// +// User can also choose to load customized comparator, env, and/or +// merge_operator through object registry: +// * comparator needs to be registered through Registrar +// * env needs to be registered through Registrar +// * merge operator needs to be registered through +// Registrar>. +// +// For table_factory, this function further supports deserializing +// BlockBasedTableFactory and its BlockBasedTableOptions except the +// pointer options of BlockBasedTableOptions (flush_block_policy_factory, +// block_cache, and block_cache_compressed), which will be initialized with +// default values. Developers can further specify these three options by +// casting the return value of TableFactory::GetOptions() to +// BlockBasedTableOptions and making necessary changes. +// +// ignore_unknown_options can be set to true if you want to ignore options +// that are from a newer version of the db, essentially for forward +// compatibility. +// +// config_options contains a set of options that controls the processing +// of the options. The LoadLatestOptions(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding options +// in the ConfigOptions parameter. +// +// examples/options_file_example.cc demonstrates how to use this function +// to open a RocksDB instance. +// +// @return the function returns an OK status when it went successfully. If +// the specified "dbpath" does not contain any option file, then a +// Status::NotFound will be returned. A return value other than +// Status::OK or Status::NotFound indicates there is some error related +// to the options file itself. +// +// @see LoadOptionsFromFile +Status LoadLatestOptions(const std::string& dbpath, Env* env, + DBOptions* db_options, + std::vector* cf_descs, + bool ignore_unknown_options = false, + std::shared_ptr* cache = {}); +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); + +// Similar to LoadLatestOptions, this function constructs the DBOptions +// and ColumnFamilyDescriptors based on the specified RocksDB Options file. +// +// The LoadOptionsFile(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. +// +// @see LoadLatestOptions +Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, + DBOptions* db_options, + std::vector* cf_descs, + bool ignore_unknown_options = false, + std::shared_ptr* cache = {}); +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& options_file_name, + DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); + +// Returns the latest options file name under the specified db path. +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name); + +// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors +// are compatible with the latest options stored in the specified DB path. +// +// If the return status is non-ok, it means the specified RocksDB instance +// might not be correctly opened with the input set of options. Currently, +// changing one of the following options will fail the compatibility check: +// +// * comparator +// * prefix_extractor +// * table_factory +// * merge_operator +Status CheckOptionsCompatibility( + const std::string& dbpath, Env* env, const DBOptions& db_options, + const std::vector& cf_descs, + bool ignore_unknown_options = false); +Status CheckOptionsCompatibility( + const ConfigOptions& config_options, const std::string& dbpath, + const DBOptions& db_options, + const std::vector& cf_descs); + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/replayer.h b/src/rocksdb/include/rocksdb/utilities/replayer.h new file mode 100644 index 000000000..4fdd8d73a --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/replayer.h @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class TraceRecord; +class TraceRecordResult; + +struct ReplayOptions { + // Number of threads used for replaying. If 0 or 1, replay using + // single thread. + uint32_t num_threads; + + // Enables fast forwarding a replay by increasing/reducing the delay between + // the ingested traces. + // If > 0.0 and < 1.0, slow down the replay by this amount. + // If 1.0, replay the operations at the same rate as in the trace stream. + // If > 1, speed up the replay by this amount. + double fast_forward; + + ReplayOptions() : num_threads(1), fast_forward(1.0) {} + + ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio) + : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {} +}; + +// Replayer helps to replay the captured RocksDB query level operations. +// The Replayer can either be created from DB::NewReplayer method, or be +// instantiated via db_bench today, on using "replay" benchmark. +class Replayer { + public: + virtual ~Replayer() = default; + + // Make some preparation before replaying the trace. This will also reset the + // replayer in order to restart replaying. + virtual Status Prepare() = 0; + + // Return the timestamp when the trace recording was started. + virtual uint64_t GetHeaderTimestamp() const = 0; + + // Atomically read one trace into a TraceRecord (excluding the header and + // footer traces). + // Return Status::OK() on success; + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the read trace type is not supported. + virtual Status Next(std::unique_ptr* record) = 0; + + // Execute one TraceRecord. + // Return Status::OK() if the execution was successful. Get/MultiGet traces + // will still return Status::OK() even if they got Status::NotFound() + // from DB::Get() or DB::MultiGet(); + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the operation is not supported; + // Otherwise, return the corresponding error status. + // + // The actual operation execution status and result(s) will be saved in + // result. For example, a GetQueryTraceRecord will have its DB::Get() status + // and the returned value saved in a SingleValueTraceExecutionResult. + virtual Status Execute(const std::unique_ptr& record, + std::unique_ptr* result) = 0; + + // Replay all the traces from the provided trace stream, taking the delay + // between the traces into consideration. + // + // result_callback reports the status of executing a trace record, and the + // actual operation execution result (See the description for Execute()). + virtual Status Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h new file mode 100644 index 000000000..a682c7748 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class SimCache; + +// For instrumentation purpose, use NewSimCache instead of NewLRUCache API +// NewSimCache is a wrapper function returning a SimCache instance that can +// have additional interface provided in Simcache class besides Cache interface +// to predict block cache hit rate without actually allocating the memory. It +// can help users tune their current block cache size, and determine how +// efficient they are using the memory. +// +// Since GetSimCapacity() returns the capacity for simulation, it differs from +// actual memory usage, which can be estimated as: +// sim_capacity * entry_size / (entry_size + block_size), +// where 76 <= entry_size <= 104, +// BlockBasedTableOptions.block_size = 4096 by default but is configurable, +// Therefore, generally the actual memory overhead of SimCache is Less than +// sim_capacity * 2% +extern std::shared_ptr NewSimCache(std::shared_ptr cache, + size_t sim_capacity, + int num_shard_bits); + +extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); + +class SimCache : public Cache { + public: + SimCache() {} + + ~SimCache() override {} + + const char* Name() const override { return "SimCache"; } + + // returns the maximum configured capacity of the simcache for simulation + virtual size_t GetSimCapacity() const = 0; + + // simcache doesn't provide internal handler reference to user, so always + // PinnedUsage = 0 and the behavior will be not exactly consistent the + // with real cache. + // returns the memory size for the entries residing in the simcache. + virtual size_t GetSimUsage() const = 0; + + // sets the maximum configured capacity of the simcache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will purge old entries + // to fit new capacity. + virtual void SetSimCapacity(size_t capacity) = 0; + + // returns the lookup times of simcache + virtual uint64_t get_miss_counter() const = 0; + // returns the hit times of simcache + virtual uint64_t get_hit_counter() const = 0; + // reset the lookup and hit counters + virtual void reset_counter() = 0; + // String representation of the statistics of the simcache + virtual std::string ToString() const = 0; + + // Start storing logs of the cache activity (Add/Lookup) into + // a file located at activity_log_file, max_logging_size option can be used to + // stop logging to the file automatically after reaching a specific size in + // bytes, a values of 0 disable this feature + virtual Status StartActivityLogging(const std::string& activity_log_file, + Env* env, + uint64_t max_logging_size = 0) = 0; + + // Stop cache activity logging if any + virtual void StopActivityLogging() = 0; + + // Status of cache logging happening in background + virtual Status GetActivityLoggingStatus() = 0; + + private: + SimCache(const SimCache&); + SimCache& operator=(const SimCache&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h new file mode 100644 index 000000000..9b13c3bdf --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h @@ -0,0 +1,566 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include + +#include "rocksdb/db.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#endif + +namespace ROCKSDB_NAMESPACE { + +// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d +class StackableDB : public DB { + public: + // StackableDB take sole ownership of the underlying db. + explicit StackableDB(DB* db) : db_(db) {} + + // StackableDB take shared ownership of the underlying db. + explicit StackableDB(std::shared_ptr db) + : db_(db.get()), shared_db_ptr_(db) {} + + ~StackableDB() { + if (shared_db_ptr_ == nullptr) { + delete db_; + } else { + assert(shared_db_ptr_.get() == db_); + } + db_ = nullptr; + } + + virtual Status Close() override { return db_->Close(); } + + virtual DB* GetBaseDB() { return db_; } + + virtual DB* GetRootDB() override { return db_->GetRootDB(); } + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamily(options, column_family_name, handle); + } + + virtual Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector& column_family_names, + std::vector* handles) override { + return db_->CreateColumnFamilies(options, column_family_names, handles); + } + + virtual Status CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) override { + return db_->CreateColumnFamilies(column_families, handles); + } + + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override { + return db_->DropColumnFamily(column_family); + } + + virtual Status DropColumnFamilies( + const std::vector& column_families) override { + return db_->DropColumnFamilies(column_families); + } + + virtual Status DestroyColumnFamilyHandle( + ColumnFamilyHandle* column_family) override { + return db_->DestroyColumnFamilyHandle(column_family); + } + + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override { + return db_->Put(options, column_family, key, val); + } + Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) override { + return db_->Put(options, column_family, key, ts, val); + } + + using DB::PutEntity; + Status PutEntity(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) override { + return db_->PutEntity(options, column_family, key, columns); + } + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override { + return db_->Get(options, column_family, key, value); + } + + using DB::GetEntity; + Status GetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableWideColumns* columns) override { + return db_->GetEntity(options, column_family, key, columns); + } + + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override { + return db_->MultiGet(options, column_family, keys, values); + } + + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override { + return db_->MultiGet(options, column_family, num_keys, keys, values, + statuses, sorted_input); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& options) override { + return db_->IngestExternalFile(column_family, external_files, options); + } + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override { + return db_->IngestExternalFiles(args); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, column_family, key, value, value_found); + } + + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); + } + Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) override { + return db_->Delete(wopts, column_family, key, ts); + } + + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->SingleDelete(wopts, column_family, key); + } + Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override { + return db_->SingleDelete(wopts, column_family, key, ts); + } + + using DB::DeleteRange; + Status DeleteRange(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& start_key, + const Slice& end_key) override { + return db_->DeleteRange(wopts, column_family, start_key, end_key); + } + + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return db_->Merge(options, column_family, key, value); + } + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override { + return db_->Merge(options, column_family, key, ts, value); + } + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + return db_->Write(opts, updates); + } + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, column_family); + } + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override { + return db_->NewIterators(options, column_families, iterators); + } + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + using DB::GetMapProperty; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); + } + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override { + return db_->GetMapProperty(column_family, property, value); + } + + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return db_->GetIntProperty(column_family, property, value); + } + + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return db_->GetAggregatedIntProperty(property, value); + } + + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); + } + + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override { + return db_->GetApproximateMemTableStats(column_family, range, count, size); + } + + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override { + return db_->CompactRange(options, column_family, begin, end); + } + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + return db_->CompactFiles(compact_options, column_family, input_file_names, + output_level, output_path_id, output_file_names, + compaction_job_info); + } + + virtual Status PauseBackgroundWork() override { + return db_->PauseBackgroundWork(); + } + virtual Status ContinueBackgroundWork() override { + return db_->ContinueBackgroundWork(); + } + + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override { + return db_->EnableAutoCompaction(column_family_handles); + } + + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return db_->MaxMemCompactionLevel(column_family); + } + + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return db_->Level0StopWriteTrigger(column_family); + } + + virtual const std::string& GetName() const override { return db_->GetName(); } + + virtual Env* GetEnv() const override { return db_->GetEnv(); } + + virtual FileSystem* GetFileSystem() const override { + return db_->GetFileSystem(); + } + + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + return db_->GetOptions(column_family); + } + + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override { + return db_->GetDBOptions(); + } + + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); + } + virtual Status Flush( + const FlushOptions& fopts, + const std::vector& column_families) override { + return db_->Flush(fopts, column_families); + } + + virtual Status SyncWAL() override { return db_->SyncWAL(); } + + virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + + virtual Status LockWAL() override { return db_->LockWAL(); } + + virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + +#ifndef ROCKSDB_LITE + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual void GetLiveFilesMetaData( + std::vector* metadata) override { + db_->GetLiveFilesMetaData(metadata); + } + + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override { + return db_->GetLiveFilesChecksumInfo(checksum_list); + } + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override { + return db_->GetLiveFilesStorageInfo(opts, files); + } + + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer)); + } + + Status StartBlockCacheTrace( + const BlockCacheTraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + + using DB::NewDefaultReplayer; + Status NewDefaultReplayer(const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override { + return db_->NewDefaultReplayer(handles, std::move(reader), replayer); + } + +#endif // ROCKSDB_LITE + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override { + return db_->IncreaseFullHistoryTsLow(column_family, ts_low); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override { + return db_->GetFullHistoryTsLow(column_family, ts_low); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) const override { + return db_->GetDbIdentity(identity); + } + + virtual Status GetDbSessionId(std::string& session_id) const override { + return db_->GetDbSessionId(session_id); + } + + using DB::SetOptions; + virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, + const std::unordered_map& + new_options) override { + return db_->SetOptions(column_family_handle, new_options); + } + + virtual Status SetDBOptions( + const std::unordered_map& new_options) + override { + return db_->SetDBOptions(new_options); + } + + using DB::ResetStats; + virtual Status ResetStats() override { return db_->ResetStats(); } + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfAllTables(column_family, props); + } + + using DB::GetPropertiesOfTablesInRange; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); + } + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) override { + return db_->GetUpdatesSince(seq_number, iter, read_options); + } + + virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, + const Slice* end) override { + return db_->SuggestCompactRange(column_family, begin, end); + } + + virtual Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override { + return db_->PromoteL0(column_family, target_level); + } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return db_->DefaultColumnFamily(); + } + +#ifndef ROCKSDB_LITE + Status TryCatchUpWithPrimary() override { + return db_->TryCatchUpWithPrimary(); + } +#endif // ROCKSDB_LITE + + protected: + DB* db_; + std::shared_ptr shared_db_ptr_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h new file mode 100644 index 000000000..f3a4ba005 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include +#include + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// A factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entries or the ratio of tombstone +// entries in the whole file >= the specified deletion ratio. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entries, or the ratio of tombstone + // entries >= deletion_ratio. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. + CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio); + + ~CompactOnDeletionCollectorFactory() {} + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + size_t GetWindowSize() const { return sliding_window_size_.load(); } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + size_t GetDeletionTrigger() const { return deletion_trigger_.load(); } + // Change deletion ratio. + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. + void SetDeletionRatio(double deletion_ratio) { + deletion_ratio_.store(deletion_ratio); + } + + double GetDeletionRatio() const { return deletion_ratio_.load(); } + static const char* kClassName() { return "CompactOnDeletionCollector"; } + const char* Name() const override { return kClassName(); } + + std::string ToString() const override; + + private: + std::atomic sliding_window_size_; + std::atomic deletion_trigger_; + std::atomic deletion_ratio_; +}; + +// Creates a factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entries, or the ratio of tombstone +// entries >= deletion_ratio. +// +// @param sliding_window_size "N". Note that this number will be +// round up to the smallest multiple of 128 that is no less +// than the specified size. +// @param deletion_trigger "D". Note that even when "N" is changed, +// the specified number for "D" will not be changed. +// @param deletion_ratio, if <= 0 or > 1, disable triggering compaction +// based on deletion ratio. Disabled by default. +extern std::shared_ptr +NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio = 0); +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h new file mode 100644 index 000000000..1d2822988 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction.h @@ -0,0 +1,686 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator; +class TransactionDB; +class WriteBatchWithIndex; + +using TransactionName = std::string; + +using TransactionID = uint64_t; + +using TxnTimestamp = uint64_t; + +constexpr TxnTimestamp kMaxTxnTimestamp = + std::numeric_limits::max(); + +/* + class Endpoint allows to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint allows to define these kind of rangtes. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix. A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + +// Provides notification to the caller of SetSnapshotOnNextOperation when +// the actual snapshot gets created +class TransactionNotifier { + public: + virtual ~TransactionNotifier() {} + + // Implement this method to receive notification when a snapshot is + // requested via SetSnapshotOnNextOperation. + // Do not take exclusive ownership of `newSnapshot` because it is shared with + // the underlying transaction. + virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0; +}; + +// Provides BEGIN/COMMIT/ROLLBACK transactions. +// +// To use transactions, you must first create either an OptimisticTransactionDB +// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for +// more information. +// +// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction(). +// +// It is up to the caller to synchronize access to this object. +// +// See examples/transaction_example.cc for some simple examples. +// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written(or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. + // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of Data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Similar to SetSnapshot(), but will not change the current snapshot + // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called. + // By calling this function, the transaction will essentially call + // SetSnapshot() for you right before performing the next write/GetForUpdate. + // + // Calling SetSnapshotOnNextOperation() will not affect what snapshot is + // returned by GetSnapshot() until the next write/GetForUpdate is executed. + // + // When the snapshot is created the notifier's SnapshotCreated method will + // be called so that the caller can get access to the snapshot. + // + // This is an optimization to reduce the likelihood of conflicts that + // could occur in between the time SetSnapshot() is called and the first + // write/GetForUpdate operation. Eg, this prevents the following + // race-condition: + // + // txn1->SetSnapshot(); + // txn2->Put("A", ...); + // txn2->Commit(); + // txn1->GetForUpdate(opts, "A", ...); // FAIL! + // + // WriteCommittedTxn only: a new snapshot will be taken upon next operation, + // and next operation can be a Commit. + // TODO(yanqin) remove the "write-committed only" limitation. + virtual void SetSnapshotOnNextOperation( + std::shared_ptr notifier = nullptr) = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot() + // is called, or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // The returned snapshot can outlive the transaction. + virtual std::shared_ptr GetTimestampedSnapshot() const = 0; + + // Clears the current snapshot (i.e. no snapshot will be 'set') + // + // This removes any snapshot that currently exists or is set to be created + // on the next update operation (SetSnapshotOnNextOperation). + // + // Calling ClearSnapshot() has no effect on keys written before this function + // has been called. + // + // If a reference to a snapshot was retrieved via GetSnapshot(), it will no + // longer be valid and should be discarded after a call to ClearSnapshot(). + virtual void ClearSnapshot() = 0; + + // Prepare the current transaction for 2PC + virtual Status Prepare() = 0; + + // Write all batched keys to the db atomically. + // + // Returns OK on success. + // + // May return any error status that could be returned by DB:Write(). + // + // If this transaction was created by an OptimisticTransactionDB(), + // Status::Busy() may be returned if the transaction could not guarantee + // that there are no write conflicts. Status::TryAgain() may be returned + // if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain). + // + // If this transaction was created by a TransactionDB(), Status::Expired() + // may be returned if this transaction has lived for longer than + // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if + // TransactionOptions.skip_prepare is false and Prepare is not called on this + // transaction before Commit. + virtual Status Commit() = 0; + + // In addition to Commit(), also creates a snapshot of the db after all + // writes by this txn are visible to other readers. + // Caller is responsible for ensuring that + // snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts + // in which snapshot1 and snapshot2 are created by this API. + // + // Currently only supported by WriteCommittedTxn. Calling this method on + // other types of transactions will return non-ok Status resulting from + // Commit() or a `NotSupported` error. + // This method returns OK if and only if the transaction successfully + // commits. It is possible that transaction commits successfully but fails to + // create a timestamped snapshot. Therefore, the caller should check that the + // snapshot is created. + // notifier will be notified upon next snapshot creation. Nullable. + // ret non-null output argument storing a shared_ptr to the newly created + // snapshot. + Status CommitAndTryCreateSnapshot( + std::shared_ptr notifier = + std::shared_ptr(), + TxnTimestamp ts = kMaxTxnTimestamp, + std::shared_ptr* snapshot = nullptr); + + // Discard all batched writes in this transaction. + virtual Status Rollback() = 0; + + // Records the state of the transaction for future calls to + // RollbackToSavePoint(). May be called multiple times to set multiple save + // points. + virtual void SetSavePoint() = 0; + + // Undo all operations in this transaction (Put, Merge, Delete, PutLogData) + // since the most recent call to SetSavePoint() and removes the most recent + // SetSavePoint(). + // If there is no previous call to SetSavePoint(), returns Status::NotFound() + virtual Status RollbackToSavePoint() = 0; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + + // This function is similar to DB::Get() except it will also read pending + // changes in this transaction. Currently, this function will return + // Status::MergeInProgress if the most recent write to the queried key in + // this batch is a Merge. + // + // If read_options.snapshot is not set, the current version of the key will + // be read. Calling SetSnapshot() does not affect the version of the data + // returned. + // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are + // expected to override this with an implementation that calls + // DBImpl::MultiGet() + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Get(options, column_family, keys[i], &values[i]); + } + } + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction and do_validate is true). If + // do_validate is false, ReadOptions::snapshot is expected to be nullptr so + // that GetForUpdate returns the latest committed value. The transaction + // behavior is the same regardless of whether the key exists or not. + // + // Note: Currently, this function will return Status::MergeInProgress + // if the most recent write to the queried key in this batch is a Merge. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to by outside of this + // transaction. + // + // If this transaction was created by an OptimisticTransaction, GetForUpdate() + // could cause commit() to fail. Otherwise, it could return any error + // that could be returned by DB::Get(). + // + // If this transaction was created by a TransactionDB, it can return + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // Status::MergeInProgress() if merge operations cannot be resolved. + // or other errors if this key could not be read. + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive = true, + const bool do_validate = true) = 0; + + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, + bool exclusive = true, + const bool do_validate = true) { + if (pinnable_val == nullptr) { + std::string* null_str = nullptr; + return GetForUpdate(options, column_family, key, null_str, exclusive, + do_validate); + } else { + auto s = GetForUpdate(options, column_family, key, + pinnable_val->GetSelf(), exclusive, do_validate); + pinnable_val->PinSelf(); + return s; + } + } + + // Get a range lock on [start_endpoint; end_endpoint]. + virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive = true, + const bool do_validate = true) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + + virtual std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) = 0; + + // Returns an iterator that will iterate on all keys in the default + // column family including both keys in the DB and uncommitted keys in this + // transaction. + // + // Setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + // + // Caller is responsible for deleting the returned Iterator. + // + // The returned iterator is only valid until Commit(), Rollback(), or + // RollbackToSavePoint() is called. + virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; + + virtual Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) = 0; + + // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding + // functions in WriteBatch, but will also do conflict checking on the + // keys being written. + // + // assume_tracked=true expects the key be already tracked. More + // specifically, it means the the key was previous tracked in the same + // savepoint, with the same exclusive flag, and at a lower sequence number. + // If valid then it skips ValidateSnapshot. Returns error otherwise. + // + // If this Transaction was created on an OptimisticTransactionDB, these + // functions should always return Status::OK(). + // + // If this Transaction was created on a TransactionDB, the status returned + // can be: + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // or other errors on unexpected failures. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a PessimisticTransactionDB, this + // function will still acquire locks necessary to make sure this write doesn't + // cause conflicts in other transactions and may return Status::Busy(). + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status SingleDeleteUntracked(const Slice& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // By default, all Put/Merge/Delete operations will be indexed in the + // transaction so that Get/GetForUpdate/GetIterator can search for these + // keys. + // + // If the caller does not want to fetch the keys about to be written, + // they may want to avoid indexing as a performance optimization. + // Calling DisableIndexing() will turn off indexing for all future + // Put/Merge/Delete operations until EnableIndexing() is called. + // + // If a key is Put/Merge/Deleted after DisableIndexing is called and then + // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is + // undefined. + virtual void DisableIndexing() = 0; + virtual void EnableIndexing() = 0; + + // Returns the number of distinct Keys being tracked by this transaction. + // If this transaction was created by a TransactionDB, this is the number of + // keys that are currently locked by this transaction. + // If this transaction was created by an OptimisticTransactionDB, this is the + // number of keys that need to be checked for conflicts at commit time. + virtual uint64_t GetNumKeys() const = 0; + + // Returns the number of Puts/Deletes/Merges that have been applied to this + // transaction so far. + virtual uint64_t GetNumPuts() const = 0; + virtual uint64_t GetNumDeletes() const = 0; + virtual uint64_t GetNumMerges() const = 0; + + // Returns the elapsed time in milliseconds since this Transaction began. + virtual uint64_t GetElapsedTime() const = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + // Change the value of TransactionOptions.lock_timeout (in milliseconds) for + // this transaction. + // Has no effect on OptimisticTransactions. + virtual void SetLockTimeout(int64_t timeout) = 0; + + // Return the WriteOptions that will be used during Commit() + virtual WriteOptions* GetWriteOptions() = 0; + + // Reset the WriteOptions that will be used during Commit(). + virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. + virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0; + + // Note: data in the commit-time-write-batch bypasses concurrency control, + // thus should be used with great caution. + // For write-prepared/write-unprepared transactions, + // GetCommitTimeWriteBatch() can be used only if the transaction is started + // with + // `TransactionOptions::use_only_the_last_commit_time_batch_for_recovery` set + // to true. Otherwise, it is possible that two uncommitted versions of the + // same key exist in the database due to the current implementation (see the + // explanation in WritePreparedTxn::CommitInternal). + // During bottommost compaction, RocksDB may + // set the sequence numbers of both to zero once becoming committed, causing + // output SST file to have two identical internal keys. + virtual WriteBatch* GetCommitTimeWriteBatch() = 0; + + virtual void SetLogNumber(uint64_t log) { log_number_ = log; } + + virtual uint64_t GetLogNumber() const { return log_number_; } + + virtual Status SetName(const TransactionName& name) = 0; + + virtual TransactionName GetName() const { return name_; } + + virtual TransactionID GetID() const { return 0; } + + virtual bool IsDeadlockDetect() const { return false; } + + virtual std::vector GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { + assert(false); + return std::vector(); + } + + enum TransactionState { + STARTED = 0, + AWAITING_PREPARE = 1, + PREPARED = 2, + AWAITING_COMMIT = 3, + COMMITTED = 4, + COMMITED = COMMITTED, // old misspelled name + AWAITING_ROLLBACK = 5, + ROLLEDBACK = 6, + LOCKS_STOLEN = 7, + }; + + TransactionState GetState() const { return txn_state_; } + void SetState(TransactionState state) { txn_state_ = state; } + + // NOTE: Experimental feature + // The globally unique id with which the transaction is identified. This id + // might or might not be set depending on the implementation. Similarly the + // implementation decides the point in lifetime of a transaction at which it + // assigns the id. Although currently it is the case, the id is not guaranteed + // to remain the same across restarts. + uint64_t GetId() { return id_; } + + virtual Status SetReadTimestampForValidation(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual Status SetCommitTimestamp(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; } + + protected: + explicit Transaction(const TransactionDB* /*db*/) {} + Transaction() : log_number_(0), txn_state_(STARTED) {} + + // the log in which the prepared section for this txn resides + // (for two phase commit) + uint64_t log_number_; + TransactionName name_; + + // Execution status of the transaction. + std::atomic txn_state_; + + uint64_t id_ = 0; + virtual void SetId(uint64_t id) { + assert(id_ == 0); + id_ = id; + } + + virtual uint64_t GetLastLogNumber() const { return log_number_; } + + private: + friend class PessimisticTransactionDB; + friend class WriteUnpreparedTxnDB; + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 000000000..741c59574 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,508 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support. +// +// See transaction.h and examples/transaction_example.cc + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexFactory; + +enum TxnDBWritePolicy { + WRITE_COMMITTED = 0, // write only the committed data + WRITE_PREPARED, // write data after the prepare phase of 2pc + WRITE_UNPREPARED // write data before the prepare phase of 2pc +}; + +constexpr uint32_t kInitialMaxDeadlocks = 5; + +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. + virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. + // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + // Lock Escalation barrier check function. + // It is called for a couple of endpoints A and B, such that A < B. + // If escalation_barrier_check_func(A, B)==true, then there's a lock + // escalation barrier between A and B, and lock escalation is not allowed + // to bridge the gap between A and B. + // + // The function may be called from any thread that acquires or releases + // locks. It should not throw exceptions. There is currently no way to return + // an error. + using EscalationBarrierFunc = + std::function; + + // Set the user-provided barrier check function + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // Number of times lock acquisition had to wait for a conflicting lock + // to be released. This counts both successful waits (where the desired + // lock was acquired) and waits that timed out or got other error. + uint64_t lock_wait_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Stores the number of latest deadlocks to track + uint32_t max_num_deadlocks = kInitialMaxDeadlocks; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with their own + // separate mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and will block indefinitely when acquiring + // a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock. While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second + + // If set, the TransactionDB will use this implementation of a mutex and + // condition variable for all transaction locking instead of the default + // mutex/condvar implementation. + std::shared_ptr custom_mutex_factory; + + // The policy for when to write the data into the DB. The default policy is to + // write only the committed data (WRITE_COMMITTED). The data could be written + // before the commit phase. The DB then needs to provide the mechanisms to + // tell apart committed from uncommitted data. + TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks. MyRocks uses merge operands for autoinc column id without + // however obtaining locks. This breaks the assumption behind the rollback + // logic in myrocks. This hack of simply not rolling back merge operands works + // for the special way that myrocks uses this operands. + bool rollback_merge_operands = false; + + // nullptr means use default lock manager. + // Other value means the user provides a custom lock manager. + std::shared_ptr lock_mgr_handle; + + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjunction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + int64_t default_write_batch_flush_threshold = 0; + + // This option is valid only for write-prepared/write-unprepared. Transaction + // will rely on this callback to determine if a key should be rolled back + // with Delete or SingleDelete when necessary. If the callback returns true, + // then SingleDelete should be used. If the callback is not callable or the + // callback returns false, then a Delete is used. + // The application should ensure thread-safety of this callback. + // The callback should not throw because RocksDB is not exception-safe. + // The callback may be removed if we allow mixing Delete and SingleDelete in + // the future. + std::function + rollback_deletion_type_callback; + + private: + // 128 entries + // Should the default value change, please also update wp_snapshot_cache_bits + // in db_stress_gflags.cc + size_t wp_snapshot_cache_bits = static_cast(7); + // 8m entry, 64MB size + // Should the default value change, please also update wp_commit_cache_bits + // in db_stress_gflags.cc + size_t wp_commit_cache_bits = static_cast(23); + + // For testing, whether transaction name should be auto-generated or not. This + // is useful for write unprepared which requires named transactions. + bool autogenerate_name = false; + + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxn; + friend class WritePreparedTransactionTestBase; + friend class TransactionTestBase; + friend class MySQLStyleTransactionTest; + friend class StressTest; +}; + +struct TransactionOptions { + // Setting set_snapshot=true is the same as calling + // Transaction::SetSnapshot(). + bool set_snapshot = false; + + // Setting to true means that before acquiring locks, this transaction will + // check if doing so will cause a deadlock. If so, it will return with + // Status::Busy. The user should retry their transaction. + bool deadlock_detect = false; + + // If set, it states that the CommitTimeWriteBatch represents the latest state + // of the application, has only one sub-batch, i.e., no duplicate keys, and + // meant to be used later during recovery. It enables an optimization to + // postpone updating the memtable with CommitTimeWriteBatch to only + // SwitchMemtable or recovery. + // This option does not affect write-committed. Only + // write-prepared/write-unprepared transactions will be affected. + bool use_only_the_last_commit_time_batch_for_recovery = false; + + // TODO(agiardullo): TransactionDB does not yet support comparators that allow + // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being written by other writers. + int64_t expiration = -1; + + // The number of traversals to make during deadlock detection. + int64_t deadlock_detect_depth = 50; + + // The maximum number of bytes used for the write batch. 0 means no limit. + size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This could be as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. It could also be used during recovery if (i) + // application guarantees no conflict between prepared transactions in the WAL + // (ii) application guarantees that recovered transactions will be rolled + // back/commit before new transactions start. + // Default: false + bool skip_concurrency_control = false; + + // In pessimistic transaction, if this is true, then you can skip Prepare + // before Commit, otherwise, you must Prepare before Commit. + bool skip_prepare = true; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; +}; + +// The per-write optimizations that do not involve transactions. TransactionDB +// implementation might or might not make use of the specified optimizations. +struct TransactionDBWriteOptimizations { + // If it is true it means that the application guarantees that the + // key-set in the write batch do not conflict with any concurrent transaction + // and hence the concurrency control mechanism could be skipped for this + // write. + bool skip_concurrency_control = false; + // If true, the application guarantees that there is no duplicate in the write batch and any employed mechanism to handle + // duplicate keys could be skipped. + bool skip_duplicate_key_check = false; +}; + +struct KeyLockInfo { + std::string key; + std::vector ids; + bool exclusive; +}; + +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector ids; + bool exclusive; +}; + +struct DeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + std::string m_waiting_key; +}; + +struct DeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit DeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +class TransactionDB : public StackableDB { + public: + // Optimized version of ::Write that receives more optimization request such + // as skip_concurrency_control. + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations&, + WriteBatch* updates) { + // The default implementation ignores TransactionDBWriteOptimizations and + // falls back to the un-optimized version of ::Write + return Write(opts, updates); + } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + using StackableDB::DeleteRange; + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + // Open a TransactionDB similar to DB::Open(). + // Internally call PrepareWrap() and WrapDB() + // If the return status is not ok, then dbptr is set to nullptr. + static Status Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + TransactionDB** dbptr); + // Note: PrepareWrap() may change parameters, make copies before the + // invocation if needed. + static void PrepareWrap(DBOptions* db_options, + std::vector* column_families, + std::vector* compaction_enabled_cf_indices); + // If the return status is not ok, then dbptr will bet set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. + static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, + TransactionDB** dbptr); + // If the return status is not ok, then dbptr will bet set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. + static Status WrapStackableDB( + StackableDB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr); + // Since the destructor in StackableDB is virtual, this destructor is virtual + // too. The root db will be deleted by the base's destructor. + ~TransactionDB() override {} + + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. + // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; + virtual void GetAllPreparedTransactions(std::vector* trans) = 0; + + // Returns set of all locks held. + // + // The mapping is column family id -> KeyLockInfo + virtual std::unordered_multimap + GetLockStatusData() = 0; + + virtual std::vector GetDeadlockInfoBuffer() = 0; + virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; + + // Create a snapshot and assign ts to it. Return the snapshot to caller. The + // snapshot-timestamp mapping is also tracked by the database. + // Caller must ensure there are no active writes when this API is called. + virtual std::pair> + CreateTimestampedSnapshot(TxnTimestamp ts) = 0; + + // Return the latest timestamped snapshot if present. + std::shared_ptr GetLatestTimestampedSnapshot() const { + return GetTimestampedSnapshot(kMaxTxnTimestamp); + } + // Return the snapshot correponding to given timestamp. If ts is + // kMaxTxnTimestamp, then we return the latest timestamped snapshot if + // present. Othersise, we return the snapshot whose timestamp is equal to + // `ts`. If no such snapshot exists, then we return null. + virtual std::shared_ptr GetTimestampedSnapshot( + TxnTimestamp ts) const = 0; + // Release timestamped snapshots whose timestamps are less than or equal to + // ts. + virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0; + + // Get all timestamped snapshots which will be stored in + // timestamped_snapshots. + Status GetAllTimestampedSnapshots( + std::vector>& timestamped_snapshots) + const { + return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp, + timestamped_snapshots); + } + + // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub). + // timestamped_snapshots will be cleared and contain returned snapshots. + virtual Status GetTimestampedSnapshots( + TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& timestamped_snapshots) + const = 0; + + protected: + // To Create an TransactionDB, call Open() + // The ownership of db is transferred to the base StackableDB + explicit TransactionDB(DB* db) : StackableDB(db) {} + // No copying allowed + TransactionDB(const TransactionDB&) = delete; + void operator=(const TransactionDB&) = delete; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h new file mode 100644 index 000000000..e352f325a --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// TransactionDBMutex and TransactionDBCondVar APIs allows applications to +// implement custom mutexes and condition variables to be used by a +// TransactionDB when locking keys. +// +// To open a TransactionDB with a custom TransactionDBMutexFactory, set +// TransactionDBOptions.custom_mutex_factory. +class TransactionDBMutex { + public: + virtual ~TransactionDBMutex() {} + + // Attempt to acquire lock. Return OK on success, or other Status on failure. + // If returned status is OK, TransactionDB will eventually call UnLock(). + virtual Status Lock() = 0; + + // Attempt to acquire lock. If timeout is non-negative, operation may be + // failed after this many microseconds. + // Returns OK on success, + // TimedOut if timed out, + // or other Status on failure. + // If returned status is OK, TransactionDB will eventually call UnLock(). + virtual Status TryLockFor(int64_t timeout_time) = 0; + + // Unlock Mutex that was successfully locked by Lock() or TryLockUntil() + virtual void UnLock() = 0; +}; + +class TransactionDBCondVar { + public: + virtual ~TransactionDBCondVar() {} + + // Block current thread until condition variable is notified by a call to + // Notify() or NotifyAll(). Wait() will be called with mutex locked. + // Returns OK if notified. + // Returns non-OK if TransactionDB should stop waiting and fail the operation. + // May return OK spuriously even if not notified. + virtual Status Wait(std::shared_ptr mutex) = 0; + + // Block current thread until condition variable is notified by a call to + // Notify() or NotifyAll(), or if the timeout is reached. + // Wait() will be called with mutex locked. + // + // If timeout is non-negative, operation should be failed after this many + // microseconds. + // If implementing a custom version of this class, the implementation may + // choose to ignore the timeout. + // + // Returns OK if notified. + // Returns TimedOut if timeout is reached. + // Returns other status if TransactionDB should otherwise stop waiting and + // fail the operation. + // May return OK spuriously even if not notified. + virtual Status WaitFor(std::shared_ptr mutex, + int64_t timeout_time) = 0; + + // If any threads are waiting on *this, unblock at least one of the + // waiting threads. + virtual void Notify() = 0; + + // Unblocks all threads waiting on *this. + virtual void NotifyAll() = 0; +}; + +// Factory class that can allocate mutexes and condition variables. +class TransactionDBMutexFactory { + public: + // Create a TransactionDBMutex object. + virtual std::shared_ptr AllocateMutex() = 0; + + // Create a TransactionDBCondVar object. + virtual std::shared_ptr AllocateCondVar() = 0; + + virtual ~TransactionDBMutexFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h new file mode 100644 index 000000000..84dc11a31 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h @@ -0,0 +1,309 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_batch_base.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class Comparator; +class DB; +class ReadCallback; +struct ReadOptions; +struct DBOptions; + +enum WriteType { + kPutRecord, + kMergeRecord, + kDeleteRecord, + kSingleDeleteRecord, + kDeleteRangeRecord, + kLogDataRecord, + kXIDRecord, + kUnknownRecord, +}; + +// an entry for Put, Merge, Delete, or SingleDelete entry for write batches. +// Used in WBWIIterator. +struct WriteEntry { + WriteType type = kUnknownRecord; + Slice key; + Slice value; +}; + +// Iterator of one column family out of a WriteBatchWithIndex. +class WBWIIterator { + public: + virtual ~WBWIIterator() {} + + virtual bool Valid() const = 0; + + virtual void SeekToFirst() = 0; + + virtual void SeekToLast() = 0; + + virtual void Seek(const Slice& key) = 0; + + virtual void SeekForPrev(const Slice& key) = 0; + + virtual void Next() = 0; + + virtual void Prev() = 0; + + // the return WriteEntry is only valid until the next mutation of + // WriteBatchWithIndex + virtual WriteEntry Entry() const = 0; + + virtual Status status() const = 0; +}; + +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. +// In Put(), Merge() Delete(), or SingleDelete(), the same function of the +// wrapped will be called. At the same time, indexes will be built. +// By calling GetWriteBatch(), a user will get the WriteBatch for the data +// they inserted, which can be used for DB::Write(). +// A user can call NewIterator() to create an iterator. +class WriteBatchWithIndex : public WriteBatchBase { + public: + // backup_index_comparator: the backup comparator used to compare keys + // within the same column family, if column family is not given in the + // interface, or we can't find a column family from the column family handle + // passed in, backup_index_comparator will be used for the column family. + // reserved_bytes: reserved bytes in underlying WriteBatch + // max_bytes: maximum size of underlying WriteBatch in bytes + // overwrite_key: if true, overwrite the key in the index when inserting + // the same key as previously, so iterator will never + // show two entries with the same key. + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0, bool overwrite_key = false, + size_t max_bytes = 0, size_t protection_bytes_per_key = 0); + + ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); + + using WriteBatchBase::Put; + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Put(const Slice& key, const Slice& value) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) override; + + Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */, + const WideColumns& /* columns */) override { + if (!column_family) { + return Status::InvalidArgument( + "Cannot call this method without a column family handle"); + } + + return Status::NotSupported( + "PutEntity not supported by WriteBatchWithIndex"); + } + + using WriteBatchBase::Merge; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Merge(const Slice& key, const Slice& value) override; + Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*ts*/, const Slice& /*value*/) override { + return Status::NotSupported( + "Merge does not support user-defined timestamp"); + } + + using WriteBatchBase::Delete; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + using WriteBatchBase::SingleDelete; + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + using WriteBatchBase::DeleteRange; + Status DeleteRange(ColumnFamilyHandle* /* column_family */, + const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + Status DeleteRange(const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + Status DeleteRange(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin_key*/, const Slice& /*end_key*/, + const Slice& /*ts*/) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + + using WriteBatchBase::PutLogData; + Status PutLogData(const Slice& blob) override; + + using WriteBatchBase::Clear; + void Clear() override; + + using WriteBatchBase::GetWriteBatch; + WriteBatch* GetWriteBatch() override; + + // Create an iterator of a column family. User can call iterator.Seek() to + // search to the next entry of or after a key. Keys will be iterated in the + // order given by index_comparator. For multiple updates on the same key, + // each update will be returned as a separate entry, in the order of update + // time. + // + // The returned iterator should be deleted by the caller. + WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + // Create an iterator of the default column family. + WBWIIterator* NewIterator(); + + // Will create a new Iterator that will use WBWIIterator as a delta and + // base_iterator as base. + // + // This function is only supported if the WriteBatchWithIndex was + // constructed with overwrite_key=true. + // + // The returned iterator should be deleted by the caller. + // The base_iterator is now 'owned' by the returned iterator. Deleting the + // returned iterator will also delete the base_iterator. + // + // Updating write batch with the current key of the iterator is not safe. + // We strongly recommend users not to do it. It will invalidate the current + // key() and value() of the iterator. This invalidation happens even before + // the write batch update finishes. The state may recover after Next() is + // called. + Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, + Iterator* base_iterator, + const ReadOptions* opts = nullptr); + // default column family + Iterator* NewIteratorWithBase(Iterator* base_iterator); + + // Similar to DB::Get() but will only read the key from this batch. + // If the batch does not have enough data to resolve Merge operations, + // MergeInProgress status may be returned. + Status GetFromBatch(ColumnFamilyHandle* column_family, + const DBOptions& options, const Slice& key, + std::string* value); + + // Similar to previous function but does not require a column_family. + // Note: An InvalidArgument status will be returned if there are any Merge + // operators for this key. Use previous method instead. + Status GetFromBatch(const DBOptions& options, const Slice& key, + std::string* value) { + return GetFromBatch(nullptr, options, key, value); + } + + // Similar to DB::Get() but will also read writes from this batch. + // + // This function will query both this batch and the DB and then merge + // the results using the DB's merge operator (if the batch contains any + // merge requests). + // + // Setting read_options.snapshot will affect what is read from the DB + // but will NOT change which keys are read from the batch (the keys in + // this batch do not yet belong to any snapshot and will be fetched + // regardless). + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, PinnableSlice* value); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value); + + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input); + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + void SetSavePoint() override; + + // Remove all entries in this batch (Put, Merge, Delete, SingleDelete, + // PutLogData) since the most recent call to SetSavePoint() and removes the + // most recent save point. + // If there is no previous call to SetSavePoint(), behaves the same as + // Clear(). + // + // Calling RollbackToSavePoint invalidates any open iterators on this batch. + // + // Returns Status::OK() on success, + // Status::NotFound() if no previous call to SetSavePoint(), + // or other Status on corruption. + Status RollbackToSavePoint() override; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status PopSavePoint() override; + + void SetMaxBytes(size_t max_bytes) override; + size_t GetDataSize() const; + + private: + friend class PessimisticTransactionDB; + friend class WritePreparedTxn; + friend class WriteUnpreparedTxn; + friend class WriteBatchWithIndex_SubBatchCnt_Test; + friend class WriteBatchWithIndexInternal; + // Returns the number of sub-batches inside the write batch. A sub-batch + // starts right before inserting a key that is a duplicate of a key in the + // last sub-batch. + size_t SubBatchCnt(); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, ReadCallback* callback); + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input, ReadCallback* callback); + struct Rep; + std::unique_ptr rep; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h new file mode 100644 index 000000000..c54f3a2c3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/version.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +// NOTE: in 'main' development branch, this should be the *next* +// minor or major version number planned for release. +#define ROCKSDB_MAJOR 7 +#define ROCKSDB_MINOR 9 +#define ROCKSDB_PATCH 2 + +// Do not use these. We made the mistake of declaring macros starting with +// double underscore. Now we have to live with our choice. We'll deprecate these +// at some point +#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR +#define __ROCKSDB_MINOR__ ROCKSDB_MINOR +#define __ROCKSDB_PATCH__ ROCKSDB_PATCH + +namespace ROCKSDB_NAMESPACE { +// Returns a set of properties indicating how/when/where this version of RocksDB +// was created. +const std::unordered_map& GetRocksBuildProperties(); + +// Returns the current version of RocksDB as a string (e.g. "6.16.0"). +// If with_patch is true, the patch is included (6.16.x). +// Otherwise, only major and minor version is included (6.16) +std::string GetRocksVersionAsString(bool with_patch = true); + +// Gets the set of build properties (@see GetRocksBuildProperties) into a +// string. Properties are returned one-per-line, with the first line being: +// " from RocksDB . +// If verbose is true, the full set of properties is +// printed. If verbose is false, only the version information (@see +// GetRocksVersionString) is printed. +std::string GetRocksBuildInfoAsString(const std::string& program, + bool verbose = false); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h new file mode 100644 index 000000000..3e66c39e4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/wal_filter.h @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteBatch; +struct ConfigOptions; + +// WALFilter allows an application to inspect write-ahead-log (WAL) +// records or modify their processing on recovery. +// Please see the details below. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class WalFilter : public Customizable { + public: + static const char* Type() { return "WalFilter"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, WalFilter** result); + enum class WalProcessingOption { + // Continue processing as usual + kContinueProcessing = 0, + // Ignore the current record but continue processing of log(s) + kIgnoreCurrentRecord = 1, + // Stop replay of logs and discard logs + // Logs won't be replayed on subsequent recovery + kStopReplay = 2, + // Corrupted record detected by filter + kCorruptedRecord = 3, + // Marker for enum count + kWalProcessingOptionMax = 4 + }; + + virtual ~WalFilter() {} + + // Provide ColumnFamily->LogNumber map to filter + // so that filter can determine whether a log number applies to a given + // column family (i.e. that log hasn't been flushed to SST already for the + // column family). + // We also pass in name->id map as only name is known during + // recovery (as handles are opened post-recovery). + // while write batch callbacks happen in terms of column family id. + // + // @params cf_lognumber_map column_family_id to lognumber map + // @params cf_name_id_map column_family_name to column_family_id map + + virtual void ColumnFamilyLogNumberMap( + const std::map& /*cf_lognumber_map*/, + const std::map& /*cf_name_id_map*/) {} + + // LogRecord is invoked for each log record encountered for all the logs + // during replay on logs on recovery. This method can be used to: + // * inspect the record (using the batch parameter) + // * ignoring current record + // (by returning WalProcessingOption::kIgnoreCurrentRecord) + // * reporting corrupted record + // (by returning WalProcessingOption::kCorruptedRecord) + // * stop log replay + // (by returning kStop replay) - please note that this implies + // discarding the logs from current record onwards. + // + // @params log_number log_number of the current log. + // Filter might use this to determine if the log + // record is applicable to a certain column family. + // @params log_file_name log file name - only for informational purposes + // @params batch batch encountered in the log during recovery + // @params new_batch new_batch to populate if filter wants to change + // the batch (for example to filter some records out, + // or alter some records). + // Please note that the new batch MUST NOT contain + // more records than original, else recovery would + // be failed. + // @params batch_changed Whether batch was changed by the filter. + // It must be set to true if new_batch was populated, + // else new_batch has no effect. + // @returns Processing option for the current record. + // Please see WalProcessingOption enum above for + // details. + virtual WalProcessingOption LogRecordFound( + unsigned long long /*log_number*/, const std::string& /*log_file_name*/, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { + // Default implementation falls back to older function for compatibility + return LogRecord(batch, new_batch, batch_changed); + } + + // Please see the comments for LogRecord above. This function is for + // compatibility only and contains a subset of parameters. + // New code should use the function above. + virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const { + return WalProcessingOption::kContinueProcessing; + } + + // Returns a name that identifies this WAL filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/wide_columns.h b/src/rocksdb/include/rocksdb/wide_columns.h new file mode 100644 index 000000000..7ddc61f03 --- /dev/null +++ b/src/rocksdb/include/rocksdb/wide_columns.h @@ -0,0 +1,171 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// Class representing a wide column, which is defined as a pair of column name +// and column value. +class WideColumn { + public: + WideColumn() = default; + + // Initializes a WideColumn object by forwarding the name and value + // arguments to the corresponding member Slices. This makes it possible to + // construct a WideColumn using combinations of const char*, const + // std::string&, const Slice& etc., for example: + // + // constexpr char foo[] = "foo"; + // const std::string bar("bar"); + // WideColumn column(foo, bar); + template + WideColumn(N&& name, V&& value) + : name_(std::forward(name)), value_(std::forward(value)) {} + + // Initializes a WideColumn object by forwarding the elements of + // name_tuple and value_tuple to the constructors of the corresponding member + // Slices. This makes it possible to initialize the Slices using the Slice + // constructors that take more than one argument, for example: + // + // constexpr char foo_name[] = "foo_name"; + // constexpr char bar_value[] = "bar_value"; + // WideColumn column(std::piecewise_construct, + // std::forward_as_tuple(foo_name, 3), + // std::forward_as_tuple(bar_value, 3)); + template + WideColumn(std::piecewise_construct_t, NTuple&& name_tuple, + VTuple&& value_tuple) + : name_(std::make_from_tuple(std::forward(name_tuple))), + value_(std::make_from_tuple(std::forward(value_tuple))) { + } + + const Slice& name() const { return name_; } + const Slice& value() const { return value_; } + + Slice& name() { return name_; } + Slice& value() { return value_; } + + private: + Slice name_; + Slice value_; +}; + +// Note: column names and values are compared bytewise. +inline bool operator==(const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name() == rhs.name() && lhs.value() == rhs.value(); +} + +inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) { + return !(lhs == rhs); +} + +inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) { + const bool hex = + (os.flags() & std::ios_base::basefield) == std::ios_base::hex; + os << column.name().ToString(hex) << ':' << column.value().ToString(hex); + + return os; +} + +// A collection of wide columns. +using WideColumns = std::vector; + +// The anonymous default wide column (an empty Slice). +extern const Slice kDefaultWideColumnName; + +// An empty set of wide columns. +extern const WideColumns kNoWideColumns; + +// A self-contained collection of wide columns. Used for the results of +// wide-column queries. +class PinnableWideColumns { + public: + const WideColumns& columns() const { return columns_; } + size_t serialized_size() const { return value_.size(); } + + void SetPlainValue(const Slice& value); + void SetPlainValue(const Slice& value, Cleanable* cleanable); + + Status SetWideColumnValue(const Slice& value); + Status SetWideColumnValue(const Slice& value, Cleanable* cleanable); + + void Reset(); + + private: + void CopyValue(const Slice& value); + void PinOrCopyValue(const Slice& value, Cleanable* cleanable); + void CreateIndexForPlainValue(); + Status CreateIndexForWideColumns(); + + PinnableSlice value_; + WideColumns columns_; +}; + +inline void PinnableWideColumns::CopyValue(const Slice& value) { + value_.PinSelf(value); +} + +inline void PinnableWideColumns::PinOrCopyValue(const Slice& value, + Cleanable* cleanable) { + if (!cleanable) { + CopyValue(value); + return; + } + + value_.PinSlice(value, cleanable); +} + +inline void PinnableWideColumns::CreateIndexForPlainValue() { + columns_ = WideColumns{{kDefaultWideColumnName, value_}}; +} + +inline void PinnableWideColumns::SetPlainValue(const Slice& value) { + CopyValue(value); + CreateIndexForPlainValue(); +} + +inline void PinnableWideColumns::SetPlainValue(const Slice& value, + Cleanable* cleanable) { + PinOrCopyValue(value, cleanable); + CreateIndexForPlainValue(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) { + CopyValue(value); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value, + Cleanable* cleanable) { + PinOrCopyValue(value, cleanable); + return CreateIndexForWideColumns(); +} + +inline void PinnableWideColumns::Reset() { + value_.Reset(); + columns_.clear(); +} + +inline bool operator==(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return lhs.columns() == rhs.columns(); +} + +inline bool operator!=(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return !(lhs == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h new file mode 100644 index 000000000..61ba5a739 --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_batch.h @@ -0,0 +1,494 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/write_batch_base.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class ColumnFamilyHandle; +struct SavePoints; +struct SliceParts; + +struct SavePoint { + size_t size; // size of rep_ + int count; // count of elements in rep_ + uint32_t content_flags; + + SavePoint() : size(0), count(0), content_flags(0) {} + + SavePoint(size_t _size, int _count, uint32_t _flags) + : size(_size), count(_count), content_flags(_flags) {} + + void clear() { + size = 0; + count = 0; + content_flags = 0; + } + + bool is_cleared() const { return (size | count | content_flags) == 0; } +}; + +class WriteBatch : public WriteBatchBase { + public: + explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0) + : WriteBatch(reserved_bytes, max_bytes, 0, 0) {} + + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key, size_t default_cf_ts_sz); + ~WriteBatch() override; + + using WriteBatchBase::Put; + // Store the mapping "key->value" in the database. + // The following Put(..., const Slice& key, ...) API can also be used when + // user-defined timestamp is enabled as long as `key` points to a contiguous + // buffer with timestamp appended after user key. The caller is responsible + // for setting up the memory buffer pointed to by `key`. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) override; + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + // The following Put(..., const SliceParts& key, ...) API can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the + // column family specified by "column_family". + using WriteBatchBase::PutEntity; + Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) override; + + using WriteBatchBase::Delete; + // If the database contains a mapping for "key", erase it. Else do nothing. + // The following Delete(..., const Slice& key) can be used when user-defined + // timestamp is enabled as long as `key` points to a contiguous buffer with + // timestamp appended after user key. The caller is responsible for setting + // up the memory buffer pointed to by `key`. + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + // These two variants of Delete(..., const SliceParts& key) can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + using WriteBatchBase::SingleDelete; + // WriteBatch implementation of DB::SingleDelete(). See db.h. + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + using WriteBatchBase::DeleteRange; + // WriteBatch implementation of DB::DeleteRange(). See db.h. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key) override; + Status DeleteRange(const Slice& begin_key, const Slice& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + // begin_key and end_key should be user keys without timestamp. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key, const Slice& ts) override; + + // variant that takes SliceParts + Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) override; + Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + + using WriteBatchBase::Merge; + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*ts*/, const Slice& /*value*/) override; + + // variant that takes SliceParts + Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Merge(const SliceParts& key, const SliceParts& value) override { + return Merge(nullptr, key, value); + } + + using WriteBatchBase::PutLogData; + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + Status PutLogData(const Slice& blob) override; + + using WriteBatchBase::Clear; + // Clear all updates buffered in this batch. + void Clear() override; + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + void SetSavePoint() override; + + // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the + // most recent call to SetSavePoint() and removes the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status RollbackToSavePoint() override; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status PopSavePoint() override; + + // Support for iterating over the contents of a batch. + // Objects of subclasses of Handler will be used by WriteBatch::Iterate(). + class Handler { + public: + virtual ~Handler(); + // All handler functions in this class provide default implementations so + // we won't break existing clients of Handler on a source code level when + // adding a new member function. + + // default implementation will just call Put without column family for + // backwards compatibility. If the column family is not default, + // the function is noop + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + // Put() historically doesn't return status. We didn't want to be + // backwards incompatible so we didn't change the return status + // (this is a public API). We do an ordinary get and return Status::OK() + Put(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and PutCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {} + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status PutEntityCF(uint32_t /* column_family_id */, + const Slice& /* key */, + const Slice& /* entity */) { + return Status::NotSupported("PutEntityCF not implemented"); + } + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + Delete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and DeleteCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual void Delete(const Slice& /*key*/) {} + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + SingleDelete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and SingleDeleteCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual void SingleDelete(const Slice& /*key*/) {} + + // If user-defined timestamp is enabled, then `begin_key` and `end_key` + // both include timestamp. + virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, + const Slice& /*begin_key*/, + const Slice& /*end_key*/) { + return Status::InvalidArgument("DeleteRangeCF not implemented"); + } + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} + + // If user-defined timestamp is enabled, then `key` includes timestamp. + virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/, + const Slice& /*key*/, + const Slice& /*value*/) { + return Status::InvalidArgument("PutBlobIndexCF not implemented"); + } + + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + + virtual Status MarkBeginPrepare(bool = false) { + return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); + } + + virtual Status MarkEndPrepare(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkEndPrepare() handler not defined."); + } + + virtual Status MarkNoop(bool /*empty_batch*/) { + return Status::InvalidArgument("MarkNoop() handler not defined."); + } + + virtual Status MarkRollback(const Slice& /*xid*/) { + return Status::InvalidArgument( + "MarkRollbackPrepare() handler not defined."); + } + + virtual Status MarkCommit(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkCommit() handler not defined."); + } + + virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/, + const Slice& /*commit_ts*/) { + return Status::InvalidArgument( + "MarkCommitWithTimestamp() handler not defined."); + } + + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + + protected: + friend class WriteBatchInternal; + enum class OptionState { + kUnknown, + kDisabled, + kEnabled, + }; + virtual OptionState WriteAfterCommit() const { + return OptionState::kUnknown; + } + virtual OptionState WriteBeforePrepare() const { + return OptionState::kUnknown; + } + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + uint32_t Count() const; + + // Returns true if PutCF will be called during Iterate + bool HasPut() const; + + // Returns true if PutEntityCF will be called during Iterate + bool HasPutEntity() const; + + // Returns true if DeleteCF will be called during Iterate + bool HasDelete() const; + + // Returns true if SingleDeleteCF will be called during Iterate + bool HasSingleDelete() const; + + // Returns true if DeleteRangeCF will be called during Iterate + bool HasDeleteRange() const; + + // Returns true if MergeCF will be called during Iterate + bool HasMerge() const; + + // Returns true if MarkBeginPrepare will be called during Iterate + bool HasBeginPrepare() const; + + // Returns true if MarkEndPrepare will be called during Iterate + bool HasEndPrepare() const; + + // Returns true if MarkCommit will be called during Iterate + bool HasCommit() const; + + // Returns true if MarkRollback will be called during Iterate + bool HasRollback() const; + + // Experimental. + // + // Update timestamps of existing entries in the write batch if + // applicable. If a key is intended for a column family that disables + // timestamp, then this API won't set the timestamp for this key. + // This requires that all keys, if enable timestamp, (possibly from multiple + // column families) in the write batch have timestamps of the same format. + // + // ts_sz_func: callable object to obtain the timestamp sizes of column + // families. If ts_sz_func() accesses data structures, then the caller of this + // API must guarantee thread-safety. Like other parts of RocksDB, this API is + // not exception-safe. Therefore, ts_sz_func() must not throw. + // + // in: cf, the column family id. + // ret: timestamp size of the given column family. Return + // std::numeric_limits::max() indicating "don't know or column + // family info not found", this will cause UpdateTimestamps() to fail. + // size_t ts_sz_func(uint32_t cf); + Status UpdateTimestamps(const Slice& ts, + std::function ts_sz_func); + + // Verify the per-key-value checksums of this write batch. + // Corruption status will be returned if the verification fails. + // If this write batch does not have per-key-value checksum, + // OK status will be returned. + Status VerifyChecksum() const; + + using WriteBatchBase::GetWriteBatch; + WriteBatch* GetWriteBatch() override { return this; } + + // Constructor with a serialized string object + explicit WriteBatch(const std::string& rep); + explicit WriteBatch(std::string&& rep); + + WriteBatch(const WriteBatch& src); + WriteBatch(WriteBatch&& src) noexcept; + WriteBatch& operator=(const WriteBatch& src); + WriteBatch& operator=(WriteBatch&& src); + + // marks this point in the WriteBatch as the last record to + // be inserted into the WAL, provided the WAL is enabled + void MarkWalTerminationPoint(); + const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; } + + void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + + struct ProtectionInfo; + size_t GetProtectionBytesPerKey() const; + + private: + friend class WriteBatchInternal; + friend class LocalSavePoint; + // TODO(myabandeh): this is needed for a hack to collapse the write batch and + // remove duplicate keys. Remove it when the hack is replaced with a proper + // solution. + friend class WriteBatchWithIndex; + std::unique_ptr save_points_; + + // When sending a WriteBatch through WriteImpl we might want to + // specify that only the first x records of the batch be written to + // the WAL. + SavePoint wal_term_point_; + + // Is the content of the batch the application's latest state that meant only + // to be used for recovery? Refer to + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for + // more details. + bool is_latest_persistent_state_ = false; + + // False if all keys are from column families that disable user-defined + // timestamp OR UpdateTimestamps() has been called at least once. + // This flag will be set to true if any of the above Put(), Delete(), + // SingleDelete(), etc. APIs are called at least once. + // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag + // to true because the assumption is that these APIs have already set the + // timestamps to desired values. + bool needs_in_place_update_ts_ = false; + + // True if the write batch contains at least one key from a column family + // that enables user-defined timestamp. + bool has_key_with_ts_ = false; + + // For HasXYZ. Mutable to allow lazy computation of results + mutable std::atomic content_flags_; + + // Performs deferred computation of content_flags if necessary + uint32_t ComputeContentFlags() const; + + // Maximum size of rep_. + size_t max_bytes_; + + std::unique_ptr prot_info_; + + size_t default_cf_ts_sz_ = 0; + + protected: + std::string rep_; // See comment in write_batch.cc for the format of rep_ +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h new file mode 100644 index 000000000..f6f39ef0b --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_batch_base.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/wide_columns.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Status; +class ColumnFamilyHandle; +class WriteBatch; +struct SliceParts; + +// Abstract base class that defines the basic interface for a write batch. +// See WriteBatch for a basic implementation and WrithBatchWithIndex for an +// indexed implementation. +class WriteBatchBase { + public: + virtual ~WriteBatchBase() {} + + // Store the mapping "key->value" in the database. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) = 0; + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Put(const SliceParts& key, const SliceParts& value); + + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the + // column family specified by "column_family". + virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) = 0; + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) = 0; + + // variant that takes SliceParts + virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Merge(const SliceParts& key, const SliceParts& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status Delete(const SliceParts& key); + + // If the database contains a mapping for "key", erase it. Expects that the + // key was not overwritten. Else do nothing. + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status SingleDelete(const SliceParts& key); + + // If the database contains mappings in the range ["begin_key", "end_key"), + // erase them. Else do nothing. + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key); + virtual Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + virtual Status PutLogData(const Slice& blob) = 0; + + // Clear all updates buffered in this batch. + virtual void Clear() = 0; + + // Covert this batch into a WriteBatch. This is an abstracted way of + // converting any WriteBatchBase(eg WriteBatchWithIndex) into a basic + // WriteBatch. + virtual WriteBatch* GetWriteBatch() = 0; + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + virtual void SetSavePoint() = 0; + + // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the + // most recent call to SetSavePoint() and removes the most recent save point. + // If there is no previous call to SetSavePoint(), behaves the same as + // Clear(). + virtual Status RollbackToSavePoint() = 0; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + + // Sets the maximum size of the write batch in bytes. 0 means no limit. + virtual void SetMaxBytes(size_t max_bytes) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h new file mode 100644 index 000000000..7fb18196d --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBufferManager is for managing memory allocation for one or more +// MemTables. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { +class CacheReservationManager; + +// Interface to block and signal DB instances, intended for RocksDB +// internal use only. Each DB instance contains ptr to StallInterface. +class StallInterface { + public: + virtual ~StallInterface() {} + + virtual void Block() = 0; + + virtual void Signal() = 0; +}; + +class WriteBufferManager final { + public: + // Parameters: + // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. + // memory_usage() won't be valid and ShouldFlush() will always return true. + // + // cache_: if `cache` is provided, we'll put dummy entries in the cache and + // cost the memory allocated to the cache. It can be used even if _buffer_size + // = 0. + // + // allow_stall: if set true, it will enable stalling of writes when + // memory_usage() exceeds buffer_size. It will wait for flush to complete and + // memory usage to drop down. + explicit WriteBufferManager(size_t _buffer_size, + std::shared_ptr cache = {}, + bool allow_stall = false); + // No copying allowed + WriteBufferManager(const WriteBufferManager&) = delete; + WriteBufferManager& operator=(const WriteBufferManager&) = delete; + + ~WriteBufferManager(); + + // Returns true if buffer_limit is passed to limit the total memory usage and + // is greater than 0. + bool enabled() const { return buffer_size() > 0; } + + // Returns true if pointer to cache is passed. + bool cost_to_cache() const { return cache_res_mgr_ != nullptr; } + + // Returns the total memory used by memtables. + // Only valid if enabled() + size_t memory_usage() const { + return memory_used_.load(std::memory_order_relaxed); + } + + // Returns the total memory used by active memtables. + size_t mutable_memtable_memory_usage() const { + return memory_active_.load(std::memory_order_relaxed); + } + + size_t dummy_entries_in_cache_usage() const; + + // Returns the buffer_size. + size_t buffer_size() const { + return buffer_size_.load(std::memory_order_relaxed); + } + + void SetBufferSize(size_t new_size) { + buffer_size_.store(new_size, std::memory_order_relaxed); + mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + // Check if stall is active and can be ended. + MaybeEndWriteStall(); + } + + // Below functions should be called by RocksDB internally. + + // Should only be called from write thread + bool ShouldFlush() const { + if (enabled()) { + if (mutable_memtable_memory_usage() > + mutable_limit_.load(std::memory_order_relaxed)) { + return true; + } + size_t local_size = buffer_size(); + if (memory_usage() >= local_size && + mutable_memtable_memory_usage() >= local_size / 2) { + // If the memory exceeds the buffer size, we trigger more aggressive + // flush. But if already more than half memory is being flushed, + // triggering more flush may not help. We will hold it instead. + return true; + } + } + return false; + } + + // Returns true if total memory usage exceeded buffer_size. + // We stall the writes untill memory_usage drops below buffer_size. When the + // function returns true, all writer threads (including one checking this + // condition) across all DBs will be stalled. Stall is allowed only if user + // pass allow_stall = true during WriteBufferManager instance creation. + // + // Should only be called by RocksDB internally . + bool ShouldStall() const { + if (!allow_stall_ || !enabled()) { + return false; + } + + return IsStallActive() || IsStallThresholdExceeded(); + } + + // Returns true if stall is active. + bool IsStallActive() const { + return stall_active_.load(std::memory_order_relaxed); + } + + // Returns true if stalling condition is met. + bool IsStallThresholdExceeded() const { + return memory_usage() >= buffer_size_; + } + + void ReserveMem(size_t mem); + + // We are in the process of freeing `mem` bytes, so it is not considered + // when checking the soft limit. + void ScheduleFreeMem(size_t mem); + + void FreeMem(size_t mem); + + // Add the DB instance to the queue and block the DB. + // Should only be called by RocksDB internally. + void BeginWriteStall(StallInterface* wbm_stall); + + // If stall conditions have resolved, remove DB instances from queue and + // signal them to continue. + void MaybeEndWriteStall(); + + void RemoveDBFromQueue(StallInterface* wbm_stall); + + private: + std::atomic buffer_size_; + std::atomic mutable_limit_; + std::atomic memory_used_; + // Memory that hasn't been scheduled to free. + std::atomic memory_active_; + std::shared_ptr cache_res_mgr_; + // Protects cache_res_mgr_ + std::mutex cache_res_mgr_mu_; + + std::list queue_; + // Protects the queue_ and stall_active_. + std::mutex mu_; + bool allow_stall_; + // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() + // while holding mu_, but it can be read without a lock. + std::atomic stall_active_; + + void ReserveMemWithCache(size_t mem); + void FreeMemWithCache(size_t mem); +}; +} // namespace ROCKSDB_NAMESPACE -- cgit v1.2.3