Diffstat
-rw-r--r-- src/rocksdb/include/rocksdb/advanced_options.h | 1098
-rw-r--r-- src/rocksdb/include/rocksdb/block_cache_trace_writer.h | 149
-rw-r--r-- src/rocksdb/include/rocksdb/c.h | 2793
-rw-r--r-- src/rocksdb/include/rocksdb/cache.h | 775
-rw-r--r-- src/rocksdb/include/rocksdb/cache_bench_tool.h | 14
-rw-r--r-- src/rocksdb/include/rocksdb/cleanable.h | 128
-rw-r--r-- src/rocksdb/include/rocksdb/compaction_filter.h | 256
-rw-r--r-- src/rocksdb/include/rocksdb/compaction_job_stats.h | 109
-rw-r--r-- src/rocksdb/include/rocksdb/comparator.h | 164
-rw-r--r-- src/rocksdb/include/rocksdb/compression_type.h | 40
-rw-r--r-- src/rocksdb/include/rocksdb/concurrent_task_limiter.h | 51
-rw-r--r-- src/rocksdb/include/rocksdb/configurable.h | 400
-rw-r--r-- src/rocksdb/include/rocksdb/convenience.h | 525
-rw-r--r-- src/rocksdb/include/rocksdb/customizable.h | 233
-rw-r--r-- src/rocksdb/include/rocksdb/data_structure.h | 51
-rw-r--r-- src/rocksdb/include/rocksdb/db.h | 1859
-rw-r--r-- src/rocksdb/include/rocksdb/db_bench_tool.h | 11
-rw-r--r-- src/rocksdb/include/rocksdb/db_dump_tool.h | 45
-rw-r--r-- src/rocksdb/include/rocksdb/db_stress_tool.h | 11
-rw-r--r-- src/rocksdb/include/rocksdb/env.h | 1893
-rw-r--r-- src/rocksdb/include/rocksdb/env_encryption.h | 465
-rw-r--r-- src/rocksdb/include/rocksdb/experimental.h | 56
-rw-r--r-- src/rocksdb/include/rocksdb/file_checksum.h | 146
-rw-r--r-- src/rocksdb/include/rocksdb/file_system.h | 1849
-rw-r--r-- src/rocksdb/include/rocksdb/filter_policy.h | 206
-rw-r--r-- src/rocksdb/include/rocksdb/flush_block_policy.h | 75
-rw-r--r-- src/rocksdb/include/rocksdb/functor_wrapper.h | 56
-rw-r--r-- src/rocksdb/include/rocksdb/io_status.h | 244
-rw-r--r-- src/rocksdb/include/rocksdb/iostats_context.h | 98
-rw-r--r-- src/rocksdb/include/rocksdb/iterator.h | 144
-rw-r--r-- src/rocksdb/include/rocksdb/ldb_tool.h | 44
-rw-r--r-- src/rocksdb/include/rocksdb/listener.h | 847
-rw-r--r-- src/rocksdb/include/rocksdb/memory_allocator.h | 81
-rw-r--r-- src/rocksdb/include/rocksdb/memtablerep.h | 423
-rw-r--r-- src/rocksdb/include/rocksdb/merge_operator.h | 265
-rw-r--r-- src/rocksdb/include/rocksdb/metadata.h | 245
-rw-r--r-- src/rocksdb/include/rocksdb/options.h | 2113
-rw-r--r-- src/rocksdb/include/rocksdb/perf_context.h | 274
-rw-r--r-- src/rocksdb/include/rocksdb/perf_level.h | 36
-rw-r--r-- src/rocksdb/include/rocksdb/persistent_cache.h | 74
-rw-r--r-- src/rocksdb/include/rocksdb/rate_limiter.h | 159
-rw-r--r-- src/rocksdb/include/rocksdb/rocksdb_namespace.h | 16
-rw-r--r-- src/rocksdb/include/rocksdb/secondary_cache.h | 133
-rw-r--r-- src/rocksdb/include/rocksdb/slice.h | 264
-rw-r--r-- src/rocksdb/include/rocksdb/slice_transform.h | 135
-rw-r--r-- src/rocksdb/include/rocksdb/snapshot.h | 53
-rw-r--r-- src/rocksdb/include/rocksdb/sst_dump_tool.h | 19
-rw-r--r-- src/rocksdb/include/rocksdb/sst_file_manager.h | 136
-rw-r--r-- src/rocksdb/include/rocksdb/sst_file_reader.h | 47
-rw-r--r-- src/rocksdb/include/rocksdb/sst_file_writer.h | 174
-rw-r--r-- src/rocksdb/include/rocksdb/sst_partitioner.h | 142
-rw-r--r-- src/rocksdb/include/rocksdb/statistics.h | 707
-rw-r--r-- src/rocksdb/include/rocksdb/stats_history.h | 70
-rw-r--r-- src/rocksdb/include/rocksdb/status.h | 570
-rw-r--r-- src/rocksdb/include/rocksdb/system_clock.h | 116
-rw-r--r-- src/rocksdb/include/rocksdb/table.h | 940
-rw-r--r-- src/rocksdb/include/rocksdb/table_properties.h | 327
-rw-r--r-- src/rocksdb/include/rocksdb/table_reader_caller.h | 41
-rw-r--r-- src/rocksdb/include/rocksdb/thread_status.h | 189
-rw-r--r-- src/rocksdb/include/rocksdb/threadpool.h | 67
-rw-r--r-- src/rocksdb/include/rocksdb/trace_reader_writer.h | 52
-rw-r--r-- src/rocksdb/include/rocksdb/trace_record.h | 248
-rw-r--r-- src/rocksdb/include/rocksdb/trace_record_result.h | 187
-rw-r--r-- src/rocksdb/include/rocksdb/transaction_log.h | 122
-rw-r--r-- src/rocksdb/include/rocksdb/types.h | 66
-rw-r--r-- src/rocksdb/include/rocksdb/unique_id.h | 55
-rw-r--r-- src/rocksdb/include/rocksdb/universal_compaction.h | 96
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/agg_merge.h | 138
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/backup_engine.h | 631
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/cache_dump_load.h | 142
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/checkpoint.h | 66
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/convenience.h | 10
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/customizable_util.h | 377
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/db_ttl.h | 72
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/debug.h | 48
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/env_mirror.h | 181
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/info_log_finder.h | 19
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/ldb_cmd.h | 318
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h | 75
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/leveldb_options.h | 145
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h | 43
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h | 55
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/memory_util.h | 50
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/object_registry.h | 585
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h | 100
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/option_change_migration.h | 24
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/options_type.h | 1221
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/options_util.h | 128
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/replayer.h | 87
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/sim_cache.h | 96
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/stackable_db.h | 566
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h | 90
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/transaction.h | 686
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/transaction_db.h | 508
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h | 91
-rw-r--r-- src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h | 309
-rw-r--r-- src/rocksdb/include/rocksdb/version.h | 43
-rw-r--r-- src/rocksdb/include/rocksdb/wal_filter.h | 111
-rw-r--r-- src/rocksdb/include/rocksdb/wide_columns.h | 171
-rw-r--r-- src/rocksdb/include/rocksdb/write_batch.h | 494
-rw-r--r-- src/rocksdb/include/rocksdb/write_batch_base.h | 144
-rw-r--r-- src/rocksdb/include/rocksdb/write_buffer_manager.h | 176
102 files changed, 31477 insertions, 0 deletions
diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h
new file mode 100644
index 000000000..258cf82a1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/advanced_options.h
@@ -0,0 +1,1098 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+class TablePropertiesCollectorFactory;
+class TableFactory;
+struct Options;
+
+enum CompactionStyle : char {
+ // level based compaction style
+ kCompactionStyleLevel = 0x0,
+ // Universal compaction style
+ // Not supported in ROCKSDB_LITE.
+ kCompactionStyleUniversal = 0x1,
+ // FIFO compaction style
+ // Not supported in ROCKSDB_LITE
+ kCompactionStyleFIFO = 0x2,
+ // Disable background compaction. Compaction jobs are submitted
+ // via CompactFiles().
+ // Not supported in ROCKSDB_LITE
+ kCompactionStyleNone = 0x3,
+};
+
+// In Level-based compaction, this determines which file from a level is
+// picked to merge into the next level. We suggest trying kMinOverlappingRatio
+// first when tuning your database.
+enum CompactionPri : char {
+ // Slightly prioritize larger files by size compensated by #deletes
+ kByCompensatedSize = 0x0,
+ // First compact files whose data's latest update time is oldest.
+ // Try this if you only update some hot keys in small ranges.
+ kOldestLargestSeqFirst = 0x1,
+ // First compact files whose range hasn't been compacted to the next level
+ // for the longest. If your updates are random across the key space,
+ // write amplification is slightly better with this option.
+ kOldestSmallestSeqFirst = 0x2,
+ // First compact files whose ratio between overlapping size in next level
+ // and its size is the smallest. It in many cases can optimize write
+ // amplification.
+ kMinOverlappingRatio = 0x3,
+  // Keeps a cursor (or cursors) pointing to the successor of the file (key
+  // range) that was compacted before, and always picks the next files (key
+  // range) in that level. The file picking process cycles through all the
+  // files in a round-robin manner.
+ kRoundRobin = 0x4,
+};
+
+struct CompactionOptionsFIFO {
+  // Once the total size of the table files reaches this, we will delete the
+  // oldest table file
+ // Default: 1GB
+ uint64_t max_table_files_size;
+
+  // If true, try to do compaction to compact smaller files into larger ones.
+  // The minimum number of files to compact follows
+  // options.level0_file_num_compaction_trigger, and compaction won't trigger
+  // if the average compacted bytes per deleted file would be larger than
+  // options.write_buffer_size. This is to protect large files from being
+  // compacted again.
+  // Default: false
+ bool allow_compaction = false;
+
+ // When not 0, if the data in the file is older than this threshold, RocksDB
+ // will soon move the file to warm temperature.
+ uint64_t age_for_warm = 0;
+
+ CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+ CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
+ : max_table_files_size(_max_table_files_size),
+ allow_compaction(_allow_compaction) {}
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+ // RocksDB's generic default compression level. Internally it'll be translated
+ // to the default compression level specific to the library being used (see
+ // comment above `ColumnFamilyOptions::compression`).
+ //
+ // The default value is the max 16-bit int as it'll be written out in OPTIONS
+ // file, which should be portable.
+ const static int kDefaultCompressionLevel = 32767;
+
+ int window_bits;
+ int level;
+ int strategy;
+
+ // Maximum size of dictionaries used to prime the compression library.
+ // Enabling dictionary can improve compression ratios when there are
+ // repetitions across data blocks.
+ //
+ // The dictionary is created by sampling the SST file data. If
+ // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
+ // dictionary generator (see comments for option `use_zstd_dict_trainer` for
+ // detail on dictionary generator). If `zstd_max_train_bytes` is zero, the
+ // random samples are used directly as the dictionary.
+ //
+ // When compression dictionary is disabled, we compress and write each block
+ // before buffering data for the next one. When compression dictionary is
+ // enabled, we buffer SST file data in-memory so we can sample it, as data
+ // can only be compressed and written after the dictionary has been finalized.
+ //
+ // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This
+ // buffered memory is charged to the block cache when there is a block cache.
+ // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is
+ // full), we finalize the dictionary with whatever data we have and then stop
+ // buffering.
+ //
+ // Default: 0.
+ uint32_t max_dict_bytes;
+
+ // Maximum size of training data passed to zstd's dictionary trainer. Using
+ // zstd's dictionary trainer can achieve even better compression ratio
+ // improvements than using `max_dict_bytes` alone.
+ //
+ // The training data will be used to generate a dictionary of max_dict_bytes.
+ //
+ // Default: 0.
+ uint32_t zstd_max_train_bytes;
+
+ // Number of threads for parallel compression.
+ // Parallel compression is enabled only if threads > 1.
+ // THE FEATURE IS STILL EXPERIMENTAL
+ //
+ // This option is valid only when BlockBasedTable is used.
+ //
+  // When parallel compression is enabled, SST file sizes might be
+  // more inflated compared to the target size, because more data of unknown
+  // compressed size is in flight when compression is parallelized. To be
+  // reasonably accurate, this inflation is also estimated by using historical
+  // compression ratios and current bytes in flight.
+ //
+ // Default: 1.
+ uint32_t parallel_threads;
+
+  // When the compression options are set by the user, this will be set to
+  // "true".
+  // For bottommost_compression_opts, to enable it, the user must set
+  // enabled=true. Otherwise, bottommost compression will use compression_opts
+  // as its default compression options.
+  //
+  // For compression_opts, even if compression_opts.enabled=false, it is still
+  // used as the compression options for the compression process.
+ //
+ // Default: false.
+ bool enabled;
+
+ // Limit on data buffering when gathering samples to build a dictionary. Zero
+ // means no limit. When dictionary is disabled (`max_dict_bytes == 0`),
+ // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect.
+ //
+ // In compaction, the buffering is limited to the target file size (see
+ // `target_file_size_base` and `target_file_size_multiplier`) even if this
+ // setting permits more buffering. Since we cannot determine where the file
+ // should be cut until data blocks are compressed with dictionary, buffering
+ // more than the target file size could lead to selecting samples that belong
+ // to a later output SST.
+ //
+ // Limiting too strictly may harm dictionary effectiveness since it forces
+ // RocksDB to pick samples from the initial portion of the output SST, which
+ // may not be representative of the whole file. Configuring this limit below
+ // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can
+ // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can
+ // restrict the size of the final dictionary.
+ //
+ // Default: 0 (unlimited)
+ uint64_t max_dict_buffer_bytes;
+
+ // Use zstd trainer to generate dictionaries. When this option is set to true,
+ // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes
+ // buffered data will be passed to zstd dictionary trainer to generate a
+ // dictionary of size max_dict_bytes.
+ //
+ // When this option is false, zstd's API ZDICT_finalizeDictionary() will be
+ // called to generate dictionaries. zstd_max_train_bytes of training sampled
+ // data will be passed to this API. Using this API should save CPU time on
+ // dictionary training, but the compression ratio may not be as good as using
+ // a dictionary trainer.
+ //
+ // Default: true
+ bool use_zstd_dict_trainer;
+
+ CompressionOptions()
+ : window_bits(-14),
+ level(kDefaultCompressionLevel),
+ strategy(0),
+ max_dict_bytes(0),
+ zstd_max_train_bytes(0),
+ parallel_threads(1),
+ enabled(false),
+ max_dict_buffer_bytes(0),
+ use_zstd_dict_trainer(true) {}
+ CompressionOptions(int wbits, int _lev, int _strategy,
+ uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes,
+ uint32_t _parallel_threads, bool _enabled,
+ uint64_t _max_dict_buffer_bytes,
+ bool _use_zstd_dict_trainer)
+ : window_bits(wbits),
+ level(_lev),
+ strategy(_strategy),
+ max_dict_bytes(_max_dict_bytes),
+ zstd_max_train_bytes(_zstd_max_train_bytes),
+ parallel_threads(_parallel_threads),
+ enabled(_enabled),
+ max_dict_buffer_bytes(_max_dict_buffer_bytes),
+ use_zstd_dict_trainer(_use_zstd_dict_trainer) {}
+};
+
+// Temperature of a file. Used to pass to FileSystem for a different
+// placement and/or coding.
+// Reserve some numbers in the middle, in case we need to insert new tiers
+// there.
+enum class Temperature : uint8_t {
+ kUnknown = 0,
+ kHot = 0x04,
+ kWarm = 0x08,
+ kCold = 0x0C,
+ kLastTemperature,
+};
+
+// The control option of how the cache tiers will be used. Currently rocksdb
+// supports block cache (volatile tier) and secondary cache (non-volatile
+// tier). In the future, we may add more caching layers.
+enum class CacheTier : uint8_t {
+ kVolatileTier = 0,
+ kNonVolatileBlockTier = 0x01,
+};
+
+enum UpdateStatus {    // Return status for inplace update callback
+ UPDATE_FAILED = 0, // Nothing to update
+ UPDATED_INPLACE = 1, // Value updated inplace
+ UPDATED = 2, // No inplace update. Merged value set
+};
+
+enum class PrepopulateBlobCache : uint8_t {
+ kDisable = 0x0, // Disable prepopulate blob cache
+ kFlushOnly = 0x1, // Prepopulate blobs during flush only
+};
+
+struct AdvancedColumnFamilyOptions {
+ // The maximum number of write buffers that are built up in memory.
+ // The default and the minimum number is 2, so that when 1 write buffer
+ // is being flushed to storage, new writes can continue to the other
+ // write buffer.
+ // If max_write_buffer_number > 3, writing will be slowed down to
+ // options.delayed_write_rate if we are writing to the last write buffer
+ // allowed.
+ //
+ // Default: 2
+ //
+ // Dynamically changeable through SetOptions() API
+ int max_write_buffer_number = 2;
+
+ // The minimum number of write buffers that will be merged together
+ // before writing to storage. If set to 1, then
+ // all write buffers are flushed to L0 as individual files and this increases
+ // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing less
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.
+ // If atomic flush is enabled (options.atomic_flush == true), then this
+ // option will be sanitized to 1.
+ // Default: 1
+ int min_write_buffer_number_to_merge = 1;
+
+ // DEPRECATED
+ // The total maximum number of write buffers to maintain in memory including
+ // copies of buffers that have already been flushed. Unlike
+ // max_write_buffer_number, this parameter does not affect flushing.
+ // This parameter is being replaced by max_write_buffer_size_to_maintain.
+ // If both parameters are set to non-zero values, this parameter will be
+ // ignored.
+ int max_write_buffer_number_to_maintain = 0;
+
+ // The target number of write history bytes to hold in memory. Write history
+ // comprises the latest write buffers (memtables). To reach the target, write
+ // buffers that were most recently flushed to SST files may be retained in
+ // memory.
+ //
+ // This controls the target amount of write history that will be available
+ // in memory for conflict checking when Transactions are used.
+ //
+ // This target may be undershot when the CF first opens and has not recovered
+ // or received enough writes to reach the target. After reaching the target
+ // once, it is guaranteed to never undershoot again. That guarantee is
+ // implemented by retaining flushed write buffers in-memory until the oldest
+ // one can be trimmed without dropping below the target.
+ //
+ // Examples with `max_write_buffer_size_to_maintain` set to 32MB:
+ //
+ // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB,
+ // and zero flushed immutable memtables. Nothing trimmable exists.
+ // - One mutable memtable of 16MB, zero unflushed immutable memtables, and
+ // one flushed immutable memtable of 64MB. Trimming is disallowed because
+ // dropping the earliest (only) flushed immutable memtable would result in
+ // write history of 16MB < 32MB.
+ // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB,
+ // and one flushed immutable memtable of 16MB. The earliest (only) flushed
+ // immutable memtable is trimmed because without it we still have
+ // 16MB + 24MB = 40MB > 32MB of write history.
+ //
+ // When using an OptimisticTransactionDB:
+ // If this value is too low, some transactions may fail at commit time due
+ // to not being able to determine whether there were any write conflicts.
+ //
+ // When using a TransactionDB:
+ // If Transaction::SetSnapshot is used, TransactionDB will read either
+ // in-memory write buffers or SST files to do write-conflict checking.
+ // Increasing this value can reduce the number of reads to SST files
+ // done for conflict detection.
+ //
+ // Setting this value to 0 will cause write buffers to be freed immediately
+ // after they are flushed. If this value is set to -1,
+ // 'max_write_buffer_number * write_buffer_size' will be used.
+ //
+ // Default:
+ // If using a TransactionDB/OptimisticTransactionDB, the default value will
+ // be set to the value of 'max_write_buffer_number * write_buffer_size'
+ // if it is not explicitly set by the user. Otherwise, the default is 0.
+ int64_t max_write_buffer_size_to_maintain = 0;
+
+ // Allows thread-safe inplace updates. If this is true, there is no way to
+ // achieve point-in-time consistency using snapshot or iterator (assuming
+ // concurrent updates). Hence iterator and multi-get will return results
+ // which are not consistent as of any point-in-time.
+ // Backward iteration on memtables will not work either.
+ // If inplace_callback function is not set,
+ // Put(key, new_value) will update inplace the existing_value iff
+ // * key exists in current memtable
+  // * sizeof(new_value) <= sizeof(existing_value)
+ // * existing_value for that key is a put i.e. kTypeValue
+ // If inplace_callback function is set, check doc for inplace_callback.
+ // Default: false.
+ bool inplace_update_support = false;
+
+ // Number of locks used for inplace update
+ // Default: 10000, if inplace_update_support = true, else 0.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t inplace_update_num_locks = 10000;
+
+ // [experimental]
+  // Used to activate or deactivate the Mempurge feature (memtable garbage
+ // collection). (deactivated by default). At every flush, the total useful
+ // payload (total entries minus garbage entries) is estimated as a ratio
+ // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then
+ // compared to this `threshold` value:
+ // - if ratio<threshold: the flush is replaced by a mempurge operation
+ // - else: a regular flush operation takes place.
+ // Threshold values:
+ // 0.0: mempurge deactivated (default).
+ // 1.0: recommended threshold value.
+ // >1.0 : aggressive mempurge.
+ // 0 < threshold < 1.0: mempurge triggered only for very low useful payload
+ // ratios.
+ // [experimental]
+ double experimental_mempurge_threshold = 0.0;
+
+ // existing_value - pointer to previous value (from both memtable and sst).
+ // nullptr if key doesn't exist
+  // existing_value_size - pointer to the size of existing_value.
+ // nullptr if key doesn't exist
+ // delta_value - Delta value to be merged with the existing_value.
+ // Stored in transaction logs.
+ // merged_value - Set when delta is applied on the previous value.
+ //
+ // Applicable only when inplace_update_support is true,
+ // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, let's say Put(key, delta_value). It allows the
+ // 'delta_value' specified as part of the Put operation to be merged with
+ // an 'existing_value' of the key in the database.
+ //
+  // If the merged value is smaller in size than the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value_size' pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // (In this case, the snapshot-semantics of the rocksdb
+  // Iterator is no longer atomic.)
+ //
+ // If the merged value is larger in size than the 'existing_value' or the
+ // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merged_value. It is set by
+ // merging the 'existing_value' and the Put 'delta_value'. The callback should
+ // return UpdateStatus::UPDATED in this case. This merged value will be added
+ // to the memtable.
+ //
+ // If merging fails or the application does not wish to take any action,
+ // then the callback should return UpdateStatus::UPDATE_FAILED.
+ //
+ // Please remember that the original call from the application is Put(key,
+ // delta_value). So the transaction log (if enabled) will still contain (key,
+ // delta_value). The 'merged_value' is not stored in the transaction log.
+ // Hence the inplace_callback function should be consistent across db reopens.
+ //
+ // RocksDB callbacks are NOT exception-safe. A callback completing with an
+ // exception can lead to undefined behavior in RocksDB, including data loss,
+ // unreported corruption, deadlocks, and more.
+ //
+ // Default: nullptr
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value) = nullptr;
+
+ // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+ // Bloom filter in memtable to optimize many queries that must go beyond
+ // the memtable. The size in bytes of the filter is
+ // write_buffer_size * memtable_prefix_bloom_size_ratio.
+ // * If prefix_extractor is set, the filter includes prefixes.
+ // * If memtable_whole_key_filtering, the filter includes whole keys.
+ // * If both, the filter includes both.
+ // * If neither, the feature is disabled.
+ //
+ // If this value is larger than 0.25, it is sanitized to 0.25.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ double memtable_prefix_bloom_size_ratio = 0.0;
+
+ // Enable whole key bloom filter in memtable. Note this will only take effect
+ // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+  // can potentially reduce CPU usage for point lookups.
+ //
+ // Default: false (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ bool memtable_whole_key_filtering = false;
+
+  // Page size for huge pages for the arena used by the memtable. If <= 0, it
+  // won't allocate from huge pages but from malloc.
+  // Users are responsible for reserving huge pages for it to be allocated.
+  // For example:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+  // If there aren't enough free huge pages available, it will fall back to
+  // malloc.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t memtable_huge_page_size = 0;
+
+ // If non-nullptr, memtable will use the specified function to extract
+ // prefixes for keys, and for each prefix maintain a hint of insert location
+  // to reduce CPU usage for inserting keys with the prefix. Keys outside the
+  // domain of the prefix extractor will be inserted without using hints.
+ //
+ // Currently only the default skiplist based memtable implements the feature.
+  // All other memtable implementations will ignore the option. It incurs ~250
+ // additional bytes of memory overhead to store a hint for each prefix.
+ // Also concurrent writes (when allow_concurrent_memtable_write is true) will
+ // ignore the option.
+ //
+  // The option is best suited for workloads where keys are likely to be
+  // inserted at a location close to the last inserted key with the same
+  // prefix. One example could be inserting keys of the form
+  // (prefix + timestamp), where keys of the same prefix always come in time
+  // order. Another example would be updating the same key over and over
+  // again, in which case the prefix can be the key itself.
+ //
+ // Default: nullptr (disabled)
+ std::shared_ptr<const SliceTransform>
+ memtable_insert_with_hint_prefix_extractor = nullptr;
+
+ // Control locality of bloom filter probes to improve CPU cache hit rate.
+ // This option now only applies to plaintable prefix bloom. This
+  // optimization is turned off when set to 0; set it to a positive number to
+  // turn it on.
+ // Default: 0
+ uint32_t bloom_locality = 0;
+
+  // Size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/8 of
+  // write_buffer_size, rounded up to a multiple of 4KB, or 1MB, whichever is
+  // smaller).
+  //
+  // There are two additional restrictions on the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) it should be a multiple of the CPU word size (which helps with
+  //     memory alignment).
+ //
+ // We'll automatically check and adjust the size number to make sure it
+ // conforms to the restrictions.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t arena_block_size = 0;
+
+ // Different levels can have different compression policies. There
+ // are cases where most lower levels would like to use quick compression
+ // algorithms while the higher levels (which have more data) use
+ // compression algorithms that have better compression but could
+ // be slower. This array, if non-empty, should have an entry for
+ // each level of the database; these override the value specified in
+ // the previous field 'compression'.
+ //
+  // NOTICE: if level_compaction_dynamic_level_bytes=true,
+  // compression_per_level[0] still determines L0, but the other elements
+  // of the array are based on the base level (the level L0 files are merged
+  // to), and may not match the level users see from the info log for metadata.
+  // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
+  // determines the compression type for level n+i-1.
+  // For example, if we have 5 levels and decide to merge L0
+  // data to L4 (which means L1..L3 will be empty), then the new files going to
+  // L4 use compression type compression_per_level[1].
+  // If L0 is instead merged to L2, data going to L2 will be compressed
+  // according to compression_per_level[1], L3 using compression_per_level[2]
+  // and L4 using compression_per_level[3]. The compression for each level can
+  // change as data grows.
+ //
+  // NOTE: if the vector size is smaller than the number of levels, the
+  // unspecified lower levels use the last option in the vector. For example,
+  // for a 3-level LSM tree the following settings are the same:
+ // {kNoCompression, kSnappyCompression}
+ // {kNoCompression, kSnappyCompression, kSnappyCompression}
+ //
+ // Dynamically changeable through SetOptions() API
+ std::vector<CompressionType> compression_per_level;
+
+ // Number of levels for this database
+ int num_levels = 7;
+
+  // Soft limit on the number of level-0 files. We start slowing down writes
+  // at this point. A value < 0 means that no write slowdown will be triggered
+  // by the number of files in level-0.
+ //
+ // Default: 20
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_slowdown_writes_trigger = 20;
+
+ // Maximum number of level-0 files. We stop writes at this point.
+ //
+ // Default: 36
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_stop_writes_trigger = 36;
+
+ // Target file size for compaction.
+ // target_file_size_base is per-file size for level-1.
+ // Target file size for level L can be calculated by
+ // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+ // For example, if target_file_size_base is 2MB and
+ // target_file_size_multiplier is 10, then each file on level-1 will
+ // be 2MB, and each file on level 2 will be 20MB,
+ // and each file on level-3 will be 200MB.
+ //
+ // Default: 64MB.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t target_file_size_base = 64 * 1048576;
+
+ // By default target_file_size_multiplier is 1, which means
+ // by default files in different levels will have similar size.
+ //
+ // Dynamically changeable through SetOptions() API
+ int target_file_size_multiplier = 1;
+
+ // If true, RocksDB will pick target size of each level dynamically.
+ // We will pick a base level b >= 1. L0 will be directly merged into level b,
+  // instead of always into level 1. Levels 1 to b-1 need to be empty.
+ // We try to pick b and its target size so that
+ // 1. target size is in the range of
+ // (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+ // max_bytes_for_level_base]
+  // 2. target size of the last level (level num_levels-1) equals the extra size
+ // of the level.
+ // At the same time max_bytes_for_level_multiplier and
+ // max_bytes_for_level_multiplier_additional are still satisfied.
+ // (When L0 is too large, we make some adjustment. See below.)
+ //
+  // With this option on, from an empty DB, we make the last level the base
+  // level, which means merging L0 data into the last level, until it exceeds
+  // max_bytes_for_level_base. Then we make the second-to-last level the base
+  // level, start merging L0 data into the second-to-last level, and set its
+  // target size to 1/max_bytes_for_level_multiplier of the last level's
+  // extra size. As the data accumulates further, we move the base level to
+  // the third-to-last level, and so on.
+ //
+ // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+ // and max_bytes_for_level_base=10MB.
+  // Target sizes of levels 1 to 5 start as:
+  // [- - - - 10MB]
+  // with the base level being level 5. Target sizes of levels 1 to 4 are not
+  // applicable because they will not be used.
+  // Once the size of level 5 grows to more than 10MB, say 11MB, we make
+  // level 4 the base level and now the targets look like:
+  // [- - - 1.1MB 11MB]
+  // As data accumulates, size targets are tuned based on the actual data
+  // in level 5. When level 5 has 50MB of data, the target is like:
+ // [- - - 5MB 50MB]
+  // This continues until level 5's actual size is more than 100MB, say 101MB.
+  // Now if we kept level 4 as the base level, its target size would need to
+  // be 10.1MB, which doesn't satisfy the target size range. So now we make
+  // level 3 the base level and the target sizes of the levels look like:
+ // [- - 1.01MB 10.1MB 101MB]
+ // In the same way, while level 5 further grows, all levels' targets grow,
+ // like
+ // [- - 5MB 50MB 500MB]
+  // Once level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+ // base level and make levels' target sizes like this:
+ // [- 1.001MB 10.01MB 100.1MB 1001MB]
+ // and go on...
+ //
+  // By doing this, we give max_bytes_for_level_multiplier priority over
+  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+  // useful to limit worst-case space amplification.
+ //
+ //
+  // If compaction from L0 lags behind, a special mode will be turned on to
+  // prioritize write amplification over max_bytes_for_level_multiplier or
+  // max_bytes_for_level_base. Whether L0 compaction lags behind is determined
+  // by looking at the number of L0 files and the total L0 size. If the number
+  // of L0 files is at least double level0_file_num_compaction_trigger, or the
+  // total size is at least max_bytes_for_level_base, this mode is on. The
+  // target of L1 grows to the actual data size in L0, and the targets of the
+  // other levels are then determined so that each level has the same level
+  // multiplier.
+ //
+ // For example, when L0 size is 100MB, the size of last level is 1600MB,
+ // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
+  // Since the L0 size is larger than max_bytes_for_level_base, this is the L0
+  // compaction backlogged mode, so the L1 size is determined to be 100MB.
+  // Based on max_bytes_for_level_multiplier = 10, at least 3 non-L0 levels
+  // will be needed. The level multiplier will be calculated to be 4 and the
+  // three levels' targets will be [100MB, 400MB, 1600MB].
+ //
+  // In this mode, the number of levels will be no more than in the normal
+  // mode, and the level multiplier will be lower. Write amplification will
+  // likely be reduced.
+ //
+ //
+ // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+ //
+ // Turning this feature on or off for an existing DB can cause unexpected
+ // LSM tree structure so it's not recommended.
+ //
+ // Default: false
+ bool level_compaction_dynamic_level_bytes = false;
+
+  // Allows RocksDB to generate files that are not exactly the
+  // target_file_size, for non-bottommost files only. This can reduce the
+  // write amplification from compaction. The file size could be from 0 to
+  // 2x target_file_size. Once enabled, non-bottommost compaction will try to
+  // cut files aligned with the next level's file boundaries (grandparent
+  // level).
+ //
+ // Default: true
+ bool level_compaction_dynamic_file_size = true;
+
+ // Default: 10.
+ //
+ // Dynamically changeable through SetOptions() API
+ double max_bytes_for_level_multiplier = 10;
+
+ // Different max-size multipliers for different levels.
+ // These are multiplied by max_bytes_for_level_multiplier to arrive
+ // at the max-size of each level.
+ //
+ // Default: 1
+ //
+ // Dynamically changeable through SetOptions() API
+ std::vector<int> max_bytes_for_level_multiplier_additional =
+ std::vector<int>(num_levels, 1);
+
+  // We try to limit the number of bytes in one compaction to below this
+  // threshold, but it's not guaranteed.
+  // A value of 0 will be sanitized to the default.
+ //
+ // Default: target_file_size_base * 25
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_compaction_bytes = 0;
+
+ // When setting up compaction input files, we ignore the
+ // `max_compaction_bytes` limit when pulling in input files that are entirely
+  // within the output key range.
+ //
+ // Default: true
+ //
+ // Dynamically changeable through SetOptions() API
+ // We could remove this knob and always ignore the limit once it is proven
+ // safe.
+ bool ignore_max_compaction_bytes_for_input = true;
+
+  // All writes will be slowed down to at least delayed_write_rate if the
+  // estimated bytes pending compaction exceed this threshold.
+ //
+ // Default: 64GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
+
+  // All writes are stopped if the estimated bytes pending compaction exceed
+  // this threshold.
+ //
+ // Default: 256GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
+
+ // The compaction style. Default: kCompactionStyleLevel
+ CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // If compaction_style = kCompactionStyleLevel, this determines, for each
+  // level, which files are prioritized to be picked for compaction.
+ // Default: kMinOverlappingRatio
+ CompactionPri compaction_pri = kMinOverlappingRatio;
+
+ // The options needed to support Universal Style compactions
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_universal", "{size_ratio=2;}")
+ CompactionOptionsUniversal compaction_options_universal;
+
+ // The options for FIFO compaction style
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
+ CompactionOptionsFIFO compaction_options_fifo;
+
+  // An iterator's Next() sequentially skips over keys with the same
+ // user-key unless this option is set. This number specifies the number
+ // of keys (with the same userkey) that will be sequentially
+ // skipped before a reseek is issued.
+ //
+ // Default: 8
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_sequential_skip_in_iterations = 8;
+
+ // This is a factory that provides MemTableRep objects.
+ // Default: a factory that provides a skip-list-based implementation of
+ // MemTableRep.
+ std::shared_ptr<MemTableRepFactory> memtable_factory =
+ std::shared_ptr<SkipListFactory>(new SkipListFactory);
+
+ // Block-based table related options are moved to BlockBasedTableOptions.
+ // Related options that were originally here but now moved include:
+ // no_block_cache
+ // block_cache
+ // block_cache_compressed
+ // block_size
+ // block_size_deviation
+ // block_restart_interval
+ // filter_policy
+ // whole_key_filtering
+ // If you'd like to customize some of these options, you will need to
+ // use NewBlockBasedTableFactory() to construct a new table factory.
+
+  // This option allows the user to collect their own statistics of interest
+  // about the tables.
+ // Default: empty vector -- no user-defined statistics collection will be
+ // performed.
+ using TablePropertiesCollectorFactories =
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+ TablePropertiesCollectorFactories table_properties_collector_factories;
+
+ // Maximum number of successive merge operations on a key in the memtable.
+ //
+ // When a merge operation is added to the memtable and the maximum number of
+ // successive merges is reached, the value of the key will be calculated and
+ // inserted into the memtable instead of the merge operation. This will
+ // ensure that there are never more than max_successive_merges merge
+ // operations in the memtable.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t max_successive_merges = 0;
+
+ // This flag specifies that the implementation should optimize the filters
+ // mainly for cases where keys are found rather than also optimize for keys
+ // missed. This would be used in cases where the application knows that
+ // there are very few misses or the performance in the case of misses is not
+ // important.
+ //
+  // For now, this flag allows us to not store filters for the last level, i.e.
+  // the largest level which contains data of the LSM store. For keys which
+  // are hits, the filters in this level are not useful because we will search
+  // for the data anyway. NOTE: the filters in other levels are still useful
+  // even for key hits because they tell us whether to look in that level or
+  // go to the higher level.
+ //
+ // Default: false
+ bool optimize_filters_for_hits = false;
+
+ // During flush or compaction, check whether keys inserted to output files
+ // are in order.
+ //
+ // Default: true
+ //
+ // Dynamically changeable through SetOptions() API
+ bool check_flush_compaction_key_order = true;
+
+ // After writing every SST file, reopen it and read all the keys.
+ // Checks the hash of all of the keys and values written versus the
+  // keys in the file and signals a corruption if they do not match.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool paranoid_file_checks = false;
+
+ // In debug mode, RocksDB runs consistency checks on the LSM every time the
+ // LSM changes (Flush, Compaction, AddFile). When this option is true, these
+ // checks are also enabled in release mode. These checks were historically
+ // disabled in release mode, but are now enabled by default for proactive
+ // corruption detection. The CPU overhead is negligible for normal mixed
+ // operations but can slow down saturated writing. See
+ // Options::DisableExtraChecks().
+ // Default: true
+ bool force_consistency_checks = true;
+
+ // Measure IO stats in compactions and flushes, if true.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool report_bg_io_stats = false;
+
+ // Files containing updates older than TTL will go through the compaction
+ // process. This usually happens in a cascading way so that those entries
+ // will be compacted to bottommost level/file.
+  // The feature is used to remove from the file system stale entries that
+  // have been deleted or updated.
+ // Pre-req: This needs max_open_files to be set to -1.
+ // In Level: Non-bottom-level files older than TTL will go through the
+ // compaction process.
+ // In FIFO: Files older than TTL will be deleted.
+ // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+ // In FIFO, this option will have the same meaning as
+  // periodic_compaction_seconds. Whichever is stricter will be used.
+  // 0 means disabled.
+  // UINT64_MAX - 1 (0xfffffffffffffffe) is a special flag to allow RocksDB to
+  // pick the default.
+  //
+  // Default: 30 days for leveled compaction + block-based table; disabled
+  // otherwise.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t ttl = 0xfffffffffffffffe;
+
+ // Files older than this value will be picked up for compaction, and
+ // re-written to the same level as they were before.
+ // One main use of the feature is to make sure a file goes through compaction
+  // filters periodically. Users can also use the feature to clean up SST
+  // files using an old format.
+ //
+ // A file's age is computed by looking at file_creation_time or creation_time
+ // table properties in order, if they have valid non-zero values; if not, the
+ // age is based on the file's last modified time (given by the underlying
+ // Env).
+ //
+ // Supported in Level and FIFO compaction.
+  // In FIFO compaction, this option has the same meaning as TTL and whichever
+  // is stricter will be used.
+  // Pre-req: max_open_files == -1.
+ // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+ //
+ // Values:
+  // 0: Turn off periodic compactions.
+  // UINT64_MAX - 1 (i.e. 0xfffffffffffffffe): Let RocksDB control this feature
+  // as needed. For now, RocksDB will change this value to 30 days
+  // (i.e. 30 * 24 * 60 * 60) so that every file goes through the compaction
+  // process at least once every 30 days if not compacted sooner.
+  // In FIFO compaction, since the option has the same meaning as ttl,
+  // when this value is left at its default and ttl is left at 0, 30 days will
+  // be used. Otherwise, min(ttl, periodic_compaction_seconds) will be used.
+ //
+ // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
+
+  // If this option is set to a nonzero value N, then 1 in N blocks is
+  // compressed using both a fast (lz4) and a slow (zstd) compression
+  // algorithm. The compressibility is reported as stats and the stored
+  // data is left uncompressed (unless compression is also requested).
+ uint64_t sample_for_compression = 0;
+
+ // EXPERIMENTAL
+ // The feature is still in development and is incomplete.
+  // If this option is set, when creating the last level files, pass this
+  // temperature to the FileSystem used. This should be a no-op for the
+  // default FileSystem; users need to plug in their own FileSystem to take
+  // advantage of it.
+  //
+  // Note: the feature has changed from `bottommost_temperature` to
+  // `last_level_temperature`, which now only applies to the last level's
+  // files. The option name `bottommost_temperature` is kept only for
+  // migration; the behavior is the same as `last_level_temperature`. Please
+  // stop using `bottommost_temperature`, as it will be removed in the next
+  // release.
+ //
+ // Dynamically changeable through the SetOptions() API
+ Temperature bottommost_temperature = Temperature::kUnknown;
+ Temperature last_level_temperature = Temperature::kUnknown;
+
+ // EXPERIMENTAL
+ // The feature is still in development and is incomplete.
+  // If this option is set, data whose insert time is within this time range
+  // will be precluded from the last level.
+  // 0 means no key will be precluded from the last level.
+ //
+  // Note: when enabled, universal size amplification (controlled by option
+  // `compaction_options_universal.max_size_amplification_percent`) will be
+  // calculated excluding the last level. As the feature is designed for
+  // tiered storage, where the last level is typically a cold tier that is
+  // unlikely to be size constrained, size amplification applies only to the
+  // non-last levels.
+ //
+ // Default: 0 (disable the feature)
+ //
+  // Not dynamically changeable; changing it requires a DB restart.
+ uint64_t preclude_last_level_data_seconds = 0;
+
+ // EXPERIMENTAL
+ // If this option is set, it will preserve the internal time information about
+ // the data until it's older than the specified time here.
+ // Internally the time information is a map between sequence number and time,
+ // which is the same as `preclude_last_level_data_seconds`. But it won't
+ // preclude the data from the last level and the data in the last level won't
+ // have the sequence number zeroed out.
+  // Internally, rocksdb samples sequence-number-to-time pairs and stores them
+  // in the SST property "rocksdb.seqno.time.map". The information is currently
+ // only used for tiered storage compaction (option
+ // `preclude_last_level_data_seconds`).
+ //
+  // Note: if both `preclude_last_level_data_seconds` and this option are set,
+  // the max of the two values will be preserved, and compaction will still
+  // preclude data based on `preclude_last_level_data_seconds`.
+  // The higher the preserve_time is, the lower the sampling frequency will be
+  // (which means less accurate time estimation).
+ //
+ // Default: 0 (disable the feature)
+ //
+  // Not dynamically changeable; changing it requires a DB restart.
+ uint64_t preserve_internal_time_seconds = 0;
+
+ // When set, large values (blobs) are written to separate blob files, and
+ // only pointers to them are stored in SST files. This can reduce write
+ // amplification for large-value use cases at the cost of introducing a level
+ // of indirection for reads. See also the options min_blob_size,
+ // blob_file_size, blob_compression_type, enable_blob_garbage_collection,
+ // blob_garbage_collection_age_cutoff,
+ // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size
+ // below.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through the SetOptions() API
+ bool enable_blob_files = false;
+
+ // The size of the smallest value to be stored separately in a blob file.
+ // Values which have an uncompressed size smaller than this threshold are
+ // stored alongside the keys in SST files in the usual fashion. A value of
+ // zero for this option means that all values are stored in blob files. Note
+ // that enable_blob_files has to be set in order for this option to have any
+ // effect.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through the SetOptions() API
+ uint64_t min_blob_size = 0;
+
+ // The size limit for blob files. When writing blob files, a new file is
+ // opened once this limit is reached. Note that enable_blob_files has to be
+ // set in order for this option to have any effect.
+ //
+ // Default: 256 MB
+ //
+ // Dynamically changeable through the SetOptions() API
+ uint64_t blob_file_size = 1ULL << 28;
+
+ // The compression algorithm to use for large values stored in blob files.
+ // Note that enable_blob_files has to be set in order for this option to have
+ // any effect.
+ //
+ // Default: no compression
+ //
+ // Dynamically changeable through the SetOptions() API
+ CompressionType blob_compression_type = kNoCompression;
+
+ // Enables garbage collection of blobs. Blob GC is performed as part of
+ // compaction. Valid blobs residing in blob files older than a cutoff get
+ // relocated to new files as they are encountered during compaction, which
+ // makes it possible to clean up blob files once they contain nothing but
+ // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and
+ // blob_garbage_collection_force_threshold below.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through the SetOptions() API
+ bool enable_blob_garbage_collection = false;
+
+ // The cutoff in terms of blob file age for garbage collection. Blobs in
+ // the oldest N blob files will be relocated when encountered during
+  // compaction, where N = blob_garbage_collection_age_cutoff * number_of_blob_files.
+ // Note that enable_blob_garbage_collection has to be set in order for this
+ // option to have any effect.
+ //
+ // Default: 0.25
+ //
+ // Dynamically changeable through the SetOptions() API
+ double blob_garbage_collection_age_cutoff = 0.25;
+
+ // If the ratio of garbage in the oldest blob files exceeds this threshold,
+ // targeted compactions are scheduled in order to force garbage collecting
+ // the blob files in question, assuming they are all eligible based on the
+ // value of blob_garbage_collection_age_cutoff above. This option is
+ // currently only supported with leveled compactions.
+ // Note that enable_blob_garbage_collection has to be set in order for this
+ // option to have any effect.
+ //
+ // Default: 1.0
+ //
+ // Dynamically changeable through the SetOptions() API
+ double blob_garbage_collection_force_threshold = 1.0;
+
+ // Compaction readahead for blob files.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through the SetOptions() API
+ uint64_t blob_compaction_readahead_size = 0;
+
+ // Enable blob files starting from a certain LSM tree level.
+ //
+ // For certain use cases that have a mix of short-lived and long-lived values,
+ // it might make sense to support extracting large values only during
+ // compactions whose output level is greater than or equal to a specified LSM
+ // tree level (e.g. compactions into L1/L2/... or above). This could reduce
+ // the space amplification caused by large values that are turned into garbage
+ // shortly after being written at the price of some write amplification
+ // incurred by long-lived values whose extraction to blob files is delayed.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through the SetOptions() API
+ int blob_file_starting_level = 0;
+
+ // The Cache object to use for blobs. Using a dedicated object for blobs and
+ // using the same object for the block and blob caches are both supported. In
+ // the latter case, note that blobs are less valuable from a caching
+ // perspective than SST blocks, and some cache implementations have
+ // configuration options that can be used to prioritize items accordingly (see
+ // Cache::Priority and LRUCacheOptions::{high,low}_pri_pool_ratio).
+ //
+ // Default: nullptr (disabled)
+ std::shared_ptr<Cache> blob_cache = nullptr;
+
+ // Enable/disable prepopulating the blob cache. When set to kFlushOnly, BlobDB
+ // will insert newly written blobs into the blob cache during flush. This can
+ // improve performance when reading back these blobs would otherwise be
+ // expensive (e.g. when using direct I/O or remote storage), or when the
+ // workload has a high temporal locality.
+ //
+ // Default: disabled
+ //
+ // Dynamically changeable through the SetOptions() API
+ PrepopulateBlobCache prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
+
+ // Enable memtable per key-value checksum protection.
+ //
+  // Each entry in the memtable will be suffixed by a per key-value checksum.
+  // This option determines the size of such checksums.
+ //
+ // It is suggested to turn on write batch per key-value
+ // checksum protection together with this option, so that the checksum
+  // computation is done outside of writer threads (the memtable kv checksum
+  // can be computed from the write batch checksum). See
+  // WriteOptions::protection_bytes_per_key for more detail.
+ //
+ // Default: 0 (no protection)
+ // Supported values: 0, 1, 2, 4, 8.
+ uint32_t memtable_protection_bytes_per_key = 0;
+
+  // Create AdvancedColumnFamilyOptions with default values for all fields
+  AdvancedColumnFamilyOptions();
+  // Create AdvancedColumnFamilyOptions from Options
+ explicit AdvancedColumnFamilyOptions(const Options& options);
+
+ // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
+};
+
+} // namespace ROCKSDB_NAMESPACE
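Many of the options in advanced_options.h above are documented as dynamically changeable through the SetOptions() API. A minimal sketch of both configuration styles follows, assuming a hypothetical DB path; the struct fields and the option-string form (e.g. "{max_table_files_size=...;}") come from the comments in the header above:

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Struct-style configuration at open time, using fields documented above.
  options.compaction_style = rocksdb::kCompactionStyleFIFO;
  options.compaction_options_fifo.max_table_files_size = 1ULL << 30;  // 1GB
  options.compaction_options_fifo.allow_compaction = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/fifo_db", &db);
  assert(s.ok());

  // Dynamic reconfiguration of a changeable option on a live DB, using the
  // string form shown in the header comments.
  s = db->SetOptions(
      {{"compaction_options_fifo", "{max_table_files_size=536870912;}"}});
  assert(s.ok());

  delete db;
  return 0;
}

Options marked "Not dynamically changeable" above (e.g. preclude_last_level_data_seconds) can only be changed by reopening the DB with a new Options struct.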
diff --git a/src/rocksdb/include/rocksdb/block_cache_trace_writer.h b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
new file mode 100644
index 000000000..18d28685b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2022, Meta Platforms, Inc. and affiliates. All rights
+// reserved. This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_reader_caller.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A record for block cache lookups/inserts. This is passed by the table
+// reader to the BlockCacheTraceWriter for every block cache op.
+struct BlockCacheTraceRecord {
+ // Required fields for all accesses.
+ uint64_t access_timestamp = 0;
+
+ // Info related to the block being looked up or inserted
+ //
+ // 1. The cache key for the block
+ std::string block_key;
+
+ // 2. The type of block
+ TraceType block_type = TraceType::kTraceMax;
+
+ // 3. Size of the block
+ uint64_t block_size = 0;
+
+ // Info about the SST file the block is in
+ //
+ // 1. Column family ID
+ uint64_t cf_id = 0;
+
+ // 2. Column family name
+ std::string cf_name;
+
+ // 3. LSM level of the file
+ uint32_t level = 0;
+
+ // 4. SST file number
+ uint64_t sst_fd_number = 0;
+
+ // Info about the calling context
+ //
+ // 1. The higher level request triggering the block cache request
+ TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
+
+ // 2. Cache lookup hit/miss. Not relevant for inserts
+ bool is_cache_hit = false;
+
+ // 3. Whether this request is a lookup only (i.e., no insert on a miss)
+ bool no_insert = false;
+
+ // Get/MultiGet specific info
+ //
+ // 1. A unique ID for Get/MultiGet
+ uint64_t get_id = kReservedGetId;
+
+ // 2. Whether the Get/MultiGet is from a user-specified snapshot
+ bool get_from_user_specified_snapshot = false;
+
+ // 3. The target user key in the block
+ std::string referenced_key;
+
+ // Required fields for data block accesses from user Get/MultiGet only.
+ //
+ // 1. Size of the useful data in the block
+ uint64_t referenced_data_size = 0;
+
+ // 2. Only for MultiGet, number of keys from the batch found in the block
+ uint64_t num_keys_in_block = 0;
+
+ // 3. Whether the key was found in the block or not (false indicates a
+ // false positive)
+ bool referenced_key_exist_in_block = false;
+
+ static const uint64_t kReservedGetId;
+
+ BlockCacheTraceRecord() {}
+
+ BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
+ TraceType _block_type, uint64_t _block_size,
+ uint64_t _cf_id, std::string _cf_name, uint32_t _level,
+ uint64_t _sst_fd_number, TableReaderCaller _caller,
+ bool _is_cache_hit, bool _no_insert, uint64_t _get_id,
+ bool _get_from_user_specified_snapshot = false,
+ std::string _referenced_key = "",
+ uint64_t _referenced_data_size = 0,
+ uint64_t _num_keys_in_block = 0,
+ bool _referenced_key_exist_in_block = false)
+ : access_timestamp(_access_timestamp),
+ block_key(_block_key),
+ block_type(_block_type),
+ block_size(_block_size),
+ cf_id(_cf_id),
+ cf_name(_cf_name),
+ level(_level),
+ sst_fd_number(_sst_fd_number),
+ caller(_caller),
+ is_cache_hit(_is_cache_hit),
+ no_insert(_no_insert),
+ get_id(_get_id),
+ get_from_user_specified_snapshot(_get_from_user_specified_snapshot),
+ referenced_key(_referenced_key),
+ referenced_data_size(_referenced_data_size),
+ num_keys_in_block(_num_keys_in_block),
+ referenced_key_exist_in_block(_referenced_key_exist_in_block) {}
+};
+
+// Options for tracing block cache accesses
+struct BlockCacheTraceOptions {
+ // Trace sampling frequency, i.e. capture one out of every
+ // sampling_frequency requests. Defaults to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+};
+
+// Options for the built-in implementation of BlockCacheTraceWriter
+struct BlockCacheTraceWriterOptions {
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+};
+
+// BlockCacheTraceWriter is an abstract class that captures all RocksDB block
+// cache accesses. Every block cache access is passed to WriteBlockAccess()
+// as a BlockCacheTraceRecord.
+class BlockCacheTraceWriter {
+ public:
+ virtual ~BlockCacheTraceWriter() {}
+
+ // Pass Slice references to avoid copies.
+ virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+ const Slice& block_key, const Slice& cf_name,
+ const Slice& referenced_key) = 0;
+
+ // Write a trace header at the beginning, typically on initiating a trace,
+ // with some metadata like a magic number and RocksDB version.
+ virtual Status WriteHeader() = 0;
+};
+
+// Allocates an instance of the built-in BlockCacheTraceWriter implementation
+// that traces all block cache accesses to a user-provided TraceWriter. Each
+// access is traced to a file with a timestamp and type, followed by the
+// payload.
+std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
+ SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer);
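+
+// A minimal sketch of wiring a trace writer together, assuming
+// NewFileTraceWriter() from trace_reader_writer.h and Env::GetSystemClock():
+//
+//   std::unique_ptr<TraceWriter> file_writer;
+//   Status s = NewFileTraceWriter(env, EnvOptions(), "/tmp/block_cache_trace",
+//                                 &file_writer);
+//   if (s.ok()) {
+//     std::unique_ptr<BlockCacheTraceWriter> writer = NewBlockCacheTraceWriter(
+//         env->GetSystemClock().get(), BlockCacheTraceWriterOptions(),
+//         std::move(file_writer));
+//     s = writer->WriteHeader();  // emit the metadata header first
+//   }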
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
new file mode 100644
index 000000000..1639f3cd3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -0,0 +1,2793 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+ C bindings for rocksdb. May be useful as a stable ABI that can be
+ used by programs that keep rocksdb in a shared library, or for
+ a JNI API.
+
+ Does not support:
+ . getters for the option types
+ . custom comparators that implement key shortening
+ . capturing post-write-snapshot
+ . custom iter, db, env, cache implementations using just the C bindings
+
+ Some conventions:
+
+ (1) We expose just opaque struct pointers and functions to clients.
+ This allows us to change internal representations without having to
+ recompile clients.
+
+ (2) For simplicity, there is no equivalent to the Slice type. Instead,
+ the caller has to pass the pointer and length as separate
+ arguments.
+
+ (3) Errors are represented by a null-terminated C string. NULL
+ means no error. All operations that can raise an error are passed
+ a "char** errptr" as the last argument. One of the following must
+ be true on entry:
+ *errptr == NULL
+ *errptr points to a malloc()ed null-terminated error message
+ On success, a rocksdb routine leaves *errptr unchanged.
+ On failure, rocksdb frees the old value of *errptr and
+ sets *errptr to a malloc()ed error message.
+
+ (4) Bools have the type unsigned char (0 == false; rest == true)
+
+ (5) All of the pointer arguments must be non-NULL.
+*/
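+
+/* A minimal sketch of the errptr convention from (3) above:
+
+     char* err = NULL;
+     rocksdb_t* db = rocksdb_open(options, "/tmp/testdb", &err);
+     if (err != NULL) {
+       fprintf(stderr, "open failed: %s\n", err);
+       free(err);   // error strings are malloc()ed; the caller frees them
+       err = NULL;  // reset before reusing errptr in the next call
+     }
+*/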
+
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
+typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
+typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+ rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+ rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t
+ rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t;
+typedef struct rocksdb_block_based_table_options_t
+ rocksdb_block_based_table_options_t;
+typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t
+ rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
+typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
+typedef struct rocksdb_column_family_metadata_t
+ rocksdb_column_family_metadata_t;
+typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t;
+typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t;
+typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
+typedef struct rocksdb_ingestexternalfileoptions_t
+ rocksdb_ingestexternalfileoptions_t;
+typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t;
+typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t;
+typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t;
+typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t;
+typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t;
+typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t;
+typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t;
+typedef struct rocksdb_optimistictransactiondb_t
+ rocksdb_optimistictransactiondb_t;
+typedef struct rocksdb_optimistictransaction_options_t
+ rocksdb_optimistictransaction_options_t;
+typedef struct rocksdb_transaction_t rocksdb_transaction_t;
+typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t;
+typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t;
+typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t;
+typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t;
+typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t;
+
+/* DB operations */
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl(
+ const rocksdb_options_t* options, const char* name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only(
+ const rocksdb_options_t* options, const char* name,
+ unsigned char error_if_wal_file_exists, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary(
+ const rocksdb_options_t* options, const char* name,
+ const char* secondary_path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t*
+rocksdb_backup_engine_open_opts(const rocksdb_backup_engine_options_t* options,
+ rocksdb_env_t* env, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup(
+ rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush(
+ rocksdb_backup_engine_t* be, rocksdb_t* db,
+ unsigned char flush_before_backup, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups(
+ rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t*
+rocksdb_restore_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy(
+ rocksdb_restore_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files(
+ rocksdb_restore_options_t* opt, int v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup(
+ rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t*
+rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count(
+ const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp(
+ const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id(
+ const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size(
+ const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close(
+ rocksdb_backup_engine_t* be);
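+
+/* A minimal sketch of a backup/restore cycle using the functions above
+   (error checks elided):
+
+     char* err = NULL;
+     rocksdb_backup_engine_t* be =
+         rocksdb_backup_engine_open(options, "/tmp/backups", &err);
+     rocksdb_backup_engine_create_new_backup(be, db, &err);
+     rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
+     rocksdb_backup_engine_restore_db_from_latest_backup(
+         be, "/tmp/testdb", "/tmp/testdb", ro, &err);
+     rocksdb_restore_options_destroy(ro);
+     rocksdb_backup_engine_close(be);
+*/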
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_increase_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* ts_low, size_t ts_lowlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ size_t* ts_lowlen, char** errptr);
+
+/* BackupEngineOptions */
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_options_t*
+rocksdb_backup_engine_options_create(const char* backup_dir);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_backup_dir(
+ rocksdb_backup_engine_options_t* options, const char* backup_dir);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_env(
+ rocksdb_backup_engine_options_t* options, rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_share_table_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_share_table_files(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_sync(
+ rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backup_engine_options_get_sync(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_destroy_old_data(
+ rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_destroy_old_data(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_backup_log_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_backup_log_files(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_max_background_operations(
+ rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_max_background_operations(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options, uint64_t size);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_destroy(
+ rocksdb_backup_engine_options_t*);
+
+/* Checkpoint */
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create(
+ rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy(
+ rocksdb_checkpoint_t* checkpoint);
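+
+/* A minimal sketch of taking a checkpoint (error checks elided); passing 0
+   for log_size_for_flush flushes the memtables first, so no WAL needs to be
+   copied into the checkpoint:
+
+     char* err = NULL;
+     rocksdb_checkpoint_t* cp = rocksdb_checkpoint_object_create(db, &err);
+     rocksdb_checkpoint_create(cp, "/tmp/checkpoint-1", 0, &err);
+     rocksdb_checkpoint_object_destroy(cp);
+*/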
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_and_trim_history(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+ size_t trim_tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_wal_file_exists, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families(
+ const rocksdb_options_t* options, const char* name, size_t* lencf,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy(
+ char** list, size_t len);
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family(rocksdb_t* db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family_with_ttl(
+ rocksdb_t* db, const rocksdb_options_t* column_family_options,
+ const char* column_family_name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy(
+ rocksdb_column_family_handle_t*);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_column_family_handle_get_id(rocksdb_column_family_handle_t* handle);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_handle_get_name(
+ rocksdb_column_family_handle_t* handle, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* start_key,
+ size_t start_key_len, const char* end_key, size_t end_key_len,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr);
+
+/* Returns NULL if not found. A malloc()ed array otherwise.
+ Stores the length of the array in *vallen. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+// If values_list[i] == NULL and errs[i] == NULL, the lookup returned
+// Status::IsNotFound(), which is not reported as an error.
+// All statuses other than ok() and IsNotFound() are returned as errors.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings, as opposed to the conventional single error,
+// where errs[i] is the status for the retrieval of keys_list[i].
+// Each non-NULL errs entry is a malloc()ed, null-terminated string.
+// Each non-NULL values_list entry is a malloc()ed array, with
+// its length stored in values_list_sizes[i].
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+ const char* const* keys_list, const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes, char** errs);
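+
+/* A minimal sketch of the calling convention above, for two keys:
+
+     const char* keys[2] = {"alpha", "beta"};
+     const size_t key_sizes[2] = {5, 4};
+     char* values[2];
+     size_t value_sizes[2];
+     char* errs[2];
+     rocksdb_multi_get(db, read_opts, 2, keys, key_sizes, values, value_sizes,
+                       errs);
+     for (int i = 0; i < 2; i++) {
+       if (errs[i] != NULL) {
+         free(errs[i]);  // a real error
+       } else if (values[i] != NULL) {
+         free(values[i]);  // found; use the value first, then free it
+       }  // NULL/NULL means not found
+     }
+*/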
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+ const char* const* keys_list, const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes, char** timestamp_list,
+ size_t* timestamp_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** timestamps_list,
+ size_t* timestamps_list_sizes, char** errs);
+
+// A MultiGet API that improves performance by batching operations in the
+// read path. Currently, only the block-based table format with full filters
+// is supported. Other table formats, such as plain table, block-based table
+// with block-based filters, and partitioned indexes, will still work, but
+// will not get any performance benefit.
+//
+// Note that all the keys passed to this API are restricted to a single
+// column family.
+//
+// Parameters -
+// db - the RocksDB instance.
+// options - ReadOptions
+// column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+// passed to the API are restricted to a single column family
+// num_keys - Number of keys to lookup
+// keys_list - Pointer to C style array of keys with num_keys elements
+// keys_list_sizes - Pointer to C style array of the size of corresponding key
+// in key_list with num_keys elements.
+// values - Pointer to C style array of PinnableSlices with num_keys elements
+// statuses - Pointer to C style array of Status with num_keys elements
+// sorted_input - If true, it means the input keys are already sorted by key
+// order, so the MultiGet() API doesn't have to sort them
+// again. If false, the keys will be copied and sorted
+// internally by the API - the input array will not be
+// modified
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, size_t num_keys,
+ const char* const* keys_list, const size_t* keys_list_sizes,
+ rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
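+
+/* A minimal sketch using pinnable slices, reusing keys/key_sizes from the
+   rocksdb_multi_get sketch above; rocksdb_pinnableslice_value() and
+   rocksdb_pinnableslice_destroy() are declared later in this header:
+
+     rocksdb_pinnableslice_t* values[2];
+     char* errs[2];
+     rocksdb_batched_multi_get_cf(db, read_opts, cf, 2, keys, key_sizes,
+                                  values, errs, false);
+     for (int i = 0; i < 2; i++) {
+       if (values[i] != NULL) {
+         size_t vlen;
+         const char* v = rocksdb_pinnableslice_value(values[i], &vlen);
+         // use v/vlen, then release the pinned block
+         rocksdb_pinnableslice_destroy(values[i]);
+       }
+       if (errs[i] != NULL) free(errs[i]);
+     }
+*/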
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t key_len, char** value, size_t* val_len, const char* timestamp,
+ size_t timestamp_len, unsigned char* value_found);
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t key_len, char** value, size_t* val_len, const char* timestamp,
+ size_t timestamp_len, unsigned char* value_found);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db, const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators(
+ rocksdb_t* db, rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators, size_t size, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot(
+ rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
+ rocksdb_t* db, const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+ Otherwise returns a pointer to a malloc()-ed null-terminated value. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
+ const char* propname);
+/* Returns 0 on success, -1 otherwise. */
+extern ROCKSDB_LIBRARY_API int rocksdb_property_int(
+ rocksdb_t* db, const char* propname, uint64_t* out_val);
+
+/* Returns 0 on success, -1 otherwise. */
+extern ROCKSDB_LIBRARY_API int rocksdb_property_int_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* propname);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes(
+ rocksdb_t* db, int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db,
+ const char* start_key,
+ size_t start_key_len,
+ const char* limit_key,
+ size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range(
+ rocksdb_t* db, const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt(
+ rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key,
+ size_t start_key_len, const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db,
+ const char* name);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles(
+ rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush(
+ rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf(
+ rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db,
+ unsigned char sync,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions(
+ rocksdb_t* db, unsigned char force, char** errptr);
+
+/* Management operations */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_repair_db(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+/* Iterator */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid(
+ const rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*,
+ const char* k, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*,
+ const char* k,
+ size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key(
+ const rocksdb_iterator_t*, size_t* klen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value(
+ const rocksdb_iterator_t*, size_t* vlen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp(
+ const rocksdb_iterator_t*, size_t* tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
+ const rocksdb_iterator_t*, char** errptr);
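+
+/* A minimal sketch of a full forward scan:
+
+     rocksdb_iterator_t* it = rocksdb_create_iterator(db, read_opts);
+     for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+          rocksdb_iter_next(it)) {
+       size_t klen, vlen;
+       const char* k = rocksdb_iter_key(it, &klen);    // not null-terminated
+       const char* v = rocksdb_iter_value(it, &vlen);  // valid until next move
+     }
+     char* err = NULL;
+     rocksdb_iter_get_error(it, &err);  // check for errors after the loop
+     rocksdb_iter_destroy(it);
+*/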
+
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(
+ rocksdb_wal_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid(
+ const rocksdb_wal_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status(
+ const rocksdb_wal_iterator_t* iter, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+ const rocksdb_wal_iterator_t* iter, uint64_t* seq);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_get_latest_sequence_number(rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy(
+ const rocksdb_wal_iterator_t* iter);
+
+/* Write batch */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(
+ void);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from(
+ const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy(
+ rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf_with_ts(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete(
+ rocksdb_writebatch_t* b, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf_with_ts(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range(
+ rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data(
+ rocksdb_writebatch_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
+ rocksdb_writebatch_t*, void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
+ rocksdb_writebatch_t*, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point(
+ rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point(
+ rocksdb_writebatch_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point(
+ rocksdb_writebatch_t*, char** errptr);
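+
+/* A minimal sketch of building a batch and committing it atomically:
+
+     rocksdb_writebatch_t* b = rocksdb_writebatch_create();
+     rocksdb_writebatch_put(b, "key1", 4, "val1", 4);
+     rocksdb_writebatch_delete(b, "key2", 4);
+     char* err = NULL;
+     rocksdb_write(db, write_opts, b, &err);  // applied all-or-nothing
+     rocksdb_writebatch_destroy(b);
+*/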
+
+/* Write batch with index */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create(size_t reserved_bytes,
+ unsigned char overwrite_keys);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create_from(const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy(
+ rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(
+ rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(
+ rocksdb_writebatch_wi_t* b);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(
+ rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(
+ rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(
+ rocksdb_writebatch_wi_t*, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete(
+ rocksdb_writebatch_wi_t*, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range(
+ rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len);
+// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len);
+// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev(
+ rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* start_keys_list, const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list, const size_t* end_keys_list_sizes);
+// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data(
+ rocksdb_writebatch_wi_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data(
+ rocksdb_writebatch_wi_t* b, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point(
+ rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point(
+ rocksdb_writebatch_wi_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch(
+ rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+ const char* key, size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+ size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* cf);
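+
+/* A minimal sketch of read-your-writes through the index (error checks
+   elided):
+
+     rocksdb_writebatch_wi_t* b = rocksdb_writebatch_wi_create(0, 1);
+     rocksdb_writebatch_wi_put(b, "key1", 4, "pending", 7);
+     size_t vlen;
+     char* err = NULL;
+     char* v = rocksdb_writebatch_wi_get_from_batch_and_db(
+         b, db, read_opts, "key1", 4, &vlen, &err);  // sees "pending"
+     free(v);
+     rocksdb_write_writebatch_wi(db, write_opts, b, &err);
+     rocksdb_writebatch_wi_destroy(b);
+*/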
+
+/* Options utils */
+
+// Load the latest rocksdb options from the specified db_path.
+//
+// On success, num_column_families will be updated with a non-zero
+// number indicating the number of column families.
+// The returned db_options, column_family_names, and column_family_options
+// should be released via rocksdb_load_latest_options_destroy().
+//
+// On error, *errptr is set to a malloc()ed error message, and db_options,
+// column_family_names, and column_family_options are set to NULL.
+extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options(
+ const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+ rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+ size_t* num_column_families, char*** column_family_names,
+ rocksdb_options_t*** column_family_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy(
+ rocksdb_options_t* db_options, char** list_column_family_names,
+ rocksdb_options_t** list_column_family_options, size_t len);
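+
+/* A minimal sketch of reopening a DB with its persisted options (env and
+   cache created elsewhere; error checks elided):
+
+     rocksdb_options_t* db_opts = NULL;
+     size_t cf_count = 0;
+     char** cf_names = NULL;
+     rocksdb_options_t** cf_opts = NULL;
+     char* err = NULL;
+     rocksdb_load_latest_options("/tmp/testdb", env, false, cache, &db_opts,
+                                 &cf_count, &cf_names, &cf_opts, &err);
+     // ... open with rocksdb_open_column_families(), then release:
+     rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts, cf_count);
+*/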
+
+/* Block based table options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_checksum(
+ rocksdb_block_based_table_options_t*, char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options,
+ int index_block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char partition_filters);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char use_delta_encoding);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char no_block_cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t*, int);
+enum {
+ rocksdb_block_based_table_index_type_binary_search = 0,
+ rocksdb_block_based_table_index_type_hash_search = 1,
+ rocksdb_block_based_table_index_type_two_level_index_search = 2,
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t*, int); // uses one of the above enums
+enum {
+ rocksdb_block_based_table_data_block_index_type_binary_search = 0,
+ rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1,
+};
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t*, int); // uses one of the above enums
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v);
+// rocksdb_block_based_options_set_hash_index_allow_collision() has been
+// removed, since BlockBasedTableOptions::hash_index_allow_collision has
+// been removed.
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);
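+
+/* A minimal sketch of a typical block-based table setup, using
+   rocksdb_cache_create_lru() and rocksdb_filterpolicy_create_bloom() from the
+   cache and filter policy sections of this header:
+
+     rocksdb_block_based_table_options_t* t =
+         rocksdb_block_based_options_create();
+     rocksdb_cache_t* cache = rocksdb_cache_create_lru(128 * 1024 * 1024);
+     rocksdb_block_based_options_set_block_cache(t, cache);
+     rocksdb_block_based_options_set_filter_policy(
+         t, rocksdb_filterpolicy_create_bloom(10));
+     rocksdb_block_based_options_set_cache_index_and_filter_blocks(t, 1);
+     rocksdb_options_set_block_based_table_factory(opt, t);
+*/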
+
+/* Cuckoo table options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy(
+ rocksdb_cuckoo_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options);
+
+/* Options */
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count,
+ const char* const keys[],
+ const char* const values[],
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+ const char* const keys[], const char* const values[], char** errptr);
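+
+/* A minimal sketch of changing a mutable option at runtime; the string keys
+   mirror the option field names (e.g. blob_file_starting_level, which is
+   dynamically changeable):
+
+     const char* keys[1] = {"blob_file_starting_level"};
+     const char* vals[1] = {"2"};
+     char* err = NULL;
+     rocksdb_set_options(db, 1, keys, vals, &err);
+*/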
+
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism(
+ rocksdb_options_t* opt, int total_threads);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup(
+ rocksdb_options_t* opt, uint64_t block_cache_size_mb);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter(
+ rocksdb_options_t*, rocksdb_compactionfilter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator(
+ rocksdb_options_t*, rocksdb_comparator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator(
+ rocksdb_options_t*, rocksdb_mergeoperator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level(
+ rocksdb_options_t* opt, const int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_create_missing_column_families(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_create_missing_column_families(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(
+ rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*,
+ rocksdb_env_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*,
+ rocksdb_logger_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_write_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_open_files(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size(
+ rocksdb_options_t* opt, uint64_t n);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options(
+ rocksdb_options_t*, int, int, int, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*,
+ int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*,
+ int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_compression_options_parallel_threads(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int,
+ int, int, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t*, int, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t*, unsigned char, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t*, uint64_t, unsigned char);
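+
+/* Usage sketch (illustrative; the sizes and level are assumptions). The four
+ * integer arguments to rocksdb_options_set_compression_options are window
+ * bits, level, strategy, and max dictionary bytes:
+ *
+ *   rocksdb_options_set_compression(opt, rocksdb_zstd_compression);
+ *   rocksdb_options_set_compression_options(opt, -14, 3, 0, 16 * 1024);
+ *   rocksdb_options_set_compression_options_zstd_max_train_bytes(
+ *       opt, 100 * 16 * 1024);
+ */
+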
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_target_file_size_base(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t*, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt);
+
+/* Blob Options Settings */
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files(
+ rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size(
+ rocksdb_options_t* opt, uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_min_blob_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size(
+ rocksdb_options_t* opt, uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_blob_file_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type(
+ rocksdb_options_t* opt, int val);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc(
+ rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff(
+ rocksdb_options_t* opt, double val);
+extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold(
+ rocksdb_options_t* opt, double val);
+extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+ uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level(
+ rocksdb_options_t* opt, int val);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache(
+ rocksdb_options_t* opt, rocksdb_cache_t* blob_cache);
+
+enum {
+ rocksdb_prepopulate_blob_disable = 0,
+ rocksdb_prepopulate_blob_flush_only = 1
+};
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prepopulate_blob_cache(
+ rocksdb_options_t* opt, int val);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_prepopulate_blob_cache(
+ rocksdb_options_t* opt);
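+
+/* Usage sketch (illustrative; the thresholds are assumptions):
+ *
+ *   rocksdb_options_set_enable_blob_files(opt, 1);
+ *   rocksdb_options_set_min_blob_size(opt, 4096);
+ *   rocksdb_options_set_enable_blob_gc(opt, 1);
+ *   rocksdb_options_set_blob_gc_age_cutoff(opt, 0.25);
+ *
+ * Values of at least min_blob_size bytes are written to blob files instead
+ * of SST files, and garbage collection targets roughly the oldest quarter of
+ * the blob files during compaction.
+ */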
+
+/* Returns a pointer to a malloc()-ed, null-terminated string. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
+ int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*,
+ int64_t);
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions(
+ rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_max_subcompactions(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_log_file_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_keep_log_file_num(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+ size_t v);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+ size_t v);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_arena_block_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir(
+ rocksdb_options_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*,
+ const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec(
+ rocksdb_options_t*, unsigned int);
+extern ROCKSDB_LIBRARY_API unsigned int
+rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec(
+ rocksdb_options_t*, unsigned int);
+extern ROCKSDB_LIBRARY_API unsigned int
+rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_advise_random_on_open(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_bytes_per_sync(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*,
+ uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*,
+ uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*,
+ double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep(
+ rocksdb_options_t*, size_t, int32_t, int32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory(
+ rocksdb_options_t*, uint32_t, int, double, size_t);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress(
+ rocksdb_options_t* opt, int level);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_successive_merges(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality(
+ rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_bloom_locality(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_inplace_update_support(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_avoid_unnecessary_blocking_io(rocksdb_options_t*);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_experimental_mempurge_threshold(rocksdb_options_t*);
+
+enum {
+ rocksdb_tolerate_corrupted_tail_records_recovery = 0,
+ rocksdb_absolute_consistency_recovery = 1,
+ rocksdb_point_in_time_recovery = 2,
+ rocksdb_skip_any_corrupted_records_recovery = 3
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode(
+ rocksdb_options_t*);
+
+enum {
+ rocksdb_no_compression = 0,
+ rocksdb_snappy_compression = 1,
+ rocksdb_zlib_compression = 2,
+ rocksdb_bz2_compression = 3,
+ rocksdb_lz4_compression = 4,
+ rocksdb_lz4hc_compression = 5,
+ rocksdb_xpress_compression = 6,
+ rocksdb_zstd_compression = 7
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression(
+ rocksdb_options_t*);
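+
+/* Usage sketch (illustrative; assumes the default num_levels of 7): lighter
+ * compression for hot levels, ZSTD at the bottom:
+ *
+ *   const int per_level[7] = {
+ *       rocksdb_no_compression,  rocksdb_no_compression,
+ *       rocksdb_lz4_compression, rocksdb_lz4_compression,
+ *       rocksdb_lz4_compression, rocksdb_lz4_compression,
+ *       rocksdb_zstd_compression};
+ *   rocksdb_options_set_compression_per_level(opt, per_level, 7);
+ */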
+
+enum {
+ rocksdb_level_compaction = 0,
+ rocksdb_universal_compaction = 1,
+ rocksdb_fifo_compaction = 2
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_universal_compaction_options(
+ rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter(
+ rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush(
+ rocksdb_options_t* opt, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache(
+ rocksdb_options_t* opt, rocksdb_cache_t* cache);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_compact_on_deletion_collector_factory(
+ rocksdb_options_t*, size_t window_size, size_t num_dels_trigger);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush(
+ rocksdb_options_t* opt, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush(
+ rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_compression(
+ rocksdb_options_t* opt, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression(
+ rocksdb_options_t* opt);
+
+/* RateLimiter */
+extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
+ int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness);
+extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(
+ rocksdb_ratelimiter_t*);
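+
+/* Usage sketch (illustrative; 16 MiB/s with the conventional 100 ms refill
+ * period and a fairness of 10):
+ *
+ *   rocksdb_ratelimiter_t* limiter =
+ *       rocksdb_ratelimiter_create(16 << 20, 100 * 1000, 10);
+ *   rocksdb_options_set_ratelimiter(opt, limiter);
+ *   rocksdb_ratelimiter_destroy(limiter);
+ *
+ * The options keep their own reference to the limiter, so the wrapper can be
+ * destroyed once it has been attached.
+ */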
+
+/* PerfContext */
+enum {
+ rocksdb_uninitialized = 0,
+ rocksdb_disable = 1,
+ rocksdb_enable_count = 2,
+ rocksdb_enable_time_except_for_mutex = 3,
+ rocksdb_enable_time = 4,
+ rocksdb_out_of_bounds = 5
+};
+
+enum {
+ rocksdb_user_key_comparison_count = 0,
+ rocksdb_block_cache_hit_count,
+ rocksdb_block_read_count,
+ rocksdb_block_read_byte,
+ rocksdb_block_read_time,
+ rocksdb_block_checksum_time,
+ rocksdb_block_decompress_time,
+ rocksdb_get_read_bytes,
+ rocksdb_multiget_read_bytes,
+ rocksdb_iter_read_bytes,
+ rocksdb_internal_key_skipped_count,
+ rocksdb_internal_delete_skipped_count,
+ rocksdb_internal_recent_skipped_count,
+ rocksdb_internal_merge_count,
+ rocksdb_get_snapshot_time,
+ rocksdb_get_from_memtable_time,
+ rocksdb_get_from_memtable_count,
+ rocksdb_get_post_process_time,
+ rocksdb_get_from_output_files_time,
+ rocksdb_seek_on_memtable_time,
+ rocksdb_seek_on_memtable_count,
+ rocksdb_next_on_memtable_count,
+ rocksdb_prev_on_memtable_count,
+ rocksdb_seek_child_seek_time,
+ rocksdb_seek_child_seek_count,
+ rocksdb_seek_min_heap_time,
+ rocksdb_seek_max_heap_time,
+ rocksdb_seek_internal_seek_time,
+ rocksdb_find_next_user_entry_time,
+ rocksdb_write_wal_time,
+ rocksdb_write_memtable_time,
+ rocksdb_write_delay_time,
+ rocksdb_write_pre_and_post_process_time,
+ rocksdb_db_mutex_lock_nanos,
+ rocksdb_db_condition_wait_nanos,
+ rocksdb_merge_operator_time_nanos,
+ rocksdb_read_index_block_nanos,
+ rocksdb_read_filter_block_nanos,
+ rocksdb_new_table_block_iter_nanos,
+ rocksdb_new_table_iterator_nanos,
+ rocksdb_block_seek_nanos,
+ rocksdb_find_table_nanos,
+ rocksdb_bloom_memtable_hit_count,
+ rocksdb_bloom_memtable_miss_count,
+ rocksdb_bloom_sst_hit_count,
+ rocksdb_bloom_sst_miss_count,
+ rocksdb_key_lock_wait_time,
+ rocksdb_key_lock_wait_count,
+ rocksdb_env_new_sequential_file_nanos,
+ rocksdb_env_new_random_access_file_nanos,
+ rocksdb_env_new_writable_file_nanos,
+ rocksdb_env_reuse_writable_file_nanos,
+ rocksdb_env_new_random_rw_file_nanos,
+ rocksdb_env_new_directory_nanos,
+ rocksdb_env_file_exists_nanos,
+ rocksdb_env_get_children_nanos,
+ rocksdb_env_get_children_file_attributes_nanos,
+ rocksdb_env_delete_file_nanos,
+ rocksdb_env_create_dir_nanos,
+ rocksdb_env_create_dir_if_missing_nanos,
+ rocksdb_env_delete_dir_nanos,
+ rocksdb_env_get_file_size_nanos,
+ rocksdb_env_get_file_modification_time_nanos,
+ rocksdb_env_rename_file_nanos,
+ rocksdb_env_link_file_nanos,
+ rocksdb_env_lock_file_nanos,
+ rocksdb_env_unlock_file_nanos,
+ rocksdb_env_new_logger_nanos,
+ rocksdb_number_async_seek,
+ rocksdb_blob_cache_hit_count,
+ rocksdb_blob_read_count,
+ rocksdb_blob_read_byte,
+ rocksdb_blob_read_time,
+ rocksdb_blob_checksum_time,
+ rocksdb_blob_decompress_time,
+ rocksdb_internal_range_del_reseek_count,
+ rocksdb_total_metric_count = 78
+};
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
+extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset(
+ rocksdb_perfcontext_t* context);
+extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report(
+ rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy(
+ rocksdb_perfcontext_t* context);
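+
+/* Usage sketch (illustrative; error handling elided). Counters are collected
+ * per thread between reset and report:
+ *
+ *   rocksdb_set_perf_level(rocksdb_enable_time);
+ *   rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+ *   rocksdb_perfcontext_reset(ctx);
+ *   ...some reads or writes on the same thread...
+ *   uint64_t wal = rocksdb_perfcontext_metric(ctx, rocksdb_write_wal_time);
+ *   char* report = rocksdb_perfcontext_report(ctx, 1);
+ *   rocksdb_free(report);
+ *   rocksdb_perfcontext_destroy(ctx);
+ *   rocksdb_set_perf_level(rocksdb_disable);
+ *
+ * A nonzero second argument to rocksdb_perfcontext_report excludes counters
+ * that are zero.
+ */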
+
+/* Compaction Filter */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t*
+rocksdb_compactionfilter_create(
+ void* state, void (*destructor)(void*),
+ unsigned char (*filter)(void*, int level, const char* key,
+ size_t key_length, const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy(
+ rocksdb_compactionfilter_t*);
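+
+/* Usage sketch (illustrative; the callback and helper names are
+ * hypothetical). Returning 1 from the filter removes the entry during
+ * compaction; *value_changed must be set when a new_value is produced:
+ *
+ *   static void noop_destructor(void* state) { (void)state; }
+ *
+ *   static unsigned char drop_tmp_keys(void* state, int level,
+ *                                      const char* key, size_t key_length,
+ *                                      const char* existing_value,
+ *                                      size_t value_length, char** new_value,
+ *                                      size_t* new_value_length,
+ *                                      unsigned char* value_changed) {
+ *     *value_changed = 0;
+ *     return key_length >= 4 && memcmp(key, "tmp_", 4) == 0;
+ *   }
+ *
+ *   static const char* drop_tmp_name(void* state) { return "drop_tmp"; }
+ *
+ *   rocksdb_compactionfilter_t* filter = rocksdb_compactionfilter_create(
+ *       NULL, noop_destructor, drop_tmp_keys, drop_tmp_name);
+ *   rocksdb_options_set_compaction_filter(opt, filter);
+ */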
+
+/* Compaction Filter Context */
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context);
+
+/* Compaction Filter Factory */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t*
+rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t*);
+
+/* Comparator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy(
+ rocksdb_comparator_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t*
+rocksdb_comparator_with_ts_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+ size_t b_tslen),
+ int (*compare_without_ts)(void*, const char* a, size_t alen,
+ unsigned char a_has_ts, const char* b,
+ size_t blen, unsigned char b_has_ts),
+ const char* (*name)(void*), size_t timestamp_size);
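+
+/* Usage sketch (illustrative; the callbacks are hypothetical and
+ * noop_destructor is as in the compaction filter sketch above). A comparator
+ * must define a total order; this one reverses byte-wise ordering:
+ *
+ *   static int reverse_compare(void* state, const char* a, size_t alen,
+ *                              const char* b, size_t blen) {
+ *     size_t n = alen < blen ? alen : blen;
+ *     int r = memcmp(a, b, n);
+ *     if (r == 0) r = alen < blen ? -1 : alen > blen ? 1 : 0;
+ *     return -r;
+ *   }
+ *
+ *   static const char* reverse_name(void* state) { return "reverse"; }
+ *
+ *   rocksdb_comparator_t* cmp = rocksdb_comparator_create(
+ *       NULL, noop_destructor, reverse_compare, reverse_name);
+ *   rocksdb_options_set_comparator(opt, cmp);
+ */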
+
+/* Filter policy */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy(
+ rocksdb_filterpolicy_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom(double bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom_full(double bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key,
+ int bloom_before_level);
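+
+/* Usage sketch (illustrative; ~10 bits per key is a common choice). Filter
+ * policies are attached through the block-based table options declared
+ * earlier in this header:
+ *
+ *   rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10.0);
+ *   rocksdb_block_based_options_set_filter_policy(bbto, policy);
+ *
+ * The table options take ownership of the policy, so it is not destroyed
+ * separately after this call.
+ */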
+
+/* Merge Operator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t*
+rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy(
+ rocksdb_mergeoperator_t*);
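+
+/* Usage sketch (illustrative). Instead of supplying the callbacks above, the
+ * built-in 64-bit add operator can be installed with
+ * rocksdb_options_set_uint64add_merge_operator (declared earlier in this
+ * file); merge operands are then treated as little-endian uint64 increments:
+ *
+ *   rocksdb_options_set_uint64add_merge_operator(opt);
+ *   ...open db with opt...
+ *   uint64_t one = 1;
+ *   char* err = NULL;
+ *   rocksdb_merge(db, wopts, "counter", 7, (const char*)&one, 8, &err);
+ */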
+
+/* Read options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot(
+ rocksdb_readoptions_t*, const rocksdb_snapshot_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound(
+ rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound(
+ rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier(
+ rocksdb_readoptions_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing(
+ rocksdb_readoptions_t*);
+// The functionality that this option controlled has been removed.
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size(
+ rocksdb_readoptions_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*,
+ uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline(
+ rocksdb_readoptions_t*, uint64_t microseconds);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout(
+ rocksdb_readoptions_t*, uint64_t microseconds);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp(
+ rocksdb_readoptions_t*, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts(
+ rocksdb_readoptions_t*, const char* ts, size_t tslen);
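+
+/* Usage sketch (illustrative; the key prefix is an assumption). The bound
+ * buffer is not copied by the C wrapper, so it must outlive every use of the
+ * read options:
+ *
+ *   rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
+ *   static const char upper[] = "user~";
+ *   rocksdb_readoptions_set_iterate_upper_bound(ro, upper, sizeof(upper) - 1);
+ *   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
+ *   for (rocksdb_iter_seek(it, "user_", 5); rocksdb_iter_valid(it);
+ *        rocksdb_iter_next(it)) {
+ *     ...the iterator never yields keys at or past "user~"...
+ *   }
+ *   rocksdb_iter_destroy(it);
+ *   rocksdb_readoptions_destroy(ro);
+ */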
+
+/* Write options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL(
+ rocksdb_writeoptions_t* opt, int disable);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL(
+ rocksdb_writeoptions_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_writeoptions_get_ignore_missing_column_families(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t*);
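+
+/* Usage sketch (illustrative): the two common durability settings.
+ *
+ *   rocksdb_writeoptions_t* wo = rocksdb_writeoptions_create();
+ *   rocksdb_writeoptions_set_sync(wo, 1);
+ *
+ * syncs the WAL before a write returns, while
+ *
+ *   rocksdb_writeoptions_disable_WAL(wo, 1);
+ *
+ * skips the WAL entirely, trading crash durability for write throughput.
+ */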
+
+/* Compact range options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t*
+rocksdb_compactoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy(
+ rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_exclusive_manual_compaction(
+ rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_bottommost_level_compaction(
+ rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level(
+ rocksdb_compactoptions_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level(
+ rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_full_history_ts_low(
+ rocksdb_compactoptions_t*, char* ts, size_t tslen);
+
+/* Flush options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy(
+ rocksdb_flushoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait(
+ rocksdb_flushoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait(
+ rocksdb_flushoptions_t*);
+
+/* Memory allocator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t*
+rocksdb_jemalloc_nodump_allocator_create(char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy(
+ rocksdb_memory_allocator_t*);
+
+/* Cache */
+
+extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t*
+rocksdb_lru_cache_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy(
+ rocksdb_lru_cache_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity(
+ rocksdb_lru_cache_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits(
+ rocksdb_lru_cache_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator(
+ rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru(
+ size_t capacity);
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t*
+rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity);
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts(
+ rocksdb_lru_cache_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data(
+ rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity(
+ rocksdb_cache_t* cache, size_t capacity);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_capacity(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_usage(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache);
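+
+/* Usage sketch (illustrative; capacity and shard bits are assumptions). The
+ * resulting cache is typically installed as the block cache via
+ * rocksdb_block_based_options_set_block_cache, declared earlier in this file:
+ *
+ *   rocksdb_lru_cache_options_t* co = rocksdb_lru_cache_options_create();
+ *   rocksdb_lru_cache_options_set_capacity(co, 128 << 20);
+ *   rocksdb_lru_cache_options_set_num_shard_bits(co, 6);
+ *   rocksdb_cache_t* cache = rocksdb_cache_create_lru_opts(co);
+ *   rocksdb_lru_cache_options_destroy(co);
+ *   rocksdb_block_based_options_set_block_cache(bbto, cache);
+ */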
+
+/* DBPath */
+
+extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(
+ const char* path, uint64_t target_size);
+extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*);
+
+/* Env */
+
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void);
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads(
+ rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads(
+ rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy(
+ rocksdb_envoptions_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_create_dir_if_missing(
+ rocksdb_env_t* env, const char* path, char** errptr);
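+
+/* Usage sketch (illustrative; the thread counts are assumptions):
+ *
+ *   rocksdb_env_t* env = rocksdb_create_default_env();
+ *   rocksdb_env_set_background_threads(env, 4);
+ *   rocksdb_env_set_high_priority_background_threads(env, 2);
+ *   rocksdb_options_set_env(opt, env);
+ *
+ * The env must outlive every DB opened with these options.
+ */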
+
+/* SstFile */
+
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env,
+ const rocksdb_options_t* io_options);
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* comparator);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open(
+ rocksdb_sstfilewriter_t* writer, const char* name, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put_with_ts(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* ts, size_t tslen, const char* val, size_t vallen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_with_ts(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* ts, size_t tslen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_range(
+ rocksdb_sstfilewriter_t* writer, const char* begin_key, size_t begin_keylen,
+ const char* end_key, size_t end_keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish(
+ rocksdb_sstfilewriter_t* writer, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size(
+ rocksdb_sstfilewriter_t* writer, uint64_t* file_size);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy(
+ rocksdb_sstfilewriter_t* writer);
+extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary(
+ rocksdb_t* db, char** errptr);
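+
+/* Usage sketch (illustrative; the path is an assumption and error checks are
+ * elided). Keys must be added in ascending order of the comparator:
+ *
+ *   char* err = NULL;
+ *   rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
+ *   rocksdb_sstfilewriter_t* writer =
+ *       rocksdb_sstfilewriter_create(env_opts, opt);
+ *   rocksdb_sstfilewriter_open(writer, "/tmp/bulk.sst", &err);
+ *   rocksdb_sstfilewriter_put(writer, "k1", 2, "v1", 2, &err);
+ *   rocksdb_sstfilewriter_put(writer, "k2", 2, "v2", 2, &err);
+ *   rocksdb_sstfilewriter_finish(writer, &err);
+ *
+ *   const char* files[] = {"/tmp/bulk.sst"};
+ *   rocksdb_ingestexternalfileoptions_t* ingest_opts =
+ *       rocksdb_ingestexternalfileoptions_create();
+ *   rocksdb_ingest_external_file(db, files, 1, ingest_opts, &err);
+ *   rocksdb_ingestexternalfileoptions_destroy(ingest_opts);
+ *   rocksdb_sstfilewriter_destroy(writer);
+ *   rocksdb_envoptions_destroy(env_opts);
+ */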
+
+/* SliceTransform */
+
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create(
+ void* state, void (*destructor)(void*),
+ char* (*transform)(void*, const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(void*, const char* key, size_t length),
+ unsigned char (*in_range)(void*, const char* key, size_t length),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+ rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create_noop(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
+ rocksdb_slicetransform_t*);
+
+/* Universal Compaction options */
+
+enum {
+ rocksdb_similar_size_compaction_stop_style = 0,
+ rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_size_ratio(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_min_merge_width(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_max_merge_width(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_compression_size_percent(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_stop_style(
+ rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t*
+rocksdb_fifo_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_fifo_compaction_options_get_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
+ const rocksdb_livefiles_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
+ const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
+ const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
+ const rocksdb_livefiles_t*);
+
+/* Utility Helpers */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
+ const rocksdb_options_t* base_options, const char* opts_str,
+ rocksdb_options_t* new_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range(
+ rocksdb_t* db, const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr);
+
+/* MetaData */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata(rocksdb_t* db);
+
+/**
+ * Returns the rocksdb_column_family_metadata_t of the specified
+ * column family.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_column_family_metadata_destroy.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_metadata_destroy(
+ rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_column_family_metadata_get_size(
+ rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t rocksdb_column_family_metadata_get_file_count(
+ rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_metadata_get_name(
+ rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_column_family_metadata_get_level_count(
+ rocksdb_column_family_metadata_t* cf_meta);
+
+/**
+ * Returns the rocksdb_level_metadata_t of the ith level from the specified
+ * column family metadata.
+ *
+ * If the specified i is greater than or equal to the number of levels
+ * in the specified column family, then NULL will be returned.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_level_metadata_destroy before releasing its parent
+ * rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_level_metadata_t*
+rocksdb_column_family_metadata_get_level_metadata(
+ rocksdb_column_family_metadata_t* cf_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_level_metadata_t.
+ *
+ * Note that the specified rocksdb_level_metadata_t must be released
+ * before the release of its parent rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_level_metadata_destroy(
+ rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_level_metadata_get_level(
+ rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_level_metadata_get_file_count(rocksdb_level_metadata_t* level_meta);
+
+/**
+ * Returns the rocksdb_sst_file_metadata_t of the ith file from the specified
+ * level metadata.
+ *
+ * If the specified i is greater than or equal to the number of files
+ * in the specified level, then NULL will be returned.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_sst_file_metadata_destroy before releasing its
+ * parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_sst_file_metadata_t*
+rocksdb_level_metadata_get_sst_file_metadata(
+ rocksdb_level_metadata_t* level_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_sst_file_metadata_t.
+ *
+ * Note that the specified rocksdb_sst_file_metadata_t must be released
+ * before the release of its parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_metadata_destroy(
+ rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API char*
+rocksdb_sst_file_metadata_get_relative_filename(
+ rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta);
+
+/**
+ * Returns the smallest key of the specified SST file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of the SST file whose smallest key is
+ * returned.
+ * @param len out parameter that receives the length of the returned key.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_smallestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* len);
+
+/**
+ * Returns the largest key of the specified SST file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of the SST file whose largest key is
+ * returned.
+ * @param len out parameter that receives the length of the returned key.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_largestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* len);
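+
+/* Usage sketch (illustrative; error checks and key handling elided). Each
+ * child object is destroyed before its parent, as the notes above require:
+ *
+ *   rocksdb_column_family_metadata_t* cf =
+ *       rocksdb_get_column_family_metadata(db);
+ *   size_t levels = rocksdb_column_family_metadata_get_level_count(cf);
+ *   for (size_t l = 0; l < levels; l++) {
+ *     rocksdb_level_metadata_t* lm =
+ *         rocksdb_column_family_metadata_get_level_metadata(cf, l);
+ *     size_t files = rocksdb_level_metadata_get_file_count(lm);
+ *     for (size_t f = 0; f < files; f++) {
+ *       rocksdb_sst_file_metadata_t* fm =
+ *           rocksdb_level_metadata_get_sst_file_metadata(lm, f);
+ *       uint64_t bytes = rocksdb_sst_file_metadata_get_size(fm);
+ *       rocksdb_sst_file_metadata_destroy(fm);
+ *     }
+ *     rocksdb_level_metadata_destroy(lm);
+ *   }
+ *   rocksdb_column_family_metadata_destroy(cf);
+ */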
+
+/* Transactions */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t*
+rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value(
+ rocksdb_transactiondb_t* db, const char* propname);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int(
+ rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t**
+rocksdb_transactiondb_get_prepared_transactions(rocksdb_transactiondb_t* txn_db,
+ size_t* cnt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_name(
+ rocksdb_transaction_t* txn, const char* name, size_t name_len,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_name(
+ rocksdb_transaction_t* txn, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_prepare(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint(
+ rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy(
+ rocksdb_transaction_t* txn);
+
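+// Illustrative sketch (not part of the upstream header): the basic lifecycle
+// of a pessimistic transaction. `txn_db`, `wopts` and `topts` are assumed to
+// have been created by the caller; errors follow the errptr convention.
+//
+//   char* err = NULL;
+//   rocksdb_transaction_t* txn =
+//       rocksdb_transaction_begin(txn_db, wopts, topts, NULL);
+//   rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
+//   if (err == NULL) {
+//     rocksdb_transaction_commit(txn, &err);  // may fail, e.g. on conflict
+//   }
+//   if (err != NULL) {
+//     rocksdb_free(err);  // error strings are malloc()ed
+//     err = NULL;
+//     rocksdb_transaction_rollback(txn, &err);
+//   }
+//   rocksdb_transaction_destroy(txn);
+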
+// The returned rocksdb_writebatch_wi_t should be freed with rocksdb_free
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_transaction_get_writebatch_wi(rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch(
+ rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch_wi(
+ rocksdb_transaction_t* txn, rocksdb_writebatch_wi_t* wi, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_commit_timestamp(
+ rocksdb_transaction_t* txn, uint64_t commit_timestamp);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_set_read_timestamp_for_validation(
+ rocksdb_transaction_t* txn, uint64_t read_timestamp);
+
+// This snapshot should be freed using rocksdb_free
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_cf(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, unsigned char exclusive,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_for_update(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ unsigned char exclusive,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ unsigned char exclusive, char** errptr);
+
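+// Illustrative sketch (not part of the upstream header): reading a key under
+// an exclusive lock so that other writers cannot invalidate a later write in
+// the same transaction. `txn` and `ropts` are assumed to exist.
+//
+//   char* err = NULL;
+//   size_t vlen = 0;
+//   char* val = rocksdb_transaction_get_for_update(
+//       txn, ropts, "counter", 7, &vlen, /*exclusive=*/1, &err);
+//   if (err == NULL && val != NULL) {
+//     // ... compute the new value and rocksdb_transaction_put() it ...
+//     rocksdb_free(val);  // returned values are malloc()ed
+//   }
+//   if (err != NULL) rocksdb_free(err);
+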
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transactiondb_get_pinned(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transactiondb_get_pinned_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close(
+ rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal(
+ rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options,
+ const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db(
+ rocksdb_t* base_db);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t*
+rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_optimistictransactiondb_checkpoint_object_create(
+ rocksdb_optimistictransactiondb_t* otxn_db, char** errptr);
+
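+// Illustrative sketch (not part of the upstream header): an optimistic
+// transaction checks for write conflicts at commit time rather than taking
+// locks up front. `opts`, `wopts` and `oopts` are assumed to exist, and the
+// path is hypothetical.
+//
+//   char* err = NULL;
+//   rocksdb_optimistictransactiondb_t* odb =
+//       rocksdb_optimistictransactiondb_open(opts, "/tmp/otxn_db", &err);
+//   if (odb != NULL) {
+//     rocksdb_transaction_t* txn = rocksdb_optimistictransaction_begin(
+//         odb, wopts, oopts, NULL);
+//     rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
+//     if (err == NULL) {
+//       // Conflicts with other writers surface here as a Busy error.
+//       rocksdb_transaction_commit(txn, &err);
+//     }
+//     rocksdb_transaction_destroy(txn);
+//     rocksdb_optimistictransactiondb_close(odb);
+//   }
+//   if (err != NULL) rocksdb_free(err);
+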
+/* Transaction Options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t*
+rocksdb_transactiondb_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy(
+ rocksdb_transactiondb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t*
+rocksdb_transaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy(
+ rocksdb_transaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_skip_prepare(
+ rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value(
+ rocksdb_optimistictransactiondb_t* db, const char* propname);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int(
+ rocksdb_optimistictransactiondb_t* db, const char* propname,
+ uint64_t* out_val);
+
+// Referring to convention (3), this should be used by clients
+// to free memory that was malloc()ed
+extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy(
+ rocksdb_pinnableslice_t* v);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value(
+ const rocksdb_pinnableslice_t* t, size_t* vlen);
+
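+// Illustrative sketch (not part of the upstream header): a pinned read
+// avoids copying the value out of the block cache. A NULL result is assumed
+// to mean the key was not found; `db` and `ropts` are assumed to exist.
+//
+//   char* err = NULL;
+//   rocksdb_pinnableslice_t* p = rocksdb_get_pinned(db, ropts, "k", 1, &err);
+//   if (err == NULL && p != NULL) {
+//     size_t vlen = 0;
+//     const char* v = rocksdb_pinnableslice_value(p, &vlen);
+//     // ... use v[0..vlen) while p is alive ...
+//     rocksdb_pinnableslice_destroy(p);  // unpins the value
+//   }
+//   if (err != NULL) rocksdb_free(err);
+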
+extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t*
+rocksdb_memory_consumers_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db(
+ rocksdb_memory_consumers_t* consumers, rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache(
+ rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy(
+ rocksdb_memory_consumers_t* consumers);
+extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t*
+rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy(
+ rocksdb_memory_usage_t* usage);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage);
+
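+// Illustrative sketch (not part of the upstream header): estimating the
+// memory used by a DB and its block cache. `db` and `cache` are assumed to
+// exist.
+//
+//   char* err = NULL;
+//   rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
+//   rocksdb_memory_consumers_add_db(consumers, db);
+//   rocksdb_memory_consumers_add_cache(consumers, cache);
+//   rocksdb_memory_usage_t* usage =
+//       rocksdb_approximate_memory_usage_create(consumers, &err);
+//   if (usage != NULL) {
+//     uint64_t memtables =
+//         rocksdb_approximate_memory_usage_get_mem_table_total(usage);
+//     (void)memtables;  // e.g. export to monitoring
+//     rocksdb_approximate_memory_usage_destroy(usage);
+//   }
+//   if (err != NULL) rocksdb_free(err);
+//   rocksdb_memory_consumers_destroy(consumers);
+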
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats(
+ rocksdb_options_t*, unsigned char);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*,
+ unsigned char);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work(
+ rocksdb_t* db, unsigned char wait);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction(
+ rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h
new file mode 100644
index 000000000..575d276b5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache.h
@@ -0,0 +1,775 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values. It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads. It may automatically evict entries to make room
+// for new entries. Values have a specified charge against the cache
+// capacity. For example, a cache where the values are variable-length
+// strings may use the length of the string as the charge for the
+// string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided. Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ConfigOptions;
+class Logger;
+class SecondaryCache;
+
+// Classifications of block cache entries.
+//
+// Developer notes: Adding a new enum to this class requires corresponding
+// updates to `kCacheEntryRoleToCamelString` and
+// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since
+// `kNumCacheEntryRoles` assumes `kMisc` comes last.
+enum class CacheEntryRole {
+ // Block-based table data block
+ kDataBlock,
+ // Block-based table filter block (full or partitioned)
+ kFilterBlock,
+ // Block-based table metadata block for partitioned filter
+ kFilterMetaBlock,
+ // OBSOLETE / DEPRECATED: old/removed block-based filter
+ kDeprecatedFilterBlock,
+ // Block-based table index block
+ kIndexBlock,
+ // Other kinds of block-based table block
+ kOtherBlock,
+ // WriteBufferManager's charge to account for its memtable usage
+ kWriteBuffer,
+ // Compression dictionary building buffer's charge to account for
+ // its memory usage
+ kCompressionDictionaryBuildingBuffer,
+ // Filter's charge to account for
+ // (new) bloom and ribbon filter construction's memory usage
+ kFilterConstruction,
+ // BlockBasedTableReader's charge to account for its memory usage
+ kBlockBasedTableReader,
+ // FileMetadata's charge to account for its memory usage
+ kFileMetadata,
+ // Blob value (when using the same cache as block cache and blob cache)
+ kBlobValue,
+ // Blob cache's charge to account for its memory usage (when using a
+ // separate block cache and blob cache)
+ kBlobCache,
+ // Default bucket, for miscellaneous cache entries. Do not use for
+ // entries that could potentially add up to large usage.
+ kMisc,
+};
+constexpr uint32_t kNumCacheEntryRoles =
+ static_cast<uint32_t>(CacheEntryRole::kMisc) + 1;
+
+// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`.
+const std::string& GetCacheEntryRoleName(CacheEntryRole);
+
+// For use with `GetMapProperty()` for property
+// `DB::Properties::kBlockCacheEntryStats`. On success, the map will
+// be populated with all keys that can be obtained from these functions.
+struct BlockCacheEntryStatsMapKeys {
+ static const std::string& CacheId();
+ static const std::string& CacheCapacityBytes();
+ static const std::string& LastCollectionDurationSeconds();
+ static const std::string& LastCollectionAgeSeconds();
+
+ static std::string EntryCount(CacheEntryRole);
+ static std::string UsedBytes(CacheEntryRole);
+ static std::string UsedPercent(CacheEntryRole);
+};
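+
+// Illustrative sketch (not part of the upstream header): reading the entry
+// stats through DB::GetMapProperty(). `db` is assumed to be an open DB whose
+// block cache collects entry stats.
+//
+//   std::map<std::string, std::string> stats;
+//   if (db->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &stats)) {
+//     const std::string& used = stats[BlockCacheEntryStatsMapKeys::UsedBytes(
+//         CacheEntryRole::kDataBlock)];
+//     // `used` holds the data-block usage (the map values are strings).
+//   }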
+
+extern const bool kDefaultToAdaptiveMutex;
+
+enum CacheMetadataChargePolicy {
+ // Only the `charge` of each entry inserted into a Cache counts against
+ // the `capacity`
+ kDontChargeCacheMetadata,
+ // In addition to the `charge`, the approximate space overheads in the
+ // Cache (in bytes) also count against `capacity`. These space overheads
+ // are for supporting fast Lookup and managing the lifetime of entries.
+ kFullChargeCacheMetadata
+};
+const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy =
+ kFullChargeCacheMetadata;
+
+// Options shared between various cache implementations that
+// divide the key space into shards using hashing.
+struct ShardedCacheOptions {
+ // Capacity of the cache, in the same units as the `charge` of each entry.
+ // This is typically measured in bytes, but can be a different unit if using
+ // kDontChargeCacheMetadata.
+ size_t capacity = 0;
+
+ // Cache is sharded into 2^num_shard_bits shards, by hash of key.
+ // If < 0, a good default is chosen based on the capacity and the
+ // implementation. (Mutex-based implementations are much more reliant
+ // on many shards for parallel scalability.)
+ int num_shard_bits = -1;
+
+ // If strict_capacity_limit is set, Insert() will fail if there is not
+ // enough capacity for the new entry along with all the existing referenced
+ // (pinned) cache entries. (Unreferenced cache entries are evicted as
+ // needed, sometimes immediately.) If strict_capacity_limit == false
+ // (default), Insert() never fails.
+ bool strict_capacity_limit = false;
+
+ // If non-nullptr, RocksDB will use this allocator instead of system
+ // allocator when allocating memory for cache blocks.
+ //
+ // Caveat: when the cache is used as block cache, the memory allocator is
+ // ignored when dealing with compression libraries that allocate memory
+ // internally (currently only XPRESS).
+ std::shared_ptr<MemoryAllocator> memory_allocator;
+
+ // See CacheMetadataChargePolicy
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy;
+
+ ShardedCacheOptions() {}
+ ShardedCacheOptions(
+ size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy)
+ : capacity(_capacity),
+ num_shard_bits(_num_shard_bits),
+ strict_capacity_limit(_strict_capacity_limit),
+ memory_allocator(std::move(_memory_allocator)),
+ metadata_charge_policy(_metadata_charge_policy) {}
+};
+
+struct LRUCacheOptions : public ShardedCacheOptions {
+ // Ratio of cache reserved for high-priority and low-priority entries,
+ // respectively. (See Cache::Priority below for more information on the
+ // levels.)
+ // Valid values are between 0 and 1 (inclusive), and the sum of the two
+ // values cannot exceed 1.
+ //
+ // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU
+ // list is maintained by the cache. Similarly, if low_pri_pool_ratio is
+ // greater than zero, a dedicated low-priority LRU list is maintained.
+ // There is also a bottom-priority LRU list, which is always enabled and not
+ // explicitly configurable. Entries are spilled over to the next available
+ // lower-priority pool if a certain pool's capacity is exceeded.
+ //
+ // Entries with cache hits are inserted into the highest priority LRU list
+ // available regardless of the entry's priority. Entries without hits
+ // are inserted into the highest priority LRU list available whose priority
+ // does not exceed the entry's priority. (For example, high-priority items
+ // with no hits are placed in the high-priority pool if available;
+ // otherwise, they are placed in the low-priority pool if available;
+ // otherwise, they are placed in the bottom-priority pool.) This results
+ // in lower-priority entries without hits getting evicted from the cache
+ // sooner.
+ //
+ // Default values: high_pri_pool_ratio = 0.5 (which is referred to as
+ // "midpoint insertion"), low_pri_pool_ratio = 0
+ double high_pri_pool_ratio = 0.5;
+ double low_pri_pool_ratio = 0.0;
+
+ // Whether to use adaptive mutexes for cache shards. Note that adaptive
+ // mutexes need to be supported by the platform in order for this to have any
+ // effect. The default value is true if RocksDB is compiled with
+ // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
+
+ // A SecondaryCache instance to use as the non-volatile tier.
+ std::shared_ptr<SecondaryCache> secondary_cache;
+
+ LRUCacheOptions() {}
+ LRUCacheOptions(size_t _capacity, int _num_shard_bits,
+ bool _strict_capacity_limit, double _high_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy,
+ double _low_pri_pool_ratio = 0.0)
+ : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+ std::move(_memory_allocator),
+ _metadata_charge_policy),
+ high_pri_pool_ratio(_high_pri_pool_ratio),
+ low_pri_pool_ratio(_low_pri_pool_ratio),
+ use_adaptive_mutex(_use_adaptive_mutex) {}
+};
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^num_shard_bits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. If strict_capacity_limit
+// is set, inserts into the cache will fail when the cache is full. Users can
+// also set the percentage of the cache reserved for high-priority entries
+// via high_pri_pool_ratio.
+// num_shard_bits = -1 means it is automatically determined: every shard
+// will be at least 512KB and number of shard bits will not exceed 6.
+extern std::shared_ptr<Cache> NewLRUCache(
+ size_t capacity, int num_shard_bits = -1,
+ bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+ std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy,
+ double low_pri_pool_ratio = 0.0);
+
+extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
+
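+// Illustrative sketch (not part of the upstream header): using an LRU cache
+// as the block cache. BlockBasedTableOptions and NewBlockBasedTableFactory
+// are declared in rocksdb/table.h.
+//
+//   LRUCacheOptions cache_opts;
+//   cache_opts.capacity = 512 << 20;  // 512 MiB
+//   std::shared_ptr<Cache> cache = NewLRUCache(cache_opts);
+//   BlockBasedTableOptions table_opts;
+//   table_opts.block_cache = cache;
+//   Options options;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_opts));
+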
+// EXPERIMENTAL
+// Options structure for configuring a SecondaryCache instance based on
+// LRUCache. The LRUCacheOptions.secondary_cache is not used and
+// should not be set.
+struct CompressedSecondaryCacheOptions : LRUCacheOptions {
+ // The compression method (if any) that is used to compress data.
+ CompressionType compression_type = CompressionType::kLZ4Compression;
+
+ // compress_format_version can have two values:
+ // compress_format_version == 1 -- decompressed size is not included in the
+ // block header.
+ // compress_format_version == 2 -- decompressed size is included in the block
+ // header in varint32 format.
+ uint32_t compress_format_version = 2;
+
+ // Enable the custom split and merge feature, which splits the compressed
+ // value into chunks so that they may better fit jemalloc bins.
+ bool enable_custom_split_merge = false;
+
+ CompressedSecondaryCacheOptions() {}
+ CompressedSecondaryCacheOptions(
+ size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+ double _high_pri_pool_ratio, double _low_pri_pool_ratio = 0.0,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy,
+ CompressionType _compression_type = CompressionType::kLZ4Compression,
+ uint32_t _compress_format_version = 2,
+ bool _enable_custom_split_merge = false)
+ : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+ _high_pri_pool_ratio, std::move(_memory_allocator),
+ _use_adaptive_mutex, _metadata_charge_policy,
+ _low_pri_pool_ratio),
+ compression_type(_compression_type),
+ compress_format_version(_compress_format_version),
+ enable_custom_split_merge(_enable_custom_split_merge) {}
+};
+
+// EXPERIMENTAL
+// Create a new Secondary Cache that is implemented on top of LRUCache.
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+ size_t capacity, int num_shard_bits = -1,
+ bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+ double low_pri_pool_ratio = 0.0,
+ std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy,
+ CompressionType compression_type = CompressionType::kLZ4Compression,
+ uint32_t compress_format_version = 2,
+ bool enable_custom_split_merge = false);
+
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+ const CompressedSecondaryCacheOptions& opts);
+
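+// Illustrative sketch (not part of the upstream header): attaching a
+// compressed secondary cache behind a primary LRU block cache, via the
+// LRUCacheOptions::secondary_cache field described above.
+//
+//   CompressedSecondaryCacheOptions sec_opts;
+//   sec_opts.capacity = 1 << 30;  // 1 GiB of compressed capacity
+//   LRUCacheOptions pri_opts;
+//   pri_opts.capacity = 256 << 20;
+//   pri_opts.secondary_cache = NewCompressedSecondaryCache(sec_opts);
+//   std::shared_ptr<Cache> cache = NewLRUCache(pri_opts);
+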
+// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
+// that offers much improved CPU efficiency vs. LRUCache under high parallel
+// load or high contention, with some caveats:
+// * Not a general Cache implementation: can only be used for
+// BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
+// compatible with HyperClockCache.
+// * Requires an extra tuning parameter: see estimated_entry_charge below.
+// Similarly, substantially changing the capacity with SetCapacity could
+// harm efficiency.
+// * SecondaryCache is not yet supported.
+// * Cache priorities are less aggressively enforced, which could cause
+// cache dilution from long range scans (unless they use fill_cache=false).
+// * Can be worse for small caches, because if almost all of a cache shard is
+// pinned (more likely with non-partitioned filters), then CLOCK eviction
+// becomes very CPU intensive.
+//
+// See internal cache/clock_cache.h for full description.
+struct HyperClockCacheOptions : public ShardedCacheOptions {
+ // The estimated average `charge` associated with cache entries. This is a
+ // critical configuration parameter for good performance from the hyper
+ // cache, because having a table size that is fixed at creation time greatly
+ // reduces the required synchronization between threads.
+ // * If the estimate is substantially too low (e.g. less than half the true
+ // average) then metadata space overhead will be substantially higher (e.g.
+ // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this
+ // can slightly reduce cache hit rates, and slightly reduce access times due
+ // to the larger working memory size.
+ // * If the estimate is substantially too high (e.g. 25% higher than the true
+ // average) then there might not be sufficient slots in the hash table for
+ // both efficient operation and capacity utilization (hit rate). The hyper
+ // cache will evict entries to prevent load factors that could dramatically
+ // affect lookup times, instead letting the hit rate suffer by not utilizing
+ // the full capacity.
+ //
+ // A reasonable choice is the larger of block_size and metadata_block_size.
+ // When WriteBufferManager (and similar) charge memory usage to the block
+ // cache, this can lead to the same effect as estimate being too low, which
+ // is better than the opposite. Therefore, the general recommendation is to
+ // assume that other memory charged to block cache could be negligible, and
+ // ignore it in making the estimate.
+ //
+ // The best parameter choice based on a cache in use is given by
+ // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as
+ // with kDontChargeCacheMetadata. More precisely, with
+ // kFullChargeCacheMetadata it is (GetUsage() - 64 * GetTableAddressCount())
+ // / GetOccupancyCount(). However, when the average value size might vary
+ // GetOccupancyCount(). However, when the average value size might vary
+ // (e.g. balance between metadata and data blocks in cache), it is better
+ // to estimate toward the lower side than the higher side.
+ size_t estimated_entry_charge;
+
+ HyperClockCacheOptions(
+ size_t _capacity, size_t _estimated_entry_charge,
+ int _num_shard_bits = -1, bool _strict_capacity_limit = false,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy)
+ : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+ std::move(_memory_allocator),
+ _metadata_charge_policy),
+ estimated_entry_charge(_estimated_entry_charge) {}
+
+ // Construct an instance of HyperClockCache using these options
+ std::shared_ptr<Cache> MakeSharedCache() const;
+};
+
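+// Illustrative sketch (not part of the upstream header): constructing a
+// HyperClockCache, with estimated_entry_charge sized from the table options'
+// block_size as recommended above (8 KiB is only a placeholder).
+//
+//   HyperClockCacheOptions hcc_opts(
+//       /*_capacity=*/1 << 30, /*_estimated_entry_charge=*/8 * 1024);
+//   std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+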
+// DEPRECATED - The old Clock Cache implementation had an unresolved bug and
+// has been removed. The new HyperClockCache requires an additional
+// configuration parameter that is not provided by this API. This function
+// simply returns a new LRUCache for functional compatibility.
+extern std::shared_ptr<Cache> NewClockCache(
+ size_t capacity, int num_shard_bits = -1,
+ bool strict_capacity_limit = false,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy);
+
+class Cache {
+ public: // opaque types
+ // Opaque handle to an entry stored in the cache.
+ struct Handle {};
+
+ public: // type defs
+ // Depending on implementation, cache entries with higher priority levels
+ // could be less likely to get evicted than entries with lower priority
+ // levels. The "high" priority level applies to certain SST metablocks (e.g.
+ // index and filter blocks) if the option
+ // cache_index_and_filter_blocks_with_high_priority is set. The "low" priority
+ // level is used for other kinds of SST blocks (most importantly, data
+ // blocks), as well as the above metablocks in case
+ // cache_index_and_filter_blocks_with_high_priority is
+ // not set. The "bottom" priority level is for BlobDB's blob values.
+ enum class Priority { HIGH, LOW, BOTTOM };
+
+ // A set of callbacks to allow objects in the primary block cache to be
+ // persisted in a secondary cache. The purpose of the secondary cache
+ // is to support other ways of caching the object, such as persistent or
+ // compressed data, that may require the object to be parsed and transformed
+ // in some way. Since the primary cache holds C++ objects and the secondary
+ // cache may only hold flat data that doesn't need relocation, these
+ // callbacks need to be provided by the user of the block
+ // cache to do the conversion.
+ // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers
+ // to callback functions for size, saving and deletion of the
+ // object. The callbacks are defined in C-style in order to make them
+ // stateless and not add to the cache metadata size.
+ // Saving multiple std::function objects will take up 32 bytes per
+ // function, even if it's not bound to an object and captures nothing.
+ //
+ // All the callbacks are C-style function pointers in order to simplify
+ // lifecycle management. Objects in the cache can outlive the parent DB,
+ // so anything required for these operations should be contained in the
+ // object itself.
+ //
+ // The SizeCallback takes a void* pointer to the object and returns the size
+ // of the persistable data. It can be used by the secondary cache to allocate
+ // memory if needed.
+ //
+ // RocksDB callbacks are NOT exception-safe. A callback completing with an
+ // exception can lead to undefined behavior in RocksDB, including data loss,
+ // unreported corruption, deadlocks, and more.
+ using SizeCallback = size_t (*)(void* obj);
+
+ // The SaveToCallback takes a void* object pointer and saves the persistable
+ // data into a buffer. The secondary cache may decide to not store it in a
+ // contiguous buffer, in which case this callback will be called multiple
+ // times with increasing offset.
+ using SaveToCallback = Status (*)(void* from_obj, size_t from_offset,
+ size_t length, void* out);
+
+ // A function pointer type for custom destruction of an entry's
+ // value. The Cache is responsible for copying and reclaiming space
+ // for the key, but values are managed by the caller.
+ using DeleterFn = void (*)(const Slice& key, void* value);
+
+ // A struct with pointers to helper functions for spilling items from the
+ // cache into the secondary cache. May be extended in the future. An
+ // instance of this struct is expected to outlive the cache.
+ struct CacheItemHelper {
+ SizeCallback size_cb;
+ SaveToCallback saveto_cb;
+ DeleterFn del_cb;
+
+ CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {}
+ CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb,
+ DeleterFn _del_cb)
+ : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {}
+ };
+
+ // The CreateCallback is passed by the block cache user to Lookup(). It
+ // takes in a buffer from the NVM cache and constructs an object using
+ // it. The callback doesn't have ownership of the buffer and should
+ // copy the contents into its own buffer.
+ using CreateCallback = std::function<Status(const void* buf, size_t size,
+ void** out_obj, size_t* charge)>;
+
+ public: // ctor/dtor/create
+ Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+ : memory_allocator_(std::move(allocator)) {}
+ // No copying allowed
+ Cache(const Cache&) = delete;
+ Cache& operator=(const Cache&) = delete;
+
+ // Destroys all remaining entries by calling the associated "deleter"
+ virtual ~Cache() {}
+
+ // Creates a new Cache based on the input value string and returns the
+ // result. Currently, this method can only be used to create LRUCaches.
+ // @param config_options
+ // @param value The value might be:
+ // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*1024)
+ // - name-value option pairs -- "capacity=1M; num_shard_bits=4;"
+ // For the LRUCache, the values are defined in LRUCacheOptions.
+ // @param result The new Cache object
+ // @return OK if the cache was successfully created
+ // @return NotFound if an invalid name was specified in the value
+ // @return InvalidArgument if the options were not valid
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<Cache>* result);
+
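+ // Illustrative sketch (not part of the upstream header): creating an
+ // LRUCache from an option string. ConfigOptions is declared in
+ // rocksdb/convenience.h.
+ //
+ //   ConfigOptions config_options;
+ //   std::shared_ptr<Cache> cache;
+ //   Status s = Cache::CreateFromString(
+ //       config_options, "capacity=1M;num_shard_bits=4", &cache);
+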
+ public: // functions
+ // The type of the Cache
+ virtual const char* Name() const = 0;
+
+ // EXPERIMENTAL SecondaryCache support:
+ // Some APIs here are experimental and might change in the future.
+ // The Insert and Lookup APIs below are intended to allow cached objects
+ // to be demoted/promoted between the primary block cache and a secondary
+ // cache. The secondary cache could be a non-volatile cache, and will
+ // likely store the object in a different representation. They rely on a
+ // per object CacheItemHelper to do the conversions.
+ // The secondary cache may persist across process and system restarts,
+ // and may even be moved between hosts. Therefore, the cache key must
+ // be repeatable across restarts/reboots, and globally unique if
+ // multiple DBs share the same cache and the set of DBs can change
+ // over time.
+
+ // Insert a mapping from key->value into the volatile cache only
+ // and assign it with the specified charge against the total cache capacity.
+ // If strict_capacity_limit is true and cache reaches its full capacity,
+ // return Status::MemoryLimit.
+ //
+ // If handle is not nullptr, returns a handle that corresponds to the
+ // mapping. The caller must call this->Release(handle) when the returned
+ // mapping is no longer needed. In case of error, the caller is responsible
+ // for cleaning up the value (i.e. calling "deleter").
+ //
+ // If handle is nullptr, it is as if Release is called immediately after
+ // insert. In case of error, the value will be cleaned up.
+ //
+ // When the inserted entry is no longer needed, the key and
+ // value will be passed to "deleter" which must delete the value.
+ // (The Cache is responsible for copying and reclaiming space for
+ // the key.)
+ virtual Status Insert(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter, Handle** handle = nullptr,
+ Priority priority = Priority::LOW) = 0;
+
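+ // Illustrative sketch (not part of the upstream header): a minimal
+ // insert/lookup/release round trip with a heap-allocated value.
+ //
+ //   static void DeleteString(const Slice& /*key*/, void* value) {
+ //     delete static_cast<std::string*>(value);
+ //   }
+ //   ...
+ //   auto* v = new std::string("payload");
+ //   Status s = cache->Insert("key1", v, v->size(), &DeleteString);
+ //   Cache::Handle* h = cache->Lookup("key1");
+ //   if (h != nullptr) {
+ //     auto* found = static_cast<std::string*>(cache->Value(h));
+ //     // ... use *found while the handle is held ...
+ //     cache->Release(h);
+ //   }
+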
+ // EXPERIMENTAL
+ // Insert a mapping from key->value into the cache and assign it
+ // the specified charge against the total cache capacity. If
+ // strict_capacity_limit is true and cache reaches its full capacity,
+ // return Status::MemoryLimit. `value` must be non-nullptr for this
+ // Insert() because Value() == nullptr is reserved for indicating failure
+ // with secondary-cache-compatible mappings.
+ //
+ // The helper argument is saved by the cache and will be used when the
+ // inserted object is evicted or promoted to the secondary cache. It,
+ // therefore, must outlive the cache.
+ //
+ // If handle is not nullptr, returns a handle that corresponds to the
+ // mapping. The caller must call this->Release(handle) when the returned
+ // mapping is no longer needed. In case of error, the caller is responsible
+ // for cleaning up the value (i.e. calling "deleter").
+ //
+ // If handle is nullptr, it is as if Release is called immediately after
+ // insert. In case of error, the value will be cleaned up.
+ //
+ // Regardless of whether the item was inserted into the primary cache,
+ // the cache will attempt to insert it into the secondary cache if one is
+ // configured and the helper supports it. If the cache implementation does
+ // not support a secondary cache, the item is only inserted into the
+ // primary cache. The implementation may defer the insertion to the
+ // secondary cache as it sees fit.
+ //
+ // When the inserted entry is no longer needed, the key and
+ // value will be passed to "deleter".
+ virtual Status Insert(const Slice& key, void* value,
+ const CacheItemHelper* helper, size_t charge,
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) {
+ if (!helper) {
+ return Status::InvalidArgument();
+ }
+ return Insert(key, value, charge, helper->del_cb, handle, priority);
+ }
+
+ // If the cache has no mapping for "key", returns nullptr.
+ //
+ // Else return a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ // If stats is not nullptr, relevant tickers may be updated inside the
+ // function.
+ virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0;
+
+ // EXPERIMENTAL
+ // Lookup the key in the primary and secondary caches (if one is configured).
+ // The create_cb callback function object will be used to construct the
+ // cached object.
+ // If none of the caches have the mapping for the key, returns nullptr.
+ // Else, returns a handle that corresponds to the mapping.
+ //
+ // This call may promote the object from the secondary cache (if one is
+ // configured, and has the given key) to the primary cache.
+ //
+ // The helper argument should be provided if the caller wants the lookup
+ // to include the secondary cache (if one is configured) and the object,
+ // if it exists, to be promoted to the primary cache. The helper may be
+ // saved and used later when the object is evicted. Therefore, it must
+ // outlive the cache.
+ //
+ // ======================== Async Lookup (wait=false) ======================
+ // When wait=false, the handle returned might be in any of three states:
+ // * Present - If Value() != nullptr, then the result is present and
+ // the handle can be used just as if wait=true.
+ // * Pending, not ready (IsReady() == false) - secondary cache is still
+ // working to retrieve the value. Might become ready any time.
+ // * Pending, ready (IsReady() == true) - secondary cache has the value
+ // but it has not been loaded into primary cache. Call to Wait()/WaitAll()
+ // will not block.
+ //
+ // IMPORTANT: Pending handles are not thread-safe, and only these functions
+ // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release()
+ // can only come after Wait() or WaitAll() even though a reference is held.
+ //
+ // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is
+ // safe and has no effect on other handle states.) After waiting on a Handle,
+ // it is in one of two states:
+ // * Present - if Value() != nullptr
+ // * Failed - if Value() == nullptr, such as if the secondary cache
+ // initially thought it had the value but actually did not.
+ //
+ // Note that given an arbitrary Handle, the only way to distinguish the
+ // Pending+ready state from the Failed state is to Wait() on it. A cache
+ // entry not compatible with secondary cache can also have Value()==nullptr
+ // like the Failed state, but this is not generally a concern.
+ virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/,
+ const CreateCallback& /*create_cb*/,
+ Priority /*priority*/, bool /*wait*/,
+ Statistics* stats = nullptr) {
+ return Lookup(key, stats);
+ }
+
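+ // Illustrative sketch (not part of the upstream header): an asynchronous
+ // lookup with wait=false, following the handle-state rules above. `helper`
+ // and `create_cb` are assumed to be provided by the caller.
+ //
+ //   Cache::Handle* h = cache->Lookup(key, helper, create_cb,
+ //                                    Cache::Priority::LOW, /*wait=*/false);
+ //   if (h != nullptr && cache->Value(h) == nullptr) {
+ //     cache->Wait(h);  // resolves Pending to Present or Failed
+ //   }
+ //   if (h != nullptr) {
+ //     if (cache->Value(h) != nullptr) { /* ... use the value ... */ }
+ //     cache->Release(h);  // only after Wait() for pending handles
+ //   }
+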
+ // Increments the reference count for the handle if it refers to an entry in
+ // the cache. Returns true if refcount was incremented; otherwise, returns
+ // false.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual bool Ref(Handle* handle) = 0;
+
+ // Release a mapping returned by a previous Lookup(). A released entry might
+ // still remain in cache in case it is later looked up by others. If
+ // erase_if_last_ref is set then it also erases it from the cache if there
+ // is no other reference to it. Erasing it should call the deleter function
+ // that was provided when the entry was inserted.
+ //
+ // Returns true if the entry was also erased.
+ //
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0;
+
+ // Return the value encapsulated in a handle returned by a
+ // successful Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void* Value(Handle* handle) = 0;
+
+ // If the cache contains the entry for the key, erase it. Note that the
+ // underlying entry will be kept around until all existing handles
+ // to it have been released.
+ virtual void Erase(const Slice& key) = 0;
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharding the same cache to partition the key space. Typically the
+ // client will allocate a new id at startup and prepend the id to
+ // its cache keys.
+ virtual uint64_t NewId() = 0;
+
+ // Sets the maximum configured capacity of the cache. When the new
+ // capacity is less than the old capacity and the existing usage is
+ // greater than the new capacity, the implementation will do its best to
+ // purge the released entries from the cache in order to lower the usage.
+ virtual void SetCapacity(size_t capacity) = 0;
+
+ // Set whether to return error on insertion when cache reaches its full
+ // capacity.
+ virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+
+ // Get the flag whether to return error on insertion when cache reaches its
+ // full capacity.
+ virtual bool HasStrictCapacityLimit() const = 0;
+
+ // Returns the maximum configured capacity of the cache
+ virtual size_t GetCapacity() const = 0;
+
+ // Returns the memory size for the entries residing in the cache.
+ virtual size_t GetUsage() const = 0;
+
+ // Returns the number of entries currently tracked in the table. SIZE_MAX
+ // means "not supported." This is used for inspecting the load factor, along
+ // with GetTableAddressCount().
+ virtual size_t GetOccupancyCount() const { return SIZE_MAX; }
+
+ // Returns the number of ways the hash function is divided for addressing
+ // entries. Zero means "not supported." This is used for inspecting the load
+ // factor, along with GetOccupancyCount().
+ virtual size_t GetTableAddressCount() const { return 0; }
+
+ // Returns the memory size for a specific entry in the cache.
+ virtual size_t GetUsage(Handle* handle) const = 0;
+
+ // Returns the memory size for the entries in use by the system
+ virtual size_t GetPinnedUsage() const = 0;
+
+ // Returns the charge for the specific entry in the cache.
+ virtual size_t GetCharge(Handle* handle) const = 0;
+
+ // Returns the deleter for the specified entry. This might seem useless
+ // as the Cache itself is responsible for calling the deleter, but
+ // the deleter can essentially verify that a cache entry is of an
+ // expected type from an expected code source.
+ virtual DeleterFn GetDeleter(Handle* handle) const = 0;
+
+ // Call this on shutdown if you want to speed it up. Cache will disown
+ // any underlying data and will not free it on delete. This call will leak
+ // memory - call this only if you're shutting down the process.
+ // Any attempt to use the cache after this call will fail terribly.
+ // Always delete the DB object before calling this method!
+ virtual void DisownData() {
+ // default implementation is noop
+ }
+
+ struct ApplyToAllEntriesOptions {
+ // If the Cache uses locks, setting `average_entries_per_lock` to
+ // a higher value suggests iterating over more entries each time a lock
+ // is acquired, likely reducing the time for ApplyToAllEntries but
+ // increasing latency for concurrent users of the Cache. Setting
+ // `average_entries_per_lock` to a smaller value could be helpful if
+ // callback is relatively expensive, such as using large data structures.
+ size_t average_entries_per_lock = 256;
+ };
+
+ // Apply a callback to all entries in the cache. The Cache must ensure
+ // thread safety but does not guarantee that a consistent snapshot of all
+ // entries is iterated over if other threads are operating on the Cache
+ // also.
+ virtual void ApplyToAllEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ const ApplyToAllEntriesOptions& opts) = 0;
+
+ // DEPRECATED version of above. (Default implementation uses above.)
+ virtual void ApplyToAllCacheEntries(void (*callback)(void* value,
+ size_t charge),
+ bool /*thread_safe*/) {
+ ApplyToAllEntries([callback](const Slice&, void* value, size_t charge,
+ DeleterFn) { callback(value, charge); },
+ {});
+ }
+
+ // Remove all entries.
+ // Prerequisite: no entry is referenced.
+ virtual void EraseUnRefEntries() = 0;
+
+ virtual std::string GetPrintableOptions() const { return ""; }
+
+ // Check for any warnings or errors in the operation of the cache and
+ // report them to the logger. This is intended only to be called
+ // periodically so does not need to be very efficient. (Obscure calling
+ // conventions for Logger inherited from env.h)
+ virtual void ReportProblems(
+ const std::shared_ptr<Logger>& /*info_log*/) const {}
+
+ MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }
+
+ // EXPERIMENTAL
+ // Release a mapping returned by a previous Lookup(). The "useful"
+ // parameter specifies whether the data was actually used or not,
+ // which may be used by the cache implementation to decide whether
+ // to consider it as a hit for retention purposes. As noted elsewhere,
+ // "pending" handles require Wait()/WaitAll() before Release().
+ virtual bool Release(Handle* handle, bool /*useful*/,
+ bool erase_if_last_ref) {
+ return Release(handle, erase_if_last_ref);
+ }
+
+ // EXPERIMENTAL
+ // Determines if the handle returned by Lookup() can give a value without
+ // blocking, though Wait()/WaitAll() might be required to publish it to
+ // Value(). See secondary cache compatible Lookup() above for details.
+ // This call is not thread safe on "pending" handles.
+ virtual bool IsReady(Handle* /*handle*/) { return true; }
+
+ // EXPERIMENTAL
+ // Convert a "pending" handle into a full thread-shareable handle by
+ // * If necessary, wait until secondary cache finishes loading the value.
+ // * Construct the value for primary cache and set it in the handle.
+ // Even after Wait() on a pending handle, the caller must check for
+ // Value() == nullptr in case of failure. This call is not thread-safe
+ // on pending handles. This call has no effect on non-pending handles.
+ // See secondary cache compatible Lookup() above for details.
+ virtual void Wait(Handle* /*handle*/) {}
+
+ // EXPERIMENTAL
+ // Wait for a vector of handles to become ready. As with Wait(), the user
+ // should check the Value() of each handle for nullptr. This call is not
+ // thread-safe on pending handles.
+ virtual void WaitAll(std::vector<Handle*>& /*handles*/) {}
+
+ private:
+ std::shared_ptr<MemoryAllocator> memory_allocator_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cache_bench_tool.h b/src/rocksdb/include/rocksdb/cache_bench_tool.h
new file mode 100644
index 000000000..413ce1593
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache_bench_tool.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+int cache_bench_tool(int argc, char** argv);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h
new file mode 100644
index 000000000..afc736673
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cleanable.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cleanable {
+ public:
+ Cleanable();
+ // No copy constructor and copy assignment allowed.
+ Cleanable(Cleanable&) = delete;
+ Cleanable& operator=(Cleanable&) = delete;
+
+ // Executes all the registered cleanups
+ ~Cleanable();
+
+ // Move constructor and move assignment is allowed.
+ Cleanable(Cleanable&&) noexcept;
+ Cleanable& operator=(Cleanable&&) noexcept;
+
+ // Clients are allowed to register function/arg1/arg2 triples that
+ // will be invoked when this object is destroyed.
+ //
+ // Note that this method is not virtual and therefore clients should
+ // not override it.
+ using CleanupFunction = void (*)(void* arg1, void* arg2);
+
+ // Add another Cleanup to the list
+ void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
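+
+ // A hedged usage sketch (DeleteBuffer and buf are illustrative, not part
+ // of this API):
+ //
+ //   static void DeleteBuffer(void* arg1, void* /*arg2*/) {
+ //     delete[] static_cast<char*>(arg1);
+ //   }
+ //   ...
+ //   char* buf = new char[1024];
+ //   cleanable.RegisterCleanup(DeleteBuffer, buf, nullptr);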
+
+ // Move the cleanups owned by this Cleanable to another Cleanable, adding to
+ // any existing cleanups it has
+ void DelegateCleanupsTo(Cleanable* other);
+
+ // Performs DoCleanup() and also resets the pointers for reuse
+ inline void Reset() {
+ DoCleanup();
+ cleanup_.function = nullptr;
+ cleanup_.next = nullptr;
+ }
+
+ inline bool HasCleanups() { return cleanup_.function != nullptr; }
+
+ protected:
+ struct Cleanup {
+ CleanupFunction function;
+ void* arg1;
+ void* arg2;
+ Cleanup* next;
+ };
+ Cleanup cleanup_;
+ // Adds c to the cleanup list; this Cleanable becomes the owner of c.
+ void RegisterCleanup(Cleanup* c);
+
+ private:
+ // Performs all the cleanups. It does not reset the pointers. Made
+ // private to prevent misuse.
+ inline void DoCleanup() {
+ if (cleanup_.function != nullptr) {
+ (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+ for (Cleanup* c = cleanup_.next; c != nullptr;) {
+ (*c->function)(c->arg1, c->arg2);
+ Cleanup* next = c->next;
+ delete c;
+ c = next;
+ }
+ }
+ }
+};
+
+// A copyable, reference-counted pointer to a simple Cleanable that only
+// performs registered cleanups after all copies are destroyed. This is like
+// shared_ptr<Cleanable> but works more efficiently when wrapping the pointer
+// in an outer Cleanable (see RegisterCopyWith() and MoveAsCleanupTo()).
+// WARNING: if you create a reference cycle, for example:
+// SharedCleanablePtr scp;
+// scp.Allocate();
+// scp.RegisterCopyWith(&*scp);
+// It will prevent cleanups from ever happening!
+class SharedCleanablePtr {
+ public:
+ // Empty/null pointer
+ SharedCleanablePtr() {}
+ // Copy and move constructors and assignment
+ SharedCleanablePtr(const SharedCleanablePtr& from);
+ SharedCleanablePtr(SharedCleanablePtr&& from) noexcept;
+ SharedCleanablePtr& operator=(const SharedCleanablePtr& from);
+ SharedCleanablePtr& operator=(SharedCleanablePtr&& from) noexcept;
+ // Destructor (decrement refcount if non-null)
+ ~SharedCleanablePtr();
+ // Create a new simple Cleanable and assign this pointer to it.
+ // (Reset()s first if necessary.)
+ void Allocate();
+ // Reset to empty/null (decrement refcount if previously non-null)
+ void Reset();
+ // Dereference to pointed-to Cleanable
+ Cleanable& operator*();
+ Cleanable* operator->();
+ // Get as raw pointer to Cleanable
+ Cleanable* get();
+
+ // Creates a (virtual) copy of this SharedCleanablePtr and registers its
+ // destruction with target, so that the cleanups registered with the
+ // Cleanable pointed to by this can only happen after the cleanups in the
+ // target Cleanable are run.
+ // No-op if this is empty (nullptr).
+ void RegisterCopyWith(Cleanable* target);
+
+ // Moves (virtually) this shared pointer to a new cleanup in the target.
+ // This is essentially a move-semantics version of RegisterCopyWith(), for
+ // performance optimization. No-op if this is empty (nullptr).
+ void MoveAsCleanupTo(Cleanable* target);
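+
+ // A hedged usage sketch (MyCleanup, arg, and the outer Cleanables are
+ // illustrative): the shared cleanups can only run after both outer
+ // Cleanables have run their own cleanups:
+ //
+ //   SharedCleanablePtr scp;
+ //   scp.Allocate();
+ //   scp->RegisterCleanup(MyCleanup, arg, nullptr);
+ //   scp.RegisterCopyWith(&outer1);
+ //   scp.MoveAsCleanupTo(&outer2);  // scp is left empty/null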
+
+ private:
+ struct Impl;
+ Impl* ptr_ = nullptr;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
new file mode 100644
index 000000000..9c6a9c30d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+
+// CompactionFilter allows an application to modify/delete a key-value during
+// table file creation.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionFilter : public Customizable {
+ public:
+ enum ValueType {
+ kValue,
+ kMergeOperand,
+ kBlobIndex, // used internally by BlobDB.
+ };
+
+ enum class Decision {
+ kKeep,
+ kRemove,
+ kChangeValue,
+ kRemoveAndSkipUntil,
+ kChangeBlobIndex, // used internally by BlobDB.
+ kIOError, // used internally by BlobDB.
+ kPurge, // used for keys that can only be SingleDelete'ed
+ kUndetermined,
+ };
+
+ enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError };
+
+ // Context information for a table file creation.
+ struct Context {
+ // Whether this table file is created as part of a compaction including all
+ // table files.
+ bool is_full_compaction;
+ // Whether this table file is created as part of a compaction requested by
+ // the client.
+ bool is_manual_compaction;
+ // The column family that will contain the created table file.
+ uint32_t column_family_id;
+ // Reason this table file is being created.
+ TableFileCreationReason reason;
+ };
+
+ virtual ~CompactionFilter() {}
+ static const char* Type() { return "CompactionFilter"; }
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& name,
+ const CompactionFilter** result);
+
+ // The table file creation process invokes this method before adding a kv to
+ // the table file. A return value of false indicates that the kv should be
+ // preserved in the new table file and a return value of true indicates
+ // that this key-value should be removed from the new table file. The
+ // application can inspect the existing value of the key and make a
+ // decision based on it.
+ //
+ // Key-Values that are results of merge operation during table file creation
+ // are not passed into this function. Currently, when you have a mix of
+ // Put()s and Merge()s on the same key, we only guarantee to process the
+ // merge operands through the `CompactionFilter`s. Put()s might be
+ // processed, or might not.
+ //
+ // When the value is to be preserved, the application has the option
+ // to modify the existing_value and pass it back through new_value.
+ // value_changed needs to be set to true in this case.
+ //
+ // Note that RocksDB snapshots (i.e. calling the GetSnapshot() API on a
+ // DB* object) do not guarantee to preserve the state of the DB with
+ // CompactionFilter. Data seen from a snapshot might disappear after a
+ // table file created with a `CompactionFilter` is installed. If you use
+ // snapshots, think twice about whether you want to use `CompactionFilter` and
+ // whether you are using it in a safe way.
+ //
+ // If multithreaded compaction is being used *and* a single CompactionFilter
+ // instance was supplied via Options::compaction_filter, this method may be
+ // called from different threads concurrently. The application must ensure
+ // that the call is thread-safe.
+ //
+ // If the CompactionFilter was created by a factory, then it will only ever
+ // be used by a single thread that is doing the table file creation, and this
+ // call does not need to be thread-safe. However, multiple filters may be
+ // in existence and operating concurrently.
+ virtual bool Filter(int /*level*/, const Slice& /*key*/,
+ const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const {
+ return false;
+ }
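+
+ // For illustration, a hedged sketch of a filter that removes every key
+ // with a given prefix (the "deleted_" prefix is purely illustrative):
+ //
+ //   class DropPrefixFilter : public CompactionFilter {
+ //    public:
+ //     bool Filter(int /*level*/, const Slice& key,
+ //                 const Slice& /*existing_value*/,
+ //                 std::string* /*new_value*/,
+ //                 bool* /*value_changed*/) const override {
+ //       return key.starts_with("deleted_");
+ //     }
+ //     const char* Name() const override { return "DropPrefixFilter"; }
+ //   };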
+
+ // The table file creation process invokes this method on every merge operand.
+ // If this method returns true, the merge operand will be ignored and not
+ // written out in the new table file.
+ //
+ // Note: If you are using a TransactionDB, it is not recommended to implement
+ // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB
+ // may not realize there is a write conflict and may allow a Transaction to
+ // Commit that should have failed. Instead, it is better to implement any
+ // Merge filtering inside the MergeOperator.
+ virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+ const Slice& /*operand*/) const {
+ return false;
+ }
+
+ // An extended API. Called for both values and merge operands.
+ // Allows changing value and skipping ranges of keys.
+ // The default implementation uses Filter() and FilterMergeOperand().
+ // If you're overriding this method, no need to override the other two.
+ // `value_type` indicates whether this key-value corresponds to a normal
+ // value (e.g. written with Put()) or a merge operand (written with Merge()).
+ //
+ // Possible return values:
+ // * kKeep - keep the key-value pair.
+ // * kRemove - remove the key-value pair or merge operand.
+ // * kChangeValue - keep the key and change the value/operand to *new_value.
+ // * kRemoveAndSkipUntil - remove this key-value pair, and also remove
+ // all key-value pairs with key in [key, *skip_until). This range
+ // of keys will be skipped without reading, potentially saving some
+ // IO operations compared to removing the keys one by one.
+ //
+ // *skip_until <= key is treated the same as Decision::kKeep
+ // (since the range [key, *skip_until) is empty).
+ //
+ // Caveats:
+ // - The keys are skipped even if there are snapshots containing them,
+ // i.e. values removed by kRemoveAndSkipUntil can disappear from a
+ // snapshot - beware if you're using TransactionDB or
+ // DB::GetSnapshot().
+ // - If value for a key was overwritten or merged into (multiple Put()s
+ // or Merge()s), and `CompactionFilter` skips this key with
+ // kRemoveAndSkipUntil, it's possible that it will remove only
+ // the new value, exposing the old value that was supposed to be
+ // overwritten.
+ // - Doesn't work with PlainTableFactory in prefix mode.
+ // - If you use kRemoveAndSkipUntil for table files created by
+ // compaction, consider also reducing compaction_readahead_size
+ // option.
+ //
+ // Should never return kUndetermined.
+ // Note: If you are using a TransactionDB, it is not recommended to filter
+ // out or modify merge operands (ValueType::kMergeOperand).
+ // If a merge operation is filtered out, TransactionDB may not realize there
+ // is a write conflict and may allow a Transaction to Commit that should have
+ // failed. Instead, it is better to implement any Merge filtering inside the
+ // MergeOperator.
+ // key includes timestamp if user-defined timestamp is enabled.
+ virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ switch (value_type) {
+ case ValueType::kValue: {
+ bool value_changed = false;
+ bool rv = Filter(level, key, existing_value, new_value, &value_changed);
+ if (rv) {
+ return Decision::kRemove;
+ }
+ return value_changed ? Decision::kChangeValue : Decision::kKeep;
+ }
+ case ValueType::kMergeOperand: {
+ bool rv = FilterMergeOperand(level, key, existing_value);
+ return rv ? Decision::kRemove : Decision::kKeep;
+ }
+ case ValueType::kBlobIndex:
+ return Decision::kKeep;
+ }
+ assert(false);
+ return Decision::kKeep;
+ }
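+
+ // For illustration, a hedged sketch of using kRemoveAndSkipUntil to drop
+ // an entire key range in one decision (the "tmp_" prefix is illustrative
+ // and bytewise key ordering is assumed):
+ //
+ //   Decision FilterV2(int /*level*/, const Slice& key,
+ //                     ValueType /*value_type*/,
+ //                     const Slice& /*existing_value*/,
+ //                     std::string* /*new_value*/,
+ //                     std::string* skip_until) const override {
+ //     if (key.starts_with("tmp_")) {
+ //       *skip_until = "tmp`";  // '`' is the byte after '_', so this
+ //                              // bound covers all keys prefixed "tmp_"
+ //       return Decision::kRemoveAndSkipUntil;
+ //     }
+ //     return Decision::kKeep;
+ //   }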
+
+ // Internal (BlobDB) use only. Do not override in application code.
+ virtual BlobDecision PrepareBlobOutput(const Slice& /* key */,
+ const Slice& /* existing_value */,
+ std::string* /* new_value */) const {
+ return BlobDecision::kKeep;
+ }
+
+ // This function is deprecated. Snapshots will always be ignored for
+ // `CompactionFilter`s, because we realized that not ignoring snapshots
+ // doesn't provide the guarantee we initially thought it would provide.
+ // Repeatable reads will not be guaranteed anyway. If you override this
+ // function and return false, we will fail the table file creation.
+ virtual bool IgnoreSnapshots() const { return true; }
+
+ // Returns a name that identifies this `CompactionFilter`.
+ // The name will be printed to the LOG file on startup for diagnosis.
+ const char* Name() const override = 0;
+
+ // Internal (BlobDB) use only. Do not override in application code.
+ virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; }
+
+ // In the case of BlobDB, it may be possible to reach a decision with only
+ // the key without reading the actual value. Keys whose value_type is
+ // kBlobIndex will be checked by this method.
+ // Returning kUndetermined will cause FilterV2() to be called to make a
+ // decision as usual.
+ virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/,
+ std::string* /*new_value*/,
+ std::string* /*skip_until*/) const {
+ return Decision::kUndetermined;
+ }
+};
+
+// Each thread of work involving creating table files will create a new
+// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This
+// allows the application to know about the different ongoing threads of work
+// and makes it unnecessary for `CompactionFilter` to provide thread-safety.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionFilterFactory : public Customizable {
+ public:
+ virtual ~CompactionFilterFactory() {}
+ static const char* Type() { return "CompactionFilterFactory"; }
+ static Status CreateFromString(
+ const ConfigOptions& config_options, const std::string& name,
+ std::shared_ptr<CompactionFilterFactory>* result);
+
+ // Returns whether a thread creating table files for the specified `reason`
+ // should invoke `CreateCompactionFilter()` and pass KVs through the returned
+ // filter.
+ virtual bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const {
+ // For backward compatibility, default implementation only applies
+ // `CompactionFilter` to files generated by compaction.
+ return reason == TableFileCreationReason::kCompaction;
+ }
+
+ virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) = 0;
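+
+ // For illustration, a hedged factory sketch returning the
+ // DropPrefixFilter sketched earlier in this file:
+ //
+ //   class DropPrefixFilterFactory : public CompactionFilterFactory {
+ //    public:
+ //     std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ //         const CompactionFilter::Context& /*context*/) override {
+ //       return std::unique_ptr<CompactionFilter>(new DropPrefixFilter);
+ //     }
+ //     const char* Name() const override {
+ //       return "DropPrefixFilterFactory";
+ //     }
+ //   };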
+
+ // Returns a name that identifies this `CompactionFilter` factory.
+ virtual const char* Name() const override = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h
new file mode 100644
index 000000000..5ff8eccc8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct CompactionJobStats {
+ CompactionJobStats() { Reset(); }
+ void Reset();
+ // Aggregate the CompactionJobStats from another instance with this one
+ void Add(const CompactionJobStats& stats);
+
+ // the elapsed time of this compaction in microseconds.
+ uint64_t elapsed_micros;
+
+ // the elapsed CPU time of this compaction in microseconds.
+ uint64_t cpu_micros;
+
+ // the number of compaction input records.
+ uint64_t num_input_records;
+ // the number of blobs read from blob files
+ uint64_t num_blobs_read;
+ // the number of compaction input files (table files)
+ size_t num_input_files;
+ // the number of compaction input files at the output level (table files)
+ size_t num_input_files_at_output_level;
+
+ // the number of compaction output records.
+ uint64_t num_output_records;
+ // the number of compaction output files (table files)
+ size_t num_output_files;
+ // the number of compaction output files (blob files)
+ size_t num_output_files_blob;
+
+ // true if the compaction is a full compaction (all live SST files input)
+ bool is_full_compaction;
+ // true if the compaction is a manual compaction
+ bool is_manual_compaction;
+
+ // the total size of table files in the compaction input
+ uint64_t total_input_bytes;
+ // the total size of blobs read from blob files
+ uint64_t total_blob_bytes_read;
+ // the total size of table files in the compaction output
+ uint64_t total_output_bytes;
+ // the total size of blob files in the compaction output
+ uint64_t total_output_bytes_blob;
+
+ // number of records replaced by a newer record associated with the same
+ // key. This could be a new value or a deletion entry for that key, so this
+ // field sums up all updated and deleted keys
+ uint64_t num_records_replaced;
+
+ // the sum of the uncompressed input keys in bytes.
+ uint64_t total_input_raw_key_bytes;
+ // the sum of the uncompressed input values in bytes.
+ uint64_t total_input_raw_value_bytes;
+
+ // the number of deletion entries before compaction. Deletion entries
+ // can disappear after compaction because they expired
+ uint64_t num_input_deletion_records;
+ // number of deletion records that were found obsolete and discarded
+ // because it is not possible to delete any more keys with this entry
+ // (i.e. all possible deletions resulting from it have been completed)
+ uint64_t num_expired_deletion_records;
+
+ // number of corrupt keys (ParseInternalKey returned false when applied to
+ // the key) encountered and written out.
+ uint64_t num_corrupt_keys;
+
+ // Following counters are only populated if
+ // options.report_bg_io_stats = true;
+
+ // Time spent on file's Append() call.
+ uint64_t file_write_nanos;
+
+ // Time spent on sync file range.
+ uint64_t file_range_sync_nanos;
+
+ // Time spent on file fsync.
+ uint64_t file_fsync_nanos;
+
+ // Time spent on preparing file write (fallocate, etc)
+ uint64_t file_prepare_write_nanos;
+
+ // 0-terminated strings storing the first 8 bytes of the smallest and
+ // largest key in the output.
+ static const size_t kMaxPrefixLength = 8;
+
+ std::string smallest_output_key_prefix;
+ std::string largest_output_key_prefix;
+
+ // number of single-deletes which do not meet a put
+ uint64_t num_single_del_fallthru;
+
+ // number of single-deletes which meet something other than a put
+ uint64_t num_single_del_mismatch;
+
+ // TODO: Add output_to_penultimate_level output information
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
new file mode 100644
index 000000000..ad1e71a11
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// The general interface for comparing two Slices, used by both Comparator
+// and some internal data structures.
+class CompareInterface {
+ public:
+ virtual ~CompareInterface() {}
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "a" < "b",
+ // == 0 iff "a" == "b",
+ // > 0 iff "a" > "b"
+ // Note that Compare(a, b) also compares timestamp if timestamp size is
+ // non-zero. For the same user key with different timestamps, larger (newer)
+ // timestamp comes first.
+ virtual int Compare(const Slice& a, const Slice& b) const = 0;
+};
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database. A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Comparator : public Customizable, public CompareInterface {
+ public:
+ Comparator() : timestamp_size_(0) {}
+
+ Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {}
+
+ Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {}
+
+ Comparator& operator=(const Comparator& rhs) {
+ if (this != &rhs) {
+ timestamp_size_ = rhs.timestamp_size_;
+ }
+ return *this;
+ }
+
+ ~Comparator() override {}
+
+ static Status CreateFromString(const ConfigOptions& opts,
+ const std::string& id,
+ const Comparator** comp);
+ static const char* Type() { return "Comparator"; }
+
+ // The name of the comparator. Used to check for comparator
+ // mismatches (i.e., a DB created with one comparator is
+ // accessed using a different comparator).
+ //
+ // The client of this package should switch to a new name whenever
+ // the comparator implementation changes in a way that will cause
+ // the relative ordering of any two keys to change.
+ //
+ // Names starting with "rocksdb." are reserved and should not be used
+ // by any clients of this package.
+ const char* Name() const override = 0;
+
+ // Compares two slices for equality. The following invariant should always
+ // hold (and is the default implementation):
+ // Equal(a, b) iff Compare(a, b) == 0
+ // Override this only if equality comparisons can be done more efficiently
+ // than three-way comparisons.
+ virtual bool Equal(const Slice& a, const Slice& b) const {
+ return Compare(a, b) == 0;
+ }
+
+ // Advanced functions: these are used to reduce the space requirements
+ // for internal data structures like index blocks.
+
+ // If *start < limit, changes *start to a short string in [start,limit).
+ // Simple comparator implementations may return with *start unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
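+ //
+ // For illustration (a hedged example assuming bytewise ordering): with
+ // *start == "helloworld" and limit == "hellozoomer", an implementation
+ // may shorten *start to "hellox", since
+ // "helloworld" <= "hellox" < "hellozoomer".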
+ virtual void FindShortestSeparator(std::string* start,
+ const Slice& limit) const = 0;
+
+ // Changes *key to a short string >= *key.
+ // Simple comparator implementations may return with *key unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortSuccessor(std::string* key) const = 0;
+
+ // Given two keys, determine if t is the successor of s.
+ // BUG: only return true if no other keys starting with `t` are ordered
+ // before `t`. Otherwise, the auto_prefix_mode can omit entries within
+ // iterator bounds that have same prefix as upper bound but different
+ // prefix from seek key.
+ virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
+ const Slice& /*t*/) const {
+ return false;
+ }
+
+ // return true if two keys with different byte sequences can be regarded
+ // as equal by this comparator.
+ // The major use case is to determine if DataBlockHashIndex is compatible
+ // with the customized comparator.
+ virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+
+ // If it is a wrapped comparator, may return the root one.
+ // Returns itself if it is not wrapped.
+ virtual const Comparator* GetRootComparator() const { return this; }
+
+ inline size_t timestamp_size() const { return timestamp_size_; }
+
+ int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+ return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true);
+ }
+
+ // For two events e1 and e2 whose timestamps are t1 and t2 respectively,
+ // returns:
+ // < 0 iff t1 < t2
+ // == 0 iff t1 == t2
+ // > 0 iff t1 > t2
+ // Note that an all-zero byte array will be the smallest (oldest) timestamp
+ // of the same length, and a byte array with all bits 1 will be the largest.
+ // In the future, we can extend Comparator so that subclasses can specify
+ // both largest and smallest timestamps.
+ virtual int CompareTimestamp(const Slice& /*ts1*/,
+ const Slice& /*ts2*/) const {
+ return 0;
+ }
+
+ virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/,
+ const Slice& b, bool /*b_has_ts*/) const {
+ return Compare(a, b);
+ }
+
+ virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const {
+ return 0 ==
+ CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true);
+ }
+
+ private:
+ size_t timestamp_size_;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering. The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+// Return a builtin comparator that uses reverse lexicographic byte-wise
+// ordering.
+extern const Comparator* ReverseBytewiseComparator();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compression_type.h b/src/rocksdb/include/rocksdb/compression_type.h
new file mode 100644
index 000000000..bfeb00bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compression_type.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs. Each block may be compressed before
+// being stored in a file. The following enum describes which
+// compression method (if any) is used to compress a block.
+
+enum CompressionType : unsigned char {
+ // NOTE: do not change the values of existing entries, as these are
+ // part of the persistent format on disk.
+ kNoCompression = 0x0,
+ kSnappyCompression = 0x1,
+ kZlibCompression = 0x2,
+ kBZip2Compression = 0x3,
+ kLZ4Compression = 0x4,
+ kLZ4HCCompression = 0x5,
+ kXpressCompression = 0x6,
+ kZSTD = 0x7,
+
+ // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than
+ // 0.8.0 or consider a possibility of downgrading the service or copying
+ // the database files to another service running with an older version of
+ // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will
+ // eventually remove the option from the public API.
+ kZSTDNotFinalCompression = 0x40,
+
+ // kDisableCompressionOption is used to disable some compression options.
+ kDisableCompressionOption = 0xff,
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
new file mode 100644
index 000000000..9ad741f98
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is NOT an extensible interface but a public interface for the result
+// of NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal.
+class ConcurrentTaskLimiter {
+ public:
+ virtual ~ConcurrentTaskLimiter() {}
+
+ // Returns a name that identifies this concurrent task limiter.
+ virtual const std::string& GetName() const = 0;
+
+ // Set max concurrent tasks.
+ // limit = 0 means no new task allowed.
+ // limit < 0 means no limitation.
+ virtual void SetMaxOutstandingTask(int32_t limit) = 0;
+
+ // Reset to unlimited max concurrent tasks.
+ virtual void ResetMaxOutstandingTask() = 0;
+
+ // Returns current outstanding task count.
+ virtual int32_t GetOutstandingTask() const = 0;
+};
+
+// Create a ConcurrentTaskLimiter that can be shared with multiple CFs
+// across RocksDB instances to control concurrent tasks.
+//
+// @param name: Name of the limiter.
+// @param limit: max concurrent tasks.
+// limit = 0 means no new task allowed.
+// limit < 0 means no limitation.
+extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+ int32_t limit);
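+
+// A hedged usage sketch (assumes the compaction_thread_limiter field of
+// ColumnFamilyOptions; the name and limit below are illustrative):
+//
+//   std::shared_ptr<ConcurrentTaskLimiter> limiter(
+//       NewConcurrentTaskLimiter("per-host-compactions", 4));
+//   ColumnFamilyOptions cf_opts;
+//   cf_opts.compaction_thread_limiter = limiter;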
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/configurable.h b/src/rocksdb/include/rocksdb/configurable.h
new file mode 100644
index 000000000..60ae89f97
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/configurable.h
@@ -0,0 +1,400 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+class ObjectRegistry;
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+
+// Configurable is a base class used by RocksDB that describes a
+// standard way of configuring objects. A Configurable object can:
+// -> Populate itself given:
+// - One or more "name/value" pair strings
+// - A string representing the set of name=value properties
+// - A map of name/value properties.
+// -> Convert itself into its string representation
+// -> Dump itself to a Logger
+// -> Compare itself to another Configurable object to see if the two objects
+// have equivalent options settings
+//
+// If a derived class calls RegisterOptions to register (by name) how its
+// options objects are to be processed, this functionality can typically be
+// handled by this class without additional overrides. Otherwise, the derived
+// class will need to implement the methods for handling the corresponding
+// functionality.
+class Configurable {
+ protected:
+ friend class ConfigurableHelper;
+ struct RegisteredOptions {
+ // The name of the options being registered
+ std::string name;
+ // Pointer to the object being registered
+ void* opt_ptr;
+#ifndef ROCKSDB_LITE
+ // The map of options being registered
+ const std::unordered_map<std::string, OptionTypeInfo>* type_map;
+#endif
+ };
+
+ public:
+ virtual ~Configurable() {}
+
+ // Returns the raw pointer of the named options that is used by this
+ // object, or nullptr if this function is not supported.
+ // Since the return value is a raw pointer, the object owns the
+ // pointer and the caller should not delete the pointer.
+ //
+ // Note that changing the underlying options while the object
+ // is currently used by any open DB is undefined behavior.
+ // Developers should use DB::SetOption() instead to dynamically change
+ // options while the DB is open.
+ template <typename T>
+ const T* GetOptions() const {
+ return GetOptions<T>(T::kName());
+ }
+ template <typename T>
+ T* GetOptions() {
+ return GetOptions<T>(T::kName());
+ }
+ template <typename T>
+ const T* GetOptions(const std::string& name) const {
+ return reinterpret_cast<const T*>(GetOptionsPtr(name));
+ }
+ template <typename T>
+ T* GetOptions(const std::string& name) {
+ return reinterpret_cast<T*>(const_cast<void*>(GetOptionsPtr(name)));
+ }
+
+ // Configures the options for this class based on the input parameters.
+ // On successful completion, the object is updated with the settings from
+ // the opt_map.
+ // If this method fails, an attempt is made to revert the object to original
+ // state. Note that the revert may not be the original state but may be an
+ // equivalent. For example, if the object contains an option that is a
+ // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not
+ // the Cache object that was passed in, but a Cache object of the same size).
+ //
+ // The acceptable values of the name/value pairs are documented with the
+ // specific class/instance.
+ //
+ // @param config_options Controls how the arguments are processed.
+ // @param opt_map Name/value pairs of the options to update
+ // @param unused If specified, this value will return the name/value
+ // pairs from opt_map that were NotFound for this object.
+ // @return OK If all values in the map were successfully updated
+ // If invoke_prepare_options is true, OK also implies
+ // PrepareOptions ran successfully.
+ // @return NotFound If any of the names in the opt_map were not valid
+ // for this object. If unused is specified, it will contain the
+ // collection of NotFound names.
+ // @return NotSupported If any of the names are valid but the object does
+ // not know how to convert the value. This can happen if, for example,
+ // there is some nested Configurable that cannot be created.
+ // @return InvalidArgument If any of the values cannot be successfully
+ // parsed. This can also be returned if PrepareOptions encounters an
+ // error.
+ // @see ConfigOptions for a description of the controls.
+ Status ConfigureFromMap(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, std::string>& opt_map);
+ Status ConfigureFromMap(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, std::string>& opt_map,
+ std::unordered_map<std::string, std::string>* unused);
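+
+ // A hedged usage sketch (the option names below are illustrative; the
+ // accepted names depend on the concrete Configurable object):
+ //
+ //   ConfigOptions config_options;
+ //   std::unordered_map<std::string, std::string> opt_map = {
+ //       {"block_size", "8192"}, {"no_such_option", "x"}};
+ //   std::unordered_map<std::string, std::string> unused;
+ //   Status s = object->ConfigureFromMap(config_options, opt_map, &unused);
+ //   // On NotFound, "unused" would contain {"no_such_option", "x"}.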
+
+#ifndef ROCKSDB_LITE
+ // Updates the named option to the input value, returning OK if successful.
+ // Note that ConfigureOption does not cause PrepareOptions to be invoked.
+ // @param config_options Controls how the name/value is processed.
+ // @param name The name of the option to update
+ // @param value The value to set for the named option
+ // @return OK If the named field was successfully updated to value.
+ // @return NotFound If the name is not valid for this object.
+ // @return NotSupported If the name is valid but the object does
+ // not know how to convert the value. This can happen if, for example,
+ // there is some nested Configurable that cannot be created.
+ // @return InvalidArgument If the value cannot be successfully parsed.
+ Status ConfigureOption(const ConfigOptions& config_options,
+ const std::string& name, const std::string& value);
+#endif // ROCKSDB_LITE
+
+ // Configures the options for this class based on the input parameters.
+ // On successful completion, the object is updated with the settings from
+ // the opt_map. If this method fails, an attempt is made to revert the
+ // object to original state. Note that the revert may not be the original
+ // state but may be an equivalent.
+ // @see ConfigureFromMap for more details
+ // @param config_options Controls how the arguments are processed.
+ // @param opt_str string containing the values to update.
+ // @param unused If specified, this value will return the name/value
+ // pairs from opt_map that were NotFound for this object.
+ // @return OK If all specified values were successfully updated
+ // If invoke_prepare_options is true, OK also implies
+ // PrepareOptions ran successfully.
+ // @return NotFound If any of the names were not valid for this object.
+ // If unused is specified, it will contain the collection of NotFound
+ // names.
+ // @return NotSupported If any of the names are valid but the object does
+ // not know how to convert the value. This can happen if, for example,
+ // there is some nested Configurable that cannot be created.
+ // @return InvalidArgument If any of the values cannot be successfully
+ // parsed. This can also be returned if PrepareOptions encounters an
+ // error.
+ Status ConfigureFromString(const ConfigOptions& config_options,
+ const std::string& opts);
+
+ // Fills in result with the serialized options for this object.
+ // This is the inverse of ConfigureFromString.
+ // @param config_options Controls how serialization happens.
+ // @param result The string representation of this object.
+ // @return OK If the options for this object were successfully serialized.
+ // @return InvalidArgument If one or more of the options could not be
+ // serialized.
+ Status GetOptionString(const ConfigOptions& config_options,
+ std::string* result) const;
+#ifndef ROCKSDB_LITE
+ // Returns the serialized options for this object.
+ // This method is similar to GetOptionString, but does not report errors.
+ // @param config_options Controls how serialization happens.
+ // @param prefix A string to prepend to every option.
+ // @return The serialized representation of the options for this object
+ std::string ToString(const ConfigOptions& config_options) const {
+ return ToString(config_options, "");
+ }
+ std::string ToString(const ConfigOptions& config_options,
+ const std::string& prefix) const;
+
+ // Returns the list of option names associated with this configurable
+ // @param config_options Controls how the names are returned
+ // @param result The set of option names for this object. Note that
+ // options that are deprecated or aliases are not returned.
+ // @return OK on success.
+ Status GetOptionNames(const ConfigOptions& config_options,
+ std::unordered_set<std::string>* result) const;
+
+ // Returns the value of the option associated with the input name
+ // This method is the functional inverse of ConfigureOption
+ // @param config_options Controls how the value is returned
+ // @param name The name of the option to return a value for.
+ // @param value The returned value associated with the named option.
+ // @return OK If the named field was successfully updated to value.
+ // @return NotFound If the name is not valid for this object.
+ // @return InvalidArgument If the name is valid for this object but
+ // its value cannot be serialized.
+ virtual Status GetOption(const ConfigOptions& config_options,
+ const std::string& name, std::string* value) const;
+#endif // ROCKSDB_LITE
+
+ // Checks to see if this Configurable is equivalent to other.
+ // This method assumes that the two objects are of the same class.
+ // @param config_options Controls how the options are compared.
+ // @param other The other object to compare to.
+ // @param name If the objects do not match, this parameter contains
+ // the name of the option that triggered the match failure.
+ // @return True if the objects match, false otherwise.
+ virtual bool AreEquivalent(const ConfigOptions& config_options,
+ const Configurable* other,
+ std::string* name) const;
+
+ // Returns a pretty-printed, human-readable version of the options.
+ // This method is typically used to dump the options to a log file.
+ // Classes should override this method
+ virtual std::string GetPrintableOptions() const { return ""; }
+
+ // Validates that the settings are valid/consistent and performs any object
+ // initialization required by this object. This method may be called as part
+ // of Configure (if invoke_prepare_options is set), or may be invoked
+ // separately.
+ //
+ // Once an object has been prepared, non-mutable options can no longer be
+ // updated.
+ //
+ // Classes must override this method to provide any implementation-specific
+ // initialization, such as opening log files or setting up cache parameters.
+ // Implementations should be idempotent (e.g. don't re-open the log file or
+ // reconfigure the cache), as there is the potential this method can be called
+ // more than once.
+ //
+ // By default, this method will also prepare all nested (Inner and
+ // OptionType::kConfigurable) objects.
+ //
+ // @param config_options Controls how the object is prepared. Also contains
+ // a Logger and Env that can be used to initialize this object.
+ // @return OK If the object was successfully initialized.
+ // @return InvalidArgument If this object could not be successfully
+ // initialized.
+ virtual Status PrepareOptions(const ConfigOptions& config_options);
+
+ // Checks to see if the settings are valid for this object.
+ // This method checks to see if the input DBOptions and ColumnFamilyOptions
+ // are valid for the settings of this object. For example, an Env might not
+ // support certain mmap modes or a TableFactory might require certain
+ // settings.
+ //
+ // By default, this method will also validate all nested (Inner and
+ // OptionType::kConfigurable) objects.
+ //
+ // @param db_opts The DBOptions to validate
+ // @param cf_opts The ColumnFamilyOptions to validate
+ // @return OK if the options are valid
+ // @return InvalidArgument If the arguments are not valid for the options
+ // of the current object.
+ virtual Status ValidateOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const;
+
+ // Splits the input opt_value into the ID field and the remaining options.
+ // The input opt_value can be in the form of "name" or "name=value
+ // [;name=value]". The first form uses the "name" as an id with no options The
+ // latter form converts the input into a map of name=value pairs and sets "id"
+ // to the "id" value from the map.
+ // @param opt_value The value to split into id and options
+ // @param id The id field from the opt_value
+ // @param options The remaining name/value pairs from the opt_value
+ // @param default_id If specified and there is no id field in the map, this
+ // value is returned as the ID
+ // @return OK if the value was converted to a map successfully and an ID was
+ // found.
+ // @return InvalidArgument if the value could not be converted to a map
+ // or there is no id property in the map.
+ static Status GetOptionsMap(
+ const std::string& opt_value, const std::string& default_id,
+ std::string* id, std::unordered_map<std::string, std::string>* options);
+
+ protected:
+ // Returns the raw pointer for the associated named option.
+ // The name is typically the name of an option registered via
+ // RegisterOptions. Classes may override this method to provide further
+ // specialization (such as returning a sub-option).
+ //
+ // The default implementation looks at the registered options. If the
+ // input name matches that of a registered option, the pointer registered
+ // with that name is returned.
+ // e.g., RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns
+ // "my_ptr"
+ virtual const void* GetOptionsPtr(const std::string& name) const;
+
+ // Method for allowing options to be configured outside of the normal
+ // registered options framework. Classes may override this method if they
+ // wish to support non-standard options implementations (such as
+ // configuring themselves from constant or simple ":"-separated strings).
+ //
+ // The default implementation does nothing and returns OK
+ virtual Status ParseStringOptions(const ConfigOptions& config_options,
+ const std::string& opts_str);
+
+ // Internal method to configure an object from a map of name-value options.
+ // This method uses the input config_options to drive the configuration of
+ // the options in opt_map. Any option name that cannot be found from the
+ // input set will be returned in "unused".
+ //
+ // Classes may override this method to extend the functionality if required.
+ // @param config_options Controls how the options are configured and errors
+ // handled.
+ // @param opts_map The set of options to configure
+ // @param unused Any options from opt_map that were not configured.
+ // @returns a Status based on the rules outlined in ConfigureFromMap
+ virtual Status ConfigureOptions(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ std::unordered_map<std::string, std::string>* unused);
+
+#ifndef ROCKSDB_LITE
+ // Method that configures the specific opt_name from opt_value.
+ // By default, this method calls opt_info.ParseOption with the
+ // input parameters.
+ // Classes may override this method to extend the functionality, or
+ // change the returned Status.
+ virtual Status ParseOption(const ConfigOptions& config_options,
+ const OptionTypeInfo& opt_info,
+ const std::string& opt_name,
+ const std::string& opt_value, void* opt_ptr);
+
+ // Internal method to see if the single option name/info matches for this
+ // and that. Classes may override this method to change its behavior.
+ // @param config_options Controls how the options are being matched
+ // @param opt_info The OptionTypeInfo registered for this option name
+ // that controls what field is matched (offset) and how (type).
+ // @param name The name associated with this opt_info.
+ // @param this_ptr The base pointer to compare to. This is the object
+ // registered for this OptionTypeInfo.
+ // @param that_ptr The other pointer to compare to. This is the object
+ // registered for this OptionTypeInfo.
+ // @param bad_name If the match fails, the name of the option that failed to
+ // match.
+ virtual bool OptionsAreEqual(const ConfigOptions& config_options,
+ const OptionTypeInfo& opt_info,
+ const std::string& name,
+ const void* const this_ptr,
+ const void* const that_ptr,
+ std::string* bad_name) const;
+#endif
+#ifndef ROCKSDB_LITE
+ // Internal method to serialize options (ToString)
+ // Classes may override this method to change its behavior.
+ virtual std::string SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const;
+#endif // ROCKSDB_LITE
+
+ // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+ virtual std::string GetOptionName(const std::string& long_name) const;
+
+ // Registers the input name with the options and associated map.
+ // When classes register their options in this manner, most of the
+ // functionality (excluding unknown options and validate/prepare) is
+ // implemented by the base class.
+ //
+ // This method should be called in the class constructor to register the
+ // option set for this object. For example, to register the options
+ // associated with the BlockBasedTableFactory, the constructor calls this
+ // method passing in:
+ // - the name of the options ("BlockBasedTableOptions");
+ // - the options object (the BlockBasedTableOptions object for this object;
+ // - the options type map for the BlockBasedTableOptions.
+ // This registration allows the Configurable class to process the option
+ // values associated with the BlockBasedTableOptions without further code in
+ // the derived class.
+ //
+ // @param name The name of this set of options (@see GetOptionsPtr)
+ // @param opt_ptr Pointer to the options to associate with this name
+ // @param opt_map Options map that controls how this option is configured.
+ template <typename T>
+ void RegisterOptions(
+ T* opt_ptr,
+ const std::unordered_map<std::string, OptionTypeInfo>* opt_map) {
+ RegisterOptions(T::kName(), opt_ptr, opt_map);
+ }
+ void RegisterOptions(
+ const std::string& name, void* opt_ptr,
+ const std::unordered_map<std::string, OptionTypeInfo>* opt_map);
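+
+ // For illustration, a hedged sketch of a constructor registering its
+ // options (MyConfigurable, MyOptions, and my_options_type_map are
+ // hypothetical):
+ //
+ //   MyConfigurable::MyConfigurable() {
+ //     RegisterOptions("MyOptions", &my_options_, &my_options_type_map);
+ //   }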
+
+ // Returns true if there are registered options for this Configurable object
+ inline bool HasRegisteredOptions() const { return !options_.empty(); }
+
+ private:
+ // Contains the collection of options (name, opt_ptr, opt_map) associated
+ // with this object. This collection is typically set in the constructor
+ // of the Configurable object via RegisterOptions().
+ std::vector<RegisteredOptions> options_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h
new file mode 100644
index 000000000..921ec221b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/convenience.h
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Env;
+class Logger;
+class ObjectRegistry;
+
+struct ColumnFamilyOptions;
+struct DBOptions;
+struct Options;
+
+// ConfigOptions containing the parameters/controls for
+// comparing objects and converting to/from strings.
+// These settings control how the methods
+// treat errors (e.g. ignore_unknown_objects), the format
+// of the serialization (e.g. delimiter), and how to compare
+// options (sanity_level).
+struct ConfigOptions {
+ // Constructs a new ConfigOptions with a new object registry.
+ // This method should only be used when a DBOptions is not available,
+ // else registry settings may be lost.
+ ConfigOptions();
+
+ // Constructs a new ConfigOptions using the settings from
+ // the input DBOptions. Currently constructs a new object registry.
+ explicit ConfigOptions(const DBOptions&);
+
+ // This enum defines the RocksDB options sanity level.
+ enum SanityLevel : unsigned char {
+ kSanityLevelNone = 0x01, // Performs no sanity check at all.
+ // Performs minimum check to ensure the RocksDB instance can be
+ // opened without corrupting / mis-interpreting the data.
+ kSanityLevelLooselyCompatible = 0x02,
+ // Perform exact match sanity check.
+ kSanityLevelExactMatch = 0xFF,
+ };
+
+ enum Depth {
+ kDepthDefault, // Traverse nested options that are not flagged as "shallow"
+ kDepthShallow, // Do not traverse into any nested options
+ kDepthDetailed, // Traverse nested options, overriding the options shallow
+ // setting
+ };
+
+ // When true, any unused options will be ignored and OK will be returned
+ bool ignore_unknown_options = false;
+
+ // When true, any unsupported options will be ignored and OK will be returned
+ bool ignore_unsupported_options = true;
+
+ // Whether the input strings are escaped; if true, escaped characters
+ // prefixed by '\' in input values are converted back to their raw form
+ // before use.
+ bool input_strings_escaped = true;
+
+ // Whether or not to invoke PrepareOptions after configure is called.
+ bool invoke_prepare_options = true;
+
+ // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not.
+ // When "mutable_options_only=false", all options are evaluated.
+ // When "mutable_options_only="true", any option not marked as Mutable is
+ // either ignored (in the case of string/equals methods) or results in an
+ // error (in the case of Configure).
+ bool mutable_options_only = false;
+
+ // The separator between options when converting to a string
+ std::string delimiter = ";";
+
+ // Controls how to traverse options during print/match stages
+ Depth depth = Depth::kDepthDefault;
+
+ // Controls how options are serialized and how pedantic the comparison
+ // must be for equivalency checks
+ SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch;
+ // `file_readahead_size` is used for readahead for the option file.
+ size_t file_readahead_size = 512 * 1024;
+
+ // The environment to use for this option
+ Env* env = Env::Default();
+
+#ifndef ROCKSDB_LITE
+ // The object registry to use for this options
+ std::shared_ptr<ObjectRegistry> registry;
+#endif
+
+ bool IsShallow() const { return depth == Depth::kDepthShallow; }
+ bool IsDetailed() const { return depth == Depth::kDepthDetailed; }
+
+ bool IsCheckDisabled() const {
+ return sanity_level == SanityLevel::kSanityLevelNone;
+ }
+
+ bool IsCheckEnabled(SanityLevel level) const {
+ return (level > SanityLevel::kSanityLevelNone && level <= sanity_level);
+ }
+};
+
+#ifndef ROCKSDB_LITE
+
+// The following set of functions provide a way to construct RocksDB Options
+// from a string or a string-to-string map. Here is the general rule of
+// setting option values from strings by type. Some RocksDB types are also
+// supported in these APIs. Please refer to the comment of the function itself
+ // to find more information about how to configure those RocksDB types.
+//
+// * Strings:
+// Strings will be used as values directly without any truncating or
+// trimming.
+//
+// * Booleans:
+// - "true" or "1" => true
+// - "false" or "0" => false.
+// [Example]:
+// - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or
+// - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString.
+//
+// * Integers:
+// Integers are converted directly from string, in addition to the following
+// units that we support:
+// - 'k' or 'K' => 2^10
+// - 'm' or 'M' => 2^20
+// - 'g' or 'G' => 2^30
+// - 't' or 'T' => 2^40 // only for unsigned int with sufficient bits.
+// [Example]:
+// - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or
+// - "arena_block_size=19G" in GetColumnFamilyOptionsFromString.
+//
+// * Doubles / Floating Points:
+// Doubles / Floating Points are converted directly from string. Note that
+// currently we do not support units.
+// [Example]:
+// - {"memtable_prefix_bloom_size_ratio", "0.1"} in
+// GetColumnFamilyOptionsFromMap, or
+// - "memtable_prefix_bloom_size_ratio=0.1" in
+// GetColumnFamilyOptionsFromString.
+// * Array / Vectors:
+// An array is specified by a list of values, where ':' is used as
+// the delimiter to separate each value.
+// [Example]:
+// - {"compression_per_level", "kNoCompression:kSnappyCompression"}
+// in GetColumnFamilyOptionsFromMap, or
+// - "compression_per_level=kNoCompression:kSnappyCompression" in
+ // GetColumnFamilyOptionsFromString.
+// * Enums:
+// The valid values of each enum are identical to the names of its constants.
+// [Example]:
+// - CompressionType: valid values are "kNoCompression",
+// "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ...
+// - CompactionStyle: valid values are "kCompactionStyleLevel",
+// "kCompactionStyleUniversal", "kCompactionStyleFIFO", and
+// "kCompactionStyleNone".
+//
+
+// Take a default ColumnFamilyOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// ColumnFamilyOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in ColumnFamilyOptions:
+//
+// * table_factory:
+// table_factory can be configured using our custom nested-option syntax.
+//
+// {option_a=value_a; option_b=value_b; option_c=value_c; ... }
+//
+// A nested option is enclosed by two curly braces, within which there are
+// multiple option assignments. Each assignment is of the form
+// "variable_name=value;".
+//
+// Currently we support the following types of TableFactory:
+// - BlockBasedTableFactory:
+// Use name "block_based_table_factory" to initialize table_factory with
+// BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be
+// configured using the nested-option syntax.
+// [Example]:
+// * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"}
+// is equivalent to assigning table_factory with a BlockBasedTableFactory
+// that has a 1M LRU block-cache with block size equal to 4k:
+// ColumnFamilyOptions cf_opt;
+// BlockBasedTableOptions blk_opt;
+// blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024);
+// blk_opt.block_size = 4 * 1024;
+// cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt));
+// - PlainTableFactory:
+// Use name "plain_table_factory" to initialize table_factory with
+// PlainTableFactory. Its PlainTableFactoryOptions can be configured using
+// the nested-option syntax.
+// [Example]:
+// * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"}
+//
+// * memtable_factory:
+// Use "memtable" to config memtable_factory. Here are the supported
+// memtable factories:
+// - SkipList:
+// Pass "skip_list:<lookahead>" to config memtable to use SkipList,
+// or simply "skip_list" to use the default SkipList.
+// [Example]:
+// * {"memtable", "skip_list:5"} is equivalent to setting
+// memtable to SkipListFactory(5).
+// - PrefixHash:
+// Pass "prefix_hash:<hash_bucket_count>" to config memtable
+// to use PrefixHash, or simply "prefix_hash" to use the default
+// PrefixHash.
+// [Example]:
+// * {"memtable", "prefix_hash:1000"} is equivalent to setting
+// memtable to NewHashSkipListRepFactory(hash_bucket_count).
+// - HashLinkedList:
+// Pass "hash_linkedlist:<hash_bucket_count>" to config memtable
+// to use HashLinkedList, or simply "hash_linkedlist" to use the default
+// HashLinkedList.
+// [Example]:
+// * {"memtable", "hash_linkedlist:1000"} is equivalent to
+// setting memtable to NewHashLinkListRepFactory(1000).
+// - VectorRepFactory:
+// Pass "vector:<count>" to config memtable to use VectorRepFactory,
+// or simply "vector" to use the default Vector memtable.
+// [Example]:
+// * {"memtable", "vector:1024"} is equivalent to setting memtable
+// to VectorRepFactory(1024).
+//
+// * compression_opts:
+// Use "compression_opts" to config compression_opts. The value format
+// is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>".
+// [Example]:
+// * {"compression_opts", "4:5:6:7"} is equivalent to setting:
+// ColumnFamilyOptions cf_opt;
+// cf_opt.compression_opts.window_bits = 4;
+// cf_opt.compression_opts.level = 5;
+// cf_opt.compression_opts.strategy = 6;
+// cf_opt.compression_opts.max_dict_bytes = 7;
+//
+// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) overload should be
+// used; the alternative signature may be deprecated in a future release.
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+// should be set.
+// @param new_options the resulting options based on "base_options" with the
+// change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be converted
+// back to its raw form before being assigned to the associated option.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+// the opts_map is not valid for these options.
+// @return Status::NotSupported means we do not know how to parse one of
+// the values for these options.
+// @return Status::InvalidArgument means one of the option values is not
+// valid for these options.
+Status GetColumnFamilyOptionsFromMap(
+ const ConfigOptions& config_options,
+ const ColumnFamilyOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromMap(
+ const ColumnFamilyOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ ColumnFamilyOptions* new_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
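+
+// Illustrative usage of the map-based form above (a sketch; the nested
+// table_factory value follows the syntax documented earlier):
+//   ConfigOptions config_options;
+//   ColumnFamilyOptions base, cf_opts;
+//   std::unordered_map<std::string, std::string> opts_map = {
+//       {"write_buffer_size", "1M"},
+//       {"block_based_table_factory", "{block_size=4k;}"}};
+//   Status s = GetColumnFamilyOptionsFromMap(config_options, base, opts_map,
+//                                            &cf_opts);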
+
+// Takes a default DBOptions "base_options" and a map "opts_map" of option
+// name to option value, and constructs the new DBOptions "new_options".
+//
+// Below are instructions for configuring some non-primitive-typed
+// options in DBOptions:
+//
+// * rate_limiter_bytes_per_sec:
+// RateLimiter can be configured directly by specifying its bytes_per_sec.
+// [Example]:
+// - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to
+// passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec.
+//
+// The GetDBOptionsFromMap(ConfigOptions, ...) overload should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+// should be set.
+// @param new_options the resulting options based on "base_options" with the
+// change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be converted
+// back to its raw form before being assigned to the associated option.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+// the opts_map is not valid for these options.
+// @return Status::NotSupported means we do not know how to parse one of
+// the values for these options.
+// @return Status::InvalidArgument means one of the option values is not
+// valid for these options.
+Status GetDBOptionsFromMap(
+    const ConfigOptions& config_options, const DBOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ DBOptions* new_options);
+Status GetDBOptionsFromMap(
+ const DBOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ DBOptions* new_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
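+
+// Illustrative usage (a sketch), using the rate limiter convention
+// documented above:
+//   ConfigOptions config_options;
+//   DBOptions base, db_opts;
+//   Status s = GetDBOptionsFromMap(
+//       config_options, base, {{"rate_limiter_bytes_per_sec", "1024"}},
+//       &db_opts);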
+
+// Takes a default BlockBasedTableOptions "table_options" and a map
+// "opts_map" of option name to option value, and constructs the new
+// BlockBasedTableOptions "new_table_options".
+//
+// Below are instructions for configuring some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+// We currently only support the following FilterPolicy in the convenience
+// functions:
+// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+// to specify BloomFilter. The above string is equivalent to calling
+// NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+// [Example]:
+// - Pass {"filter_policy", "bloomfilter:4:true"} in
+// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+// per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+// We currently only support LRU cache in the GetOptions API. The LRU
+// cache can be set by directly specifying its size.
+// [Example]:
+// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+// equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) overload should
+// be used; the alternative signature may be deprecated in a future release.
+// The equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+// "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+// with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be converted
+// back to its raw form before being assigned to the associated option.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_table_options" will be set to
+// "table_options".
+Status GetBlockBasedTableOptionsFromMap(
+ const ConfigOptions& config_options,
+ const BlockBasedTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromMap(
+ const BlockBasedTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ BlockBasedTableOptions* new_table_options,
+ bool input_strings_escaped = false, bool ignore_unknown_options = false);
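+
+// Illustrative usage (a sketch; "bloomfilter:4:true" and "1M" follow the
+// forms documented above):
+//   ConfigOptions config_options;
+//   BlockBasedTableOptions base, table_opts;
+//   Status s = GetBlockBasedTableOptionsFromMap(
+//       config_options, base,
+//       {{"filter_policy", "bloomfilter:4:true"}, {"block_cache", "1M"}},
+//       &table_opts);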
+
+// Takes a default PlainTableOptions "table_options" and a map "opts_map"
+// of option name to option value, and constructs the new PlainTableOptions
+// "new_table_options".
+//
+// The GetPlainTableOptionsFromMap(ConfigOptions, ...) overload should be
+// used; the alternative signature may be deprecated in a future release.
+// The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+// "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+// with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be converted
+// back to its raw form before being assigned to the associated option.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_table_options" will be set to
+// "table_options".
+Status GetPlainTableOptionsFromMap(
+ const ConfigOptions& config_options, const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromMap(
+ const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
+
+// Takes a string representation of option names and values, applies them to
+// the base_options, and returns the new options as a result. The string has
+// the following format:
+// "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+// "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+// "max_write_buffer_num=2"
+//
+//
+// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) overload should
+// be used; the alternative signature may be deprecated in a future release.
+// The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options,
+ const ColumnFamilyOptions& base_options,
+ const std::string& opts_str,
+ ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options,
+ const std::string& opts_str,
+ ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+ const DBOptions& base_options,
+ const std::string& opts_str,
+ DBOptions* new_options);
+
+Status GetDBOptionsFromString(const DBOptions& base_options,
+ const std::string& opts_str,
+ DBOptions* new_options);
+
+Status GetStringFromDBOptions(const ConfigOptions& config_options,
+ const DBOptions& db_options,
+ std::string* opts_str);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+ const DBOptions& db_options,
+ const std::string& delimiter = "; ");
+
+Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options,
+ const ColumnFamilyOptions& cf_options,
+ std::string* opts_str);
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+ const ColumnFamilyOptions& cf_options,
+ const std::string& delimiter = "; ");
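+
+// Illustrative roundtrip (a sketch): serialize ColumnFamilyOptions to a
+// string, then parse the string back into an equivalent object:
+//   ConfigOptions config_options;
+//   ColumnFamilyOptions cf_opts, parsed;
+//   std::string opts_str;
+//   Status s =
+//       GetStringFromColumnFamilyOptions(config_options, cf_opts, &opts_str);
+//   if (s.ok()) {
+//     s = GetColumnFamilyOptionsFromString(
+//         config_options, ColumnFamilyOptions(), opts_str, &parsed);
+//   }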
+Status GetStringFromCompressionType(std::string* compression_str,
+ CompressionType compression_type);
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+Status GetBlockBasedTableOptionsFromString(
+ const BlockBasedTableOptions& table_options, const std::string& opts_str,
+ BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromString(
+ const ConfigOptions& config_options,
+ const BlockBasedTableOptions& table_options, const std::string& opts_str,
+ BlockBasedTableOptions* new_table_options);
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+ const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options);
+
+Status GetMemTableRepFactoryFromString(
+ const std::string& opts_str,
+ std::unique_ptr<MemTableRepFactory>* new_mem_factory);
+
+Status GetOptionsFromString(const Options& base_options,
+ const std::string& opts_str, Options* new_options);
+Status GetOptionsFromString(const ConfigOptions& config_options,
+ const Options& base_options,
+ const std::string& opts_str, Options* new_options);
+
+Status StringToMap(const std::string& opts_str,
+ std::unordered_map<std::string, std::string>* opts_map);
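+
+// Illustrative sketch: StringToMap() splits an option string into its
+// name/value pairs; this sketch assumes nested values are stored without
+// their enclosing braces:
+//   std::unordered_map<std::string, std::string> opts_map;
+//   Status s = StringToMap(
+//       "write_buffer_size=1024;block_based_table_factory={block_size=4k}",
+//       &opts_map);
+//   // opts_map["write_buffer_size"] == "1024"
+//   // opts_map["block_based_table_factory"] == "block_size=4k"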
+
+// Request stopping background work; if wait is true, wait until it's done.
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+
+// Delete files which are entirely in the given range.
+// This could leave some keys in the range that are in files not entirely
+// contained in the range. It also leaves L0 files regardless of whether
+// they're in the range.
+// Snapshots taken before the delete might not see the data in the given
+// range.
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end = true);
+
+// Delete files in multiple ranges at once.
+// Deleting files in many ranges one at a time can be slow; use this API for
+// better performance in that case.
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
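+
+// For example (a sketch, given an open DB* db): delete files entirely
+// contained in ["a", "m"] or in ["x", +inf):
+//   Slice a("a"), m("m"), x("x");
+//   RangePtr ranges[2] = {RangePtr(&a, &m), RangePtr(&x, nullptr)};
+//   Status s =
+//       DeleteFilesInRanges(db, db->DefaultColumnFamily(), ranges, 2);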
+
+// Verify the checksum of an SST file.
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path);
+
+// Verify the checksum of an SST file.
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path,
+ const SequenceNumber& largest_seqno = 0);
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/customizable.h b/src/rocksdb/include/rocksdb/customizable.h
new file mode 100644
index 000000000..92f7504ae
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/customizable.h
@@ -0,0 +1,233 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * Customizable is a base class used by RocksDB that describes a
+ * standard way of configuring and creating objects. Customizable objects
+ * are configurable objects that can be created from an ObjectRegistry.
+ *
+ * Customizable classes are used when there are multiple potential
+ * implementations of a class for use by RocksDB (e.g. Table, Cache,
+ * MergeOperator, etc). The abstract base class is expected to define a method
+ * declaring its type and a factory method for creating one of these, such as:
+ * static const char *Type() { return "Table"; }
+ * static Status CreateFromString(const ConfigOptions& options,
+ * const std::string& id,
+ * std::shared_ptr<TableFactory>* result);
+ * The "Type" string is expected to be unique (no two base classes are the same
+ * type). This factory is expected, based on the options and id, to create
+ * and
+ * return the appropriate derived type of the customizable class (e.g.
+ * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers,
+ * helper classes and methods are provided for writing this factory.
+ *
+ * Instances of a Customizable class need to define:
+ * - A "static const char *kClassName()" method. This method defines the name
+ * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the
+ * CheckedCast method.
+ * - The Name() of the object. This name is used when creating and saving
+ * instances of this class. Typically this name will be the same as
+ * kClassName().
+ *
+ * Additionally, Customizable classes should register any options used to
+ * configure themselves with the Configurable subsystem.
+ *
+ * When a Customizable is being created, the "name" property specifies
+ * the name of the instance being created.
+ * For custom objects, their configuration and name can be specified in any
+ * of the following equivalent forms:
+ *     [prop]={name=X; option1=value1[; option2=value2...]}
+ * or:
+ *     [prop].name=X
+ *     [prop].option1=value1
+ * or:
+ *     [prop].name=X
+ *     X.option1=value1
+ */
+class Customizable : public Configurable {
+ public:
+ ~Customizable() override {}
+
+ // Returns the name of this class of Customizable
+ virtual const char* Name() const = 0;
+
+ // Returns an identifier for this Customizable.
+ // This could be its name or something more complex (like its URL/pattern).
+ // Used for pretty printing.
+ virtual std::string GetId() const {
+ std::string id = Name();
+ return id;
+ }
+
+  // Checks whether this object is an instance of the input name. This is
+  // typically determined by whether the input name matches the name of
+  // this object.
+ // This method is typically used in conjunction with CheckedCast to find the
+ // derived class instance from its base. For example, if you have an Env
+ // and want the "Default" env, you would IsInstanceOf("Default") to get
+ // the default implementation. This method should be used when you need a
+ // specific derivative or implementation of a class.
+ //
+ // Intermediary caches (such as SharedCache) may wish to override this method
+ // to check for the intermediary name (SharedCache). Classes with multiple
+ // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override
+ // this method.
+ //
+ // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a".
+ // Wrapped classes that have an Inner "has-a" should not be returned.
+ //
+ // @param name The name of the instance to find.
+ // Returns true if the class is an instance of the input name.
+ virtual bool IsInstanceOf(const std::string& name) const {
+ if (name.empty()) {
+ return false;
+ } else if (name == Name()) {
+ return true;
+ } else {
+ const char* nickname = NickName();
+ if (nickname != nullptr && name == nickname) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ const void* GetOptionsPtr(const std::string& name) const override {
+ const void* ptr = Configurable::GetOptionsPtr(name);
+ if (ptr != nullptr) {
+ return ptr;
+ } else {
+ const auto inner = Inner();
+ if (inner != nullptr) {
+ return inner->GetOptionsPtr(name);
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ // Returns the named instance of the Customizable as a T*, or nullptr if not
+ // found. This method uses IsInstanceOf/Inner to find the appropriate class
+ // instance and then casts it to the expected return type.
+ template <typename T>
+ const T* CheckedCast() const {
+ if (IsInstanceOf(T::kClassName())) {
+ return static_cast<const T*>(this);
+ } else {
+ const auto inner = Inner();
+ if (inner != nullptr) {
+ return inner->CheckedCast<T>();
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ template <typename T>
+ T* CheckedCast() {
+ if (IsInstanceOf(T::kClassName())) {
+ return static_cast<T*>(this);
+ } else {
+ auto inner = const_cast<Customizable*>(Inner());
+ if (inner != nullptr) {
+ return inner->CheckedCast<T>();
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ // Checks to see if this Customizable is equivalent to other.
+ // This method assumes that the two objects are of the same class.
+ // @param config_options Controls how the options are compared.
+ // @param other The other object to compare to.
+ // @param mismatch If the objects do not match, this parameter contains
+ // the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+ // @see Configurable::AreEquivalent for more details
+ bool AreEquivalent(const ConfigOptions& config_options,
+ const Configurable* other,
+ std::string* mismatch) const override;
+#ifndef ROCKSDB_LITE
+ // Gets the value of the option associated with the input name
+ // @see Configurable::GetOption for more details
+ Status GetOption(const ConfigOptions& config_options, const std::string& name,
+ std::string* value) const override;
+#endif // ROCKSDB_LITE
+  // Helper method for parsing the opt_value into the corresponding
+ // options for use in potentially creating a new Customizable object (this
+ // method is primarily a support method for LoadSharedObject et al for new
+ // Customizable objects). The opt_value may be either name-value pairs
+ // separated by ";" (a=b; c=d), or a simple name (a). In order to create a new
+ // Customizable, the ID is determined by:
+ // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this
+ // name;
+ // - Otherwise, if there is a "id=value", the id is set to "value"
+ // - Otherwise, if the input customizable is not null, custom->GetId is used
+ // - Otherwise, an error is returned.
+ //
+ // If the opt_value is name-value pairs, these pairs will be returned in
+ // options (without the id pair). If the ID being returned matches the ID of
+ // the input custom object, then the options from the input object will also
+ // be added to the returned options.
+ //
+ // This method returns non-OK if the ID could not be found, or if the
+ // opt_value could not be parsed into name-value pairs.
+ static Status GetOptionsMap(
+ const ConfigOptions& config_options, const Customizable* custom,
+ const std::string& opt_value, std::string* id,
+ std::unordered_map<std::string, std::string>* options);
+
+ // Helper method to configure a new object with the supplied options.
+ // If the object is not null and invoke_prepare_options=true, the object
+ // will be configured and prepared.
+  // Returns success if the object is properly configured and (optionally)
+  // prepared.
+  // Returns InvalidArgument if the object is nullptr and there are
+  // options in the map.
+  // Otherwise, returns the result of the ConfigureFromMap or
+  // PrepareOptions call.
+ static Status ConfigureNewObject(
+ const ConfigOptions& config_options, Customizable* object,
+ const std::unordered_map<std::string, std::string>& options);
+
+ // Returns the inner class when a Customizable implements a has-a (wrapped)
+ // relationship. Derived classes that implement a has-a must override this
+ // method in order to get CheckedCast to function properly.
+ virtual const Customizable* Inner() const { return nullptr; }
+
+ protected:
+  // Generates an ID specific to this instance of the customizable.
+  // The unique ID is of the form <name>:<addr>#pid, where:
+  // - name is the Name() of this object;
+  // - addr is the memory address of this object;
+  // - pid is the ID of the current process.
+  // Note that if obj1 and obj2 have the same unique ID, they must be the
+  // same object. However, if an object is deleted and recreated, it may
+  // have the same unique ID as a predecessor.
+ //
+ // This method is useful for objects (especially ManagedObjects) that
+ // wish to generate an ID that is specific for this instance and wish to
+ // override the GetId() method.
+ std::string GenerateIndividualId() const;
+
+ // Some classes have both a class name (e.g. PutOperator) and a nickname
+ // (e.g. put). Classes can override this method to return a
+  // nickname. Nicknames can be used by IsInstanceOf and during object
+  // creation.
+ virtual const char* NickName() const { return ""; }
+ // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+ std::string GetOptionName(const std::string& long_name) const override;
+#ifndef ROCKSDB_LITE
+ std::string SerializeOptions(const ConfigOptions& options,
+ const std::string& prefix) const override;
+#endif // ROCKSDB_LITE
+};
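+
+// A hedged sketch (not part of this header): a minimal Customizable
+// subclass wiring up the kClassName()/Name() pair described above, so that
+// IsInstanceOf() and CheckedCast<>() can locate it:
+//   class MyCustomizable : public Customizable {
+//    public:
+//     static const char* kClassName() { return "MyCustomizable"; }
+//     const char* Name() const override { return kClassName(); }
+//   };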
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/data_structure.h b/src/rocksdb/include/rocksdb/data_structure.h
new file mode 100644
index 000000000..f868a6be5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/data_structure.h
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is a data structure specifically designed as a "Set" for a
+// fairly small range of enum values. For now, it can support up
+// to 64 elements, and it is expandable in the future.
+template <typename ENUM_TYPE, ENUM_TYPE MAX_VALUE>
+class SmallEnumSet {
+ public:
+ SmallEnumSet() : state_(0) {}
+
+ ~SmallEnumSet() {}
+
+  // Add the input enum to the "Set". Returns true if the value was newly
+  // added (i.e., the internal scalar state changed); otherwise, returns
+  // false.
+ bool Add(const ENUM_TYPE value) {
+ static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+ assert(value >= 0 && value <= MAX_VALUE);
+ uint64_t old_state = state_;
+ uint64_t tmp = 1;
+ state_ |= (tmp << value);
+ return old_state != state_;
+ }
+
+ // Return true if the input enum is contained in the "Set".
+ bool Contains(const ENUM_TYPE value) {
+ static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+ assert(value >= 0 && value <= MAX_VALUE);
+ uint64_t tmp = 1;
+ return state_ & (tmp << value);
+ }
+
+ private:
+ uint64_t state_;
+};
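+
+// Illustrative usage (a sketch; assumes the enum values fit in [0, 63]):
+//   enum Color : int { kRed = 0, kGreen, kBlue, kMaxColor = kBlue };
+//   SmallEnumSet<Color, kMaxColor> colors;
+//   assert(colors.Add(kRed));    // newly added => true
+//   assert(!colors.Add(kRed));   // already present => false
+//   assert(colors.Contains(kRed) && !colors.Contains(kBlue));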
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
new file mode 100644
index 000000000..26c07c19f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -0,0 +1,1859 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/block_cache_trace_writer.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+#include "rocksdb/wide_columns.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ColumnFamilyOptions;
+struct CompactionOptions;
+struct CompactRangeOptions;
+struct DBOptions;
+struct ExternalSstFileInfo;
+struct FlushOptions;
+struct Options;
+struct ReadOptions;
+struct TableProperties;
+struct WriteOptions;
+#ifdef ROCKSDB_LITE
+class CompactionJobInfo;
+#endif
+class Env;
+class EventListener;
+class FileSystem;
+#ifndef ROCKSDB_LITE
+class Replayer;
+#endif
+class StatsHistoryIterator;
+#ifndef ROCKSDB_LITE
+class TraceReader;
+class TraceWriter;
+#endif
+class WriteBatch;
+
+extern const std::string kDefaultColumnFamilyName;
+extern const std::string kPersistentStatsColumnFamilyName;
+struct ColumnFamilyDescriptor {
+ std::string name;
+ ColumnFamilyOptions options;
+ ColumnFamilyDescriptor()
+ : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+ ColumnFamilyDescriptor(const std::string& _name,
+ const ColumnFamilyOptions& _options)
+ : name(_name), options(_options) {}
+};
+
+class ColumnFamilyHandle {
+ public:
+ virtual ~ColumnFamilyHandle() {}
+ // Returns the name of the column family associated with the current handle.
+ virtual const std::string& GetName() const = 0;
+ // Returns the ID of the column family associated with the current handle.
+ virtual uint32_t GetID() const = 0;
+ // Fills "*desc" with the up-to-date descriptor of the column family
+ // associated with this handle. Since it fills "*desc" with the up-to-date
+ // information, this call might internally lock and release DB mutex to
+ // access the up-to-date CF options. In addition, all the pointer-typed
+ // options cannot be referenced any longer than the original options exist.
+ //
+ // Note that this function is not supported in RocksDBLite.
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
+ // Returns the comparator of the column family associated with the
+ // current handle.
+ virtual const Comparator* GetComparator() const = 0;
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+// A range of keys
+struct Range {
+ Slice start;
+ Slice limit;
+
+ Range() {}
+ Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+struct RangePtr {
+ const Slice* start;
+ const Slice* limit;
+
+ RangePtr() : start(nullptr), limit(nullptr) {}
+ RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order must
+// be the same across the three vectors and is guaranteed by the caller.
+// Note that we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+ ColumnFamilyHandle* column_family = nullptr;
+ std::vector<std::string> external_files;
+ IngestExternalFileOptions options;
+ std::vector<std::string> files_checksums;
+ std::vector<std::string> files_checksum_func_names;
+ Temperature file_temperature = Temperature::kUnknown;
+};
+
+struct GetMergeOperandsOptions {
+ int expected_max_number_of_operands = 0;
+};
+
+// A collection of table properties objects, where
+// key: the table's file name.
+// value: the table properties object of the given table.
+using TablePropertiesCollection =
+ std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+// A DB is a persistent, versioned ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+// DB is an abstract base class with one primary implementation (DBImpl)
+// and a number of wrapper implementations.
+class DB {
+ public:
+ // Open the database with the specified "name" for reads and writes.
+ // Stores a pointer to a heap-allocated database in *dbptr and returns
+ // OK on success.
+ // Stores nullptr in *dbptr and returns a non-OK status on error, including
+ // if the DB is already open (read-write) by another DB object. (This
+ // guarantee depends on options.env->LockFile(), which might not provide
+ // this guarantee in a custom Env implementation.)
+ //
+ // Caller must delete *dbptr when it is no longer needed.
+ static Status Open(const Options& options, const std::string& name,
+ DB** dbptr);
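+
+  // Illustrative open/close (a sketch; the path is hypothetical):
+  //   Options options;
+  //   options.create_if_missing = true;
+  //   DB* db = nullptr;
+  //   Status s = DB::Open(options, "/tmp/testdb", &db);
+  //   if (s.ok()) { /* use db */ delete db; }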
+
+ // Open DB with column families.
+  // db_options specifies database-wide options.
+ // column_families is the vector of all column families in the database,
+ // containing column family name and options. You need to open ALL column
+ // families in the database. To get the list of column families, you can use
+ // ListColumnFamilies().
+ //
+ // The default column family name is 'default' and it's stored
+ // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+ // If everything is OK, handles will on return be the same size
+ // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_families[i].
+  // Before deleting the DB, you have to close all column families by
+  // calling DestroyColumnFamilyHandle() with all the handles.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
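+
+  // Illustrative sketch: list the column families and open all of them:
+  //   std::vector<std::string> cf_names;
+  //   Status s =
+  //       DB::ListColumnFamilies(DBOptions(), "/tmp/testdb", &cf_names);
+  //   std::vector<ColumnFamilyDescriptor> descriptors;
+  //   for (const auto& cf_name : cf_names) {
+  //     descriptors.emplace_back(cf_name, ColumnFamilyOptions());
+  //   }
+  //   std::vector<ColumnFamilyHandle*> handles;
+  //   DB* db = nullptr;
+  //   s = DB::Open(DBOptions(), "/tmp/testdb", descriptors, &handles, &db);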
+
+ // OpenForReadOnly() creates a Read-only instance that supports reads alone.
+ //
+ // All DB interfaces that modify data, like put/delete, will return error.
+ // Automatic Flush and Compactions are disabled and any manual calls
+ // to Flush/Compaction will return error.
+ //
+ // While a given DB can be simultaneously opened via OpenForReadOnly
+ // by any number of readers, if a DB is simultaneously opened by Open
+ // and OpenForReadOnly, the read-only instance has undefined behavior
+ // (though can often succeed if quickly closed) and the read-write
+ // instance is unaffected. See also OpenAsSecondary.
+
+ // Open the database for read only.
+ //
+ // Not supported in ROCKSDB_LITE, in which case the function will
+ // return Status::NotSupported.
+ static Status OpenForReadOnly(const Options& options, const std::string& name,
+ DB** dbptr,
+ bool error_if_wal_file_exists = false);
+
+ // Open the database for read only with column families.
+ //
+ // When opening DB with read only, you can specify only a subset of column
+ // families in the database that should be opened. However, you always need
+ // to specify default column family. The default column family name is
+ // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+ //
+ // Not supported in ROCKSDB_LITE, in which case the function will
+ // return Status::NotSupported.
+ static Status OpenForReadOnly(
+ const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists = false);
+
+ // OpenAsSecondary() creates a secondary instance that supports read-only
+ // operations and supports dynamic catch up with the primary (through a
+ // call to TryCatchUpWithPrimary()).
+ //
+ // All DB interfaces that modify data, like put/delete, will return error.
+ // Automatic Flush and Compactions are disabled and any manual calls
+ // to Flush/Compaction will return error.
+ //
+ // Multiple secondary instances can co-exist at the same time.
+ //
+
+ // Open DB as secondary instance
+ //
+ // The options argument specifies the options to open the secondary instance.
+ // Options.max_open_files should be set to -1.
+ // The name argument specifies the name of the primary db that you have used
+ // to open the primary instance.
+ // The secondary_path argument points to a directory where the secondary
+ // instance stores its info log.
+ // The dbptr is an out-arg corresponding to the opened secondary instance.
+ // The pointer points to a heap-allocated database, and the caller should
+ // delete it after use.
+ //
+ // Return OK on success, non-OK on failures.
+ static Status OpenAsSecondary(const Options& options, const std::string& name,
+ const std::string& secondary_path, DB** dbptr);
+
+ // Open DB as secondary instance with specified column families
+ //
+ // When opening DB in secondary mode, you can specify only a subset of column
+ // families in the database that should be opened. However, you always need
+ // to specify default column family. The default column family name is
+ // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+ //
+ // Column families created by the primary after the secondary instance starts
+ // are currently ignored by the secondary instance. Column families opened
+ // by secondary and dropped by the primary will be dropped by secondary as
+ // well (on next invocation of TryCatchUpWithPrimary()). However the user
+ // of the secondary instance can still access the data of such dropped column
+ // family as long as they do not destroy the corresponding column family
+ // handle.
+ //
+ // The options argument specifies the options to open the secondary instance.
+ // Options.max_open_files should be set to -1.
+ // The name argument specifies the name of the primary db that you have used
+ // to open the primary instance.
+ // The secondary_path argument points to a directory where the secondary
+ // instance stores its info log.
+ // The column_families argument specifies a list of column families to open.
+  // If the default column family is not specified or any specified column
+  // family does not exist, the function returns a non-OK status.
+ // The handles is an out-arg corresponding to the opened database column
+ // family handles.
+ // The dbptr is an out-arg corresponding to the opened secondary instance.
+ // The pointer points to a heap-allocated database, and the caller should
+ // delete it after use. Before deleting the dbptr, the user should also
+ // delete the pointers stored in handles vector.
+ //
+ // Return OK on success, non-OK on failures.
+ static Status OpenAsSecondary(
+ const DBOptions& db_options, const std::string& name,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+ // Open DB and run the compaction.
+ // It's a read-only operation, the result won't be installed to the DB, it
+ // will be output to the `output_directory`. The API should only be used with
+ // `options.CompactionService` to run compaction triggered by
+ // `CompactionService`.
+ static Status OpenAndCompact(
+ const std::string& name, const std::string& output_directory,
+ const std::string& input, std::string* output,
+ const CompactionServiceOptionsOverride& override_options);
+
+ static Status OpenAndCompact(
+ const OpenAndCompactOptions& options, const std::string& name,
+ const std::string& output_directory, const std::string& input,
+ std::string* output,
+ const CompactionServiceOptionsOverride& override_options);
+
+ // Experimental and subject to change
+ // Open DB and trim data newer than specified timestamp.
+  // The trim_ts argument specifies the user-defined timestamp trim bound.
+  // This API should only be used during recovery of timestamp-enabled
+  // column families. If some input column families do not support
+  // timestamps, nothing will happen to them. The data with
+  // timestamp > trim_ts will be removed after this API returns successfully.
+ static Status OpenAndTrimHistory(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ std::string trim_ts);
+
+ virtual Status Resume() { return Status::NotSupported(); }
+
+ // Close the DB by releasing resources, closing files etc. This should be
+ // called before calling the destructor so that the caller can get back a
+ // status in case there are any errors. This will not fsync the WAL files.
+ // If syncing is required, the caller must first call SyncWAL(), or Write()
+ // using an empty write batch with WriteOptions.sync=true.
+ // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there are
+  // unreleased snapshots in the system. In this case, users can release
+  // the unreleased snapshots and try again, expecting it to succeed. For
+  // any other status, re-calling Close() will be a no-op and return the
+  // original close status. If the return status is NotSupported(), then
+  // the DB implementation does cleanup in the destructor.
+ virtual Status Close() { return Status::NotSupported(); }
+
+ // ListColumnFamilies will open the DB specified by argument name
+ // and return the list of all column families in that DB
+ // through column_families argument. The ordering of
+ // column families in column_families is unspecified.
+ static Status ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families);
+
+ // Abstract class ctor
+ DB() {}
+ // No copying allowed
+ DB(const DB&) = delete;
+ void operator=(const DB&) = delete;
+
+ virtual ~DB();
+
+ // Create a column_family and return the handle of column family
+ // through the argument handle.
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle);
+
+ // Bulk create column families with the same column family options.
+ // Return the handles of the column families through the argument handles.
+ // In case of error, the request may succeed partially, and handles will
+ // contain column family handles that it managed to create, and have size
+ // equal to the number of created column families.
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles);
+
+ // Bulk create column families.
+ // Return the handles of the column families through the argument handles.
+ // In case of error, the request may succeed partially, and handles will
+ // contain column family handles that it managed to create, and have size
+ // equal to the number of created column families.
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles);
+
+ // Drop a column family specified by column_family handle. This call
+ // only records a drop record in the manifest and prevents the column
+ // family from flushing and compacting.
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+ // Bulk drop column families. This call only records drop records in the
+ // manifest and prevents the column families from flushing and compacting.
+ // In case of error, the request may succeed partially. User may call
+ // ListColumnFamilies to check the result.
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families);
+
+ // Release and deallocate a column family handle. A column family is only
+ // removed once it is dropped (DropColumnFamily) and all handles have been
+ // destroyed (DestroyColumnFamilyHandle). Use this method to destroy
+ // column family handles (except for DefaultColumnFamily()!) before closing
+ // a DB.
+ virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
+
+ // Set the database entry for "key" to "value".
+ // If "key" already exists, it will be overwritten.
+ // Returns OK on success, and a non-OK status on error.
+ // Note: consider setting options.sync = true.
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) = 0;
+ virtual Status Put(const WriteOptions& options, const Slice& key,
+ const Slice& value) {
+ return Put(options, DefaultColumnFamily(), key, value);
+ }
+ virtual Status Put(const WriteOptions& options, const Slice& key,
+ const Slice& ts, const Slice& value) {
+ return Put(options, DefaultColumnFamily(), key, ts, value);
+ }
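+
+  // For example (a sketch):
+  //   Status s = db->Put(WriteOptions(), "key1", "value1");
+  //   std::string value;
+  //   if (s.ok()) s = db->Get(ReadOptions(), "key1", &value);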
+
+ // Set the database entry for "key" in the column family specified by
+ // "column_family" to the wide-column entity defined by "columns". If the key
+ // already exists in the column family, it will be overwritten.
+ //
+ // Returns OK on success, and a non-OK status on error.
+ virtual Status PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns);
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = true.
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) = 0;
+ virtual Status Delete(const WriteOptions& options, const Slice& key) {
+ return Delete(options, DefaultColumnFamily(), key);
+ }
+ virtual Status Delete(const WriteOptions& options, const Slice& key,
+ const Slice& ts) {
+ return Delete(options, DefaultColumnFamily(), key, ts);
+ }
+
+ // Remove the database entry for "key". Requires that the key exists
+ // and was not overwritten. Returns OK on success, and a non-OK status
+ // on error. It is not an error if "key" did not exist in the database.
+ //
+ // If a key is overwritten (by calling Put() multiple times), then the result
+ // of calling SingleDelete() on this key is undefined. SingleDelete() only
+ // behaves correctly if there has been only one Put() for this key since the
+ // previous call to SingleDelete() for this key.
+ //
+ // This feature is currently an experimental performance optimization
+ // for a very specific workload. It is up to the caller to ensure that
+ // SingleDelete is only used for a key that is not deleted using Delete() or
+ // written using Merge(). Mixing SingleDelete operations with Deletes and
+ // Merges can result in undefined behavior.
+ //
+ // Note: consider setting options.sync = true.
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) = 0;
+ virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+ return SingleDelete(options, DefaultColumnFamily(), key);
+ }
+ virtual Status SingleDelete(const WriteOptions& options, const Slice& key,
+ const Slice& ts) {
+ return SingleDelete(options, DefaultColumnFamily(), key, ts);
+ }
+
+ // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
+ // including "begin_key" and excluding "end_key". Returns OK on success, and
+ // a non-OK status on error. It is not an error if the database does not
+ // contain any existing data in the range ["begin_key", "end_key").
+ //
+ // If "end_key" comes before "start_key" according to the user's comparator,
+ // a `Status::InvalidArgument` is returned.
+ //
+ // This feature is now usable in production, with the following caveats:
+ // 1) Accumulating too many range tombstones in the memtable will degrade read
+ // performance; this can be avoided by manually flushing occasionally.
+ // 2) Limiting the maximum number of open files in the presence of range
+ // tombstones can degrade read performance. To avoid this problem, set
+ // max_open_files to -1 whenever possible.
+ virtual Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key);
+ virtual Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts);
+
+ // Merge the database entry for "key" with "value". Returns OK on success,
+ // and a non-OK status on error. The semantics of this operation is
+ // determined by the user provided merge_operator when opening DB.
+ // Note: consider setting options.sync = true.
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Merge(const WriteOptions& options, const Slice& key,
+ const Slice& value) {
+ return Merge(options, DefaultColumnFamily(), key, value);
+ }
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*ts*/,
+ const Slice& /*value*/);
+
+ // Apply the specified updates to the database.
+ // If `updates` contains no update, WAL will still be synced if
+ // options.sync=true.
+ // Returns OK on success, non-OK on failure.
+ // Note: consider setting options.sync = true.
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+ // If the column family specified by "column_family" contains an entry for
+ // "key", return the corresponding value in "*value". If the entry is a plain
+ // key-value, return the value as-is; if it is a wide-column entity, return
+ // the value of its default anonymous column (see kDefaultWideColumnName) if
+ // any, or an empty value otherwise.
+ //
+ // If timestamp is enabled and a non-null timestamp pointer is passed in,
+ // timestamp is returned.
+ //
+ // Returns OK on success. Returns NotFound and an empty value in "*value" if
+ // there is no entry for "key". Returns some other non-OK status on error.
+ virtual inline Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ auto s = Get(options, column_family, key, &pinnable_val);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ return s;
+ }
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) = 0;
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) {
+ return Get(options, DefaultColumnFamily(), key, value);
+ }
+
+ // Get() methods that return timestamp. Derived DB classes don't need to worry
+ // about this group of methods if they don't care about timestamp feature.
+ virtual inline Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, std::string* timestamp) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ auto s = Get(options, column_family, key, &pinnable_val, timestamp);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ return s;
+ }
+ virtual Status Get(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, PinnableSlice* /*value*/,
+ std::string* /*timestamp*/) {
+ return Status::NotSupported(
+ "Get() that returns timestamp is not implemented.");
+ }
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value, std::string* timestamp) {
+ return Get(options, DefaultColumnFamily(), key, value, timestamp);
+ }
+
+ // If the column family specified by "column_family" contains an entry for
+ // "key", return it as a wide-column entity in "*columns". If the entry is a
+ // wide-column entity, return it as-is; if it is a plain key-value, return it
+ // as an entity with a single anonymous column (see kDefaultWideColumnName)
+ // which contains the value.
+ //
+ // Returns OK on success. Returns NotFound and an empty wide-column entity in
+ // "*columns" if there is no entry for "key". Returns some other non-OK status
+ // on error.
+ virtual Status GetEntity(const ReadOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ PinnableWideColumns* /* columns */) {
+ return Status::NotSupported("GetEntity not supported");
+ }
+
+ // Populates the `merge_operands` array with all the merge operands in the DB
+ // for `key`. The `merge_operands` array will be populated in the order of
+ // insertion. The number of entries populated in `merge_operands` will be
+ // assigned to `*number_of_operands`.
+ //
+ // If the number of merge operands in DB for `key` is greater than
+ // `merge_operands_options.expected_max_number_of_operands`,
+ // `merge_operands` is not populated and the return value is
+ // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned
+ // the number of merge operands found in the DB for `key`.
+ //
+  // `merge_operands` - Points to an array of at least
+  //             merge_operands_options.expected_max_number_of_operands
+  //             entries; the caller is responsible for allocating it.
+ //
+ // The caller should delete or `Reset()` the `merge_operands` entries when
+ // they are no longer needed. All `merge_operands` entries must be destroyed
+ // or `Reset()` before this DB is closed or destroyed.
+ virtual Status GetMergeOperands(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) = 0;
+
+ // Consistent Get of many keys across column families without the need
+ // for an explicit snapshot. NOTE: the implementation of this MultiGet API
+ // does not have the performance benefits of the void-returning MultiGet
+ // functions.
+ //
+ // If keys[i] does not exist in the database, then the i'th returned
+ // status will be one for which Status::IsNotFound() is true, and
+ // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+ // the i'th returned status will have Status::ok() true, and (*values)[i]
+ // will store the value associated with keys[i].
+ //
+ // (*values) will always be resized to be the same size as (keys).
+ // Similarly, the number of returned statuses will be the number of keys.
+ // Note: keys will not be "de-duplicated". Duplicate keys will return
+ // duplicate values in order.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+ virtual std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) {
+ return MultiGet(
+ options,
+ std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+ keys, values);
+ }
+
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ const std::vector<Slice>& keys, std::vector<std::string>* /*values*/,
+ std::vector<std::string>* /*timestamps*/) {
+ return std::vector<Status>(
+ keys.size(), Status::NotSupported(
+ "MultiGet() returning timestamps not implemented."));
+ }
+ virtual std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) {
+ return MultiGet(
+ options,
+ std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+ keys, values, timestamps);
+ }
+
+ // Overloaded MultiGet API that improves performance by batching operations
+ // in the read path for greater efficiency. Currently, only the block based
+ // table format with full filters are supported. Other table formats such
+ // as plain table, block based table with block based filters and
+ // partitioned indexes will still work, but will not get any performance
+ // benefits.
+ // Parameters -
+ // options - ReadOptions
+ // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+ // passed to the API are restricted to a single column family
+ // num_keys - Number of keys to lookup
+ // keys - Pointer to C style array of key Slices with num_keys elements
+ // values - Pointer to C style array of PinnableSlices with num_keys elements
+ // statuses - Pointer to C style array of Status with num_keys elements
+ // sorted_input - If true, it means the input keys are already sorted by key
+ // order, so the MultiGet() API doesn't have to sort them
+ // again. If false, the keys will be copied and sorted
+ // internally by the API - the input array will not be
+ // modified
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool /*sorted_input*/ = false) {
+ std::vector<ColumnFamilyHandle*> cf;
+ std::vector<Slice> user_keys;
+ std::vector<Status> status;
+ std::vector<std::string> vals;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ cf.emplace_back(column_family);
+ user_keys.emplace_back(keys[i]);
+ }
+ status = MultiGet(options, cf, user_keys, &vals);
+ std::copy(status.begin(), status.end(), statuses);
+ for (auto& value : vals) {
+ values->PinSelf(value);
+ values++;
+ }
+ }
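+
+ // EXAMPLE (illustrative sketch; `db` and `cfh` are assumed to be valid):
+ //   constexpr size_t kNumKeys = 2;
+ //   Slice keys[kNumKeys] = {"k1", "k2"};
+ //   PinnableSlice values[kNumKeys];
+ //   Status statuses[kNumKeys];
+ //   db->MultiGet(ReadOptions(), cfh, kNumKeys, keys, values, statuses);
+ //   // statuses[i] tells whether values[i] is valid for keys[i].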
+
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses, const bool /*sorted_input*/ = false) {
+ std::vector<ColumnFamilyHandle*> cf;
+ std::vector<Slice> user_keys;
+ std::vector<Status> status;
+ std::vector<std::string> vals;
+ std::vector<std::string> tss;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ cf.emplace_back(column_family);
+ user_keys.emplace_back(keys[i]);
+ }
+ status = MultiGet(options, cf, user_keys, &vals, &tss);
+ std::copy(status.begin(), status.end(), statuses);
+ std::copy(tss.begin(), tss.end(), timestamps);
+ for (auto& value : vals) {
+ values->PinSelf(value);
+ values++;
+ }
+ }
+
+ // Overloaded MultiGet API that improves performance by batching operations
+ // in the read path for greater efficiency. Currently, only the block based
+ // table format with full filters is supported. Other table formats, such
+ // as plain table, block based table with block based filters, and
+ // partitioned indexes, will still work, but will not get any performance
+ // benefits.
+ // Parameters -
+ // options - ReadOptions
+ // column_families - Pointer to C style array of ColumnFamilyHandle* with
+ //                   num_keys elements; keys[i] belongs to column_families[i]
+ // num_keys - Number of keys to lookup
+ // keys - Pointer to C style array of key Slices with num_keys elements
+ // values - Pointer to C style array of PinnableSlices with num_keys elements
+ // statuses - Pointer to C style array of Status with num_keys elements
+ // sorted_input - If true, it means the input keys are already sorted by key
+ // order, so the MultiGet() API doesn't have to sort them
+ // again. If false, the keys will be copied and sorted
+ // internally by the API - the input array will not be
+ // modified
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool /*sorted_input*/ = false) {
+ std::vector<ColumnFamilyHandle*> cf;
+ std::vector<Slice> user_keys;
+ std::vector<Status> status;
+ std::vector<std::string> vals;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ cf.emplace_back(column_families[i]);
+ user_keys.emplace_back(keys[i]);
+ }
+ status = MultiGet(options, cf, user_keys, &vals);
+ std::copy(status.begin(), status.end(), statuses);
+ for (auto& value : vals) {
+ values->PinSelf(value);
+ values++;
+ }
+ }
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses, const bool /*sorted_input*/ = false) {
+ std::vector<ColumnFamilyHandle*> cf;
+ std::vector<Slice> user_keys;
+ std::vector<Status> status;
+ std::vector<std::string> vals;
+ std::vector<std::string> tss;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ cf.emplace_back(column_families[i]);
+ user_keys.emplace_back(keys[i]);
+ }
+ status = MultiGet(options, cf, user_keys, &vals, &tss);
+ std::copy(status.begin(), status.end(), statuses);
+ std::copy(tss.begin(), tss.end(), timestamps);
+ for (auto& value : vals) {
+ values->PinSelf(value);
+ values++;
+ }
+ }
+
+ // If the key definitely does not exist in the database, then this method
+ // returns false; otherwise it returns true. If the caller wants to obtain
+ // the value when the key is found in memory, a bool for 'value_found' must
+ // be passed. 'value_found' will be true on return if the value has been set
+ // properly.
+ // This check is potentially lighter-weight than invoking DB::Get(). One way
+ // to make it lighter weight is to avoid doing any IOs.
+ // The default implementation here returns true and sets 'value_found' to
+ // false.
+ virtual bool KeyMayExist(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, std::string* /*value*/,
+ std::string* /*timestamp*/,
+ bool* value_found = nullptr) {
+ if (value_found != nullptr) {
+ *value_found = false;
+ }
+ return true;
+ }
+
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool* value_found = nullptr) {
+ return KeyMayExist(options, column_family, key, value,
+ /*timestamp=*/nullptr, value_found);
+ }
+
+ virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+ std::string* value, bool* value_found = nullptr) {
+ return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
+ }
+
+ virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+ std::string* value, std::string* timestamp,
+ bool* value_found = nullptr) {
+ return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp,
+ value_found);
+ }
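+
+ // EXAMPLE (illustrative sketch; a false result is definitive, a true
+ // result is not):
+ //   std::string value;
+ //   bool value_found = false;
+ //   if (!db->KeyMayExist(ReadOptions(), "k1", &value, &value_found)) {
+ //     // "k1" definitely does not exist; a full Get() can be skipped.
+ //   } else if (value_found) {
+ //     // `value` was populated from memory.
+ //   }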
+
+ // Return a heap-allocated iterator over the contents of the database.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ //
+ // Caller should delete the iterator when it is no longer needed.
+ // The returned iterator should be deleted before this db is deleted.
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+ virtual Iterator* NewIterator(const ReadOptions& options) {
+ return NewIterator(options, DefaultColumnFamily());
+ }
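+
+ // EXAMPLE (illustrative sketch; full scan of the default column family):
+ //   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
+ //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //     // it->key() and it->value() are valid here.
+ //   }
+ //   assert(it->status().ok());  // check for errors after the scan
+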
+ // Returns iterators from a consistent database state across multiple
+ // column families. Iterators are heap allocated and need to be deleted
+ // before the db is deleted
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) = 0;
+
+ // Return a handle to the current DB state. Iterators created with
+ // this handle will all observe a stable snapshot of the current DB
+ // state. The caller must call ReleaseSnapshot(result) when the
+ // snapshot is no longer needed.
+ //
+ // nullptr will be returned if the DB fails to take a snapshot or does
+ // not support snapshots (e.g., when inplace_update_support is enabled).
+ virtual const Snapshot* GetSnapshot() = 0;
+
+ // Release a previously acquired snapshot. The caller must not
+ // use "snapshot" after this call.
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
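+
+ // EXAMPLE (illustrative sketch; read at a stable point in time):
+ //   const Snapshot* snap = db->GetSnapshot();
+ //   if (snap != nullptr) {
+ //     ReadOptions ro;
+ //     ro.snapshot = snap;
+ //     std::string value;
+ //     Status s = db->Get(ro, "k1", &value);
+ //     db->ReleaseSnapshot(snap);  // required once the snapshot is done
+ //   }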
+
+#ifndef ROCKSDB_LITE
+ // Contains all valid property arguments for GetProperty() or
+ // GetMapProperty(). Each is a "string" property for retrieval with
+ // GetProperty() unless noted as a "map" property, for GetMapProperty().
+ //
+ // NOTE: Property names cannot end in numbers since those are interpreted as
+ // arguments, e.g., see kNumFilesAtLevelPrefix.
+ struct Properties {
+ // "rocksdb.num-files-at-level<N>" - returns string containing the number
+ // of files at level <N>, where <N> is an ASCII representation of a
+ // level number (e.g., "0").
+ static const std::string kNumFilesAtLevelPrefix;
+
+ // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
+ // compression ratio of data at level <N>, where <N> is an ASCII
+ // representation of a level number (e.g., "0"). Here, compression
+ // ratio is defined as uncompressed data size / compressed file size.
+ // Returns "-1.0" if no open files at level <N>.
+ static const std::string kCompressionRatioAtLevelPrefix;
+
+ // "rocksdb.stats" - returns a multi-line string containing the data
+ // described by kCFStats followed by the data described by kDBStats.
+ static const std::string kStats;
+
+ // "rocksdb.sstables" - returns a multi-line string summarizing current
+ // SST files.
+ static const std::string kSSTables;
+
+ // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram"
+ // and "rocksdb.cf-file-histogram" as a "map" property.
+ static const std::string kCFStats;
+
+ // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
+ // general column family stats per-level over db's lifetime ("L<n>"),
+ // aggregated over db's lifetime ("Sum"), and aggregated over the
+ // interval since the last retrieval ("Int").
+ static const std::string kCFStatsNoFileHistogram;
+
+ // "rocksdb.cf-file-histogram" - print out how many file reads to every
+ // level, as well as the histogram of latency of single requests.
+ static const std::string kCFFileHistogram;
+
+ // "rocksdb.dbstats" - As a string property, returns a multi-line string
+ // with general database stats, both cumulative (over the db's
+ // lifetime) and interval (since the last retrieval of kDBStats).
+ // As a map property, returns cumulative stats only and does not
+ // update the baseline for the interval stats.
+ static const std::string kDBStats;
+
+ // "rocksdb.levelstats" - returns multi-line string containing the number
+ // of files per level and total size of each level (MB).
+ static const std::string kLevelStats;
+
+ // "rocksdb.block-cache-entry-stats" - returns a multi-line string or
+ // map with statistics on block cache usage. See
+ // `BlockCacheEntryStatsMapKeys` for structured representation of keys
+ // available in the map form.
+ static const std::string kBlockCacheEntryStats;
+
+ // "rocksdb.fast-block-cache-entry-stats" - same as above, but returns
+ // stale values more frequently to reduce overhead and latency.
+ static const std::string kFastBlockCacheEntryStats;
+
+ // "rocksdb.num-immutable-mem-table" - returns number of immutable
+ // memtables that have not yet been flushed.
+ static const std::string kNumImmutableMemTable;
+
+ // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
+ // memtables that have already been flushed.
+ static const std::string kNumImmutableMemTableFlushed;
+
+ // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
+ // pending; otherwise, returns 0.
+ static const std::string kMemTableFlushPending;
+
+ // "rocksdb.num-running-flushes" - returns the number of currently running
+ // flushes.
+ static const std::string kNumRunningFlushes;
+
+ // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
+ // pending; otherwise, returns 0.
+ static const std::string kCompactionPending;
+
+ // "rocksdb.num-running-compactions" - returns the number of currently
+ // running compactions.
+ static const std::string kNumRunningCompactions;
+
+ // "rocksdb.background-errors" - returns accumulated number of background
+ // errors.
+ static const std::string kBackgroundErrors;
+
+ // "rocksdb.cur-size-active-mem-table" - returns approximate size of active
+ // memtable (bytes).
+ static const std::string kCurSizeActiveMemTable;
+
+ // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
+ // and unflushed immutable memtables (bytes).
+ static const std::string kCurSizeAllMemTables;
+
+ // "rocksdb.size-all-mem-tables" - returns approximate size of active,
+ // unflushed immutable, and pinned immutable memtables (bytes).
+ static const std::string kSizeAllMemTables;
+
+ // "rocksdb.num-entries-active-mem-table" - returns total number of entries
+ // in the active memtable.
+ static const std::string kNumEntriesActiveMemTable;
+
+ // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
+ // in the unflushed immutable memtables.
+ static const std::string kNumEntriesImmMemTables;
+
+ // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
+ // entries in the active memtable.
+ static const std::string kNumDeletesActiveMemTable;
+
+ // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
+ // entries in the unflushed immutable memtables.
+ static const std::string kNumDeletesImmMemTables;
+
+ // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
+ // the active and unflushed immutable memtables and storage.
+ static const std::string kEstimateNumKeys;
+
+ // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
+ // reading SST tables, excluding memory used in block cache (e.g.,
+ // filter and index blocks).
+ static const std::string kEstimateTableReadersMem;
+
+ // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
+ // files is enabled; otherwise, returns a non-zero number.
+ // This name may be misleading because true (non-zero) means disabled,
+ // but we keep the name for backward compatibility.
+ static const std::string kIsFileDeletionsEnabled;
+
+ // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
+ // database.
+ static const std::string kNumSnapshots;
+
+ // "rocksdb.oldest-snapshot-time" - returns number representing unix
+ // timestamp of oldest unreleased snapshot.
+ static const std::string kOldestSnapshotTime;
+
+ // "rocksdb.oldest-snapshot-sequence" - returns number representing
+ // sequence number of oldest unreleased snapshot.
+ static const std::string kOldestSnapshotSequence;
+
+ // "rocksdb.num-live-versions" - returns number of live versions. `Version`
+ // is an internal data structure. See version_set.h for details. More
+ // live versions often mean more SST files are held from being deleted,
+ // by iterators or unfinished compactions.
+ static const std::string kNumLiveVersions;
+
+ // "rocksdb.current-super-version-number" - returns number of current LSM
+ // version. It is a uint64_t integer number, incremented after there is
+ // any change to the LSM tree. The number is not preserved after restarting
+ // the DB. After DB restart, it will start from 0 again.
+ static const std::string kCurrentSuperVersionNumber;
+
+ // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
+ // live data in bytes. For BlobDB, it also includes the exact value of
+ // live bytes in the blob files of the version.
+ static const std::string kEstimateLiveDataSize;
+
+ // "rocksdb.min-log-number-to-keep" - return the minimum log number of the
+ // log files that should be kept.
+ static const std::string kMinLogNumberToKeep;
+
+ // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
+ // number for an obsolete SST to be kept. The max value of `uint64_t`
+ // will be returned if all obsolete files can be deleted.
+ static const std::string kMinObsoleteSstNumberToKeep;
+
+ // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
+ // files.
+ // WARNING: may slow down online queries if there are too many files.
+ static const std::string kTotalSstFilesSize;
+
+ // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
+ // files belonging to the latest LSM tree.
+ static const std::string kLiveSstFilesSize;
+
+ // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes)
+ // of SST files at a certain file temperature.
+ static const std::string kLiveSstFilesSizeAtTemperature;
+
+ // "rocksdb.base-level" - returns number of level to which L0 data will be
+ // compacted.
+ static const std::string kBaseLevel;
+
+ // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
+ // number of bytes compaction needs to rewrite to get all levels down
+ // to under target size. Only valid for level-based compaction.
+ static const std::string kEstimatePendingCompactionBytes;
+
+ // "rocksdb.aggregated-table-properties" - returns a string or map
+ // representation of the aggregated table properties of the target
+ // column family. Only properties that make sense for aggregation
+ // are included.
+ static const std::string kAggregatedTableProperties;
+
+ // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
+ // one but only returns the aggregated table properties of the
+ // specified level "N" at the target column family.
+ static const std::string kAggregatedTablePropertiesAtLevel;
+
+ // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
+ // write rate. 0 means no delay.
+ static const std::string kActualDelayedWriteRate;
+
+ // "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
+ static const std::string kIsWriteStopped;
+
+ // "rocksdb.estimate-oldest-key-time" - returns an estimation of
+ // oldest key timestamp in the DB. Currently only available for
+ // FIFO compaction with
+ // compaction_options_fifo.allow_compaction = false.
+ static const std::string kEstimateOldestKeyTime;
+
+ // "rocksdb.block-cache-capacity" - returns block cache capacity.
+ static const std::string kBlockCacheCapacity;
+
+ // "rocksdb.block-cache-usage" - returns the memory size for the entries
+ // residing in block cache.
+ static const std::string kBlockCacheUsage;
+
+ // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
+ // entries being pinned.
+ static const std::string kBlockCachePinnedUsage;
+
+ // "rocksdb.options-statistics" - returns multi-line string
+ // of options.statistics
+ static const std::string kOptionsStatistics;
+
+ // "rocksdb.num-blob-files" - returns number of blob files in the current
+ // version.
+ static const std::string kNumBlobFiles;
+
+ // "rocksdb.blob-stats" - return the total number and size of all blob
+ // files, and total amount of garbage (bytes) in the blob files in
+ // the current version.
+ static const std::string kBlobStats;
+
+ // "rocksdb.total-blob-file-size" - returns the total size of all blob
+ // files over all versions.
+ static const std::string kTotalBlobFileSize;
+
+ // "rocksdb.live-blob-file-size" - returns the total size of all blob
+ // files in the current version.
+ static const std::string kLiveBlobFileSize;
+
+ // "rocksdb.live-blob-file-garbage-size" - returns the total amount of
+ // garbage in the blob files in the current version.
+ static const std::string kLiveBlobFileGarbageSize;
+
+ // "rocksdb.blob-cache-capacity" - returns blob cache capacity.
+ static const std::string kBlobCacheCapacity;
+
+ // "rocksdb.blob-cache-usage" - returns the memory size for the entries
+ // residing in blob cache.
+ static const std::string kBlobCacheUsage;
+
+ // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
+ // entries being pinned in blob cache.
+ static const std::string kBlobCachePinnedUsage;
+ };
+#endif /* ROCKSDB_LITE */
+
+ // DB implementations export properties about their state via this method.
+ // If "property" is a valid "string" property understood by this DB
+ // implementation (see Properties struct above for valid options), fills
+ // "*value" with its current value and returns true. Otherwise, returns
+ // false.
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) = 0;
+ virtual bool GetProperty(const Slice& property, std::string* value) {
+ return GetProperty(DefaultColumnFamily(), property, value);
+ }
+
+ // Like GetProperty but for valid "map" properties. (Some properties can be
+ // accessed as either "string" properties or "map" properties.)
+ virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) = 0;
+ virtual bool GetMapProperty(const Slice& property,
+ std::map<std::string, std::string>* value) {
+ return GetMapProperty(DefaultColumnFamily(), property, value);
+ }
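+
+ // EXAMPLE (illustrative sketch):
+ //   std::string stats;
+ //   if (db->GetProperty(DB::Properties::kStats, &stats)) {
+ //     // `stats` holds a multi-line, human-readable report.
+ //   }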
+
+ // Similar to GetProperty(), but only works for a subset of properties whose
+ // return value is an integer. Return the value by integer. Supported
+ // properties:
+ // "rocksdb.num-immutable-mem-table"
+ // "rocksdb.mem-table-flush-pending"
+ // "rocksdb.compaction-pending"
+ // "rocksdb.background-errors"
+ // "rocksdb.cur-size-active-mem-table"
+ // "rocksdb.cur-size-all-mem-tables"
+ // "rocksdb.size-all-mem-tables"
+ // "rocksdb.num-entries-active-mem-table"
+ // "rocksdb.num-entries-imm-mem-tables"
+ // "rocksdb.num-deletes-active-mem-table"
+ // "rocksdb.num-deletes-imm-mem-tables"
+ // "rocksdb.estimate-num-keys"
+ // "rocksdb.estimate-table-readers-mem"
+ // "rocksdb.is-file-deletions-enabled"
+ // "rocksdb.num-snapshots"
+ // "rocksdb.oldest-snapshot-time"
+ // "rocksdb.num-live-versions"
+ // "rocksdb.current-super-version-number"
+ // "rocksdb.estimate-live-data-size"
+ // "rocksdb.min-log-number-to-keep"
+ // "rocksdb.min-obsolete-sst-number-to-keep"
+ // "rocksdb.total-sst-files-size"
+ // "rocksdb.live-sst-files-size"
+ // "rocksdb.base-level"
+ // "rocksdb.estimate-pending-compaction-bytes"
+ // "rocksdb.num-running-compactions"
+ // "rocksdb.num-running-flushes"
+ // "rocksdb.actual-delayed-write-rate"
+ // "rocksdb.is-write-stopped"
+ // "rocksdb.estimate-oldest-key-time"
+ // "rocksdb.block-cache-capacity"
+ // "rocksdb.block-cache-usage"
+ // "rocksdb.block-cache-pinned-usage"
+ //
+ // Properties dedicated for BlobDB:
+ // "rocksdb.num-blob-files"
+ // "rocksdb.total-blob-file-size"
+ // "rocksdb.live-blob-file-size"
+ // "rocksdb.blob-cache-capacity"
+ // "rocksdb.blob-cache-usage"
+ // "rocksdb.blob-cache-pinned-usage"
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) = 0;
+ virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+ return GetIntProperty(DefaultColumnFamily(), property, value);
+ }
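+
+ // EXAMPLE (illustrative sketch):
+ //   uint64_t num_keys = 0;
+ //   if (db->GetIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
+ //     // `num_keys` is an estimate of the total number of keys.
+ //   }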
+
+ // Reset internal stats for DB and all column families.
+ // Note this doesn't reset options.statistics as it is not owned by
+ // DB.
+ virtual Status ResetStats() {
+ return Status::NotSupported("Not implemented");
+ }
+
+ // Same as GetIntProperty(), but this one returns the aggregated int
+ // property from all column families.
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* value) = 0;
+
+ // Flags for DB::GetSizeApproximation that specify whether memtable
+ // stats should be included, or file stats approximation or both
+ enum class SizeApproximationFlags : uint8_t {
+ NONE = 0,
+ INCLUDE_MEMTABLES = 1 << 0,
+ INCLUDE_FILES = 1 << 1
+ };
+
+ // For each i in [0,n-1], store in "sizes[i]" the approximate
+ // file system space used by keys in "[range[i].start .. range[i].limit)"
+ // in a single column family.
+ //
+ // Note that the returned sizes measure file system space usage, so
+ // if the user data compresses by a factor of ten, the returned
+ // sizes will be one-tenth the size of the corresponding user data size.
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* ranges, int n,
+ uint64_t* sizes) = 0;
+
+ // Simpler versions of the GetApproximateSizes() method above.
+ // The include_flags argument must be of type DB::SizeApproximationFlags
+ // and cannot be NONE.
+ virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family,
+ const Range* ranges, int n,
+ uint64_t* sizes,
+ SizeApproximationFlags include_flags =
+ SizeApproximationFlags::INCLUDE_FILES);
+
+ virtual Status GetApproximateSizes(
+ const Range* ranges, int n, uint64_t* sizes,
+ SizeApproximationFlags include_flags =
+ SizeApproximationFlags::INCLUDE_FILES) {
+ return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes,
+ include_flags);
+ }
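+
+ // EXAMPLE (illustrative sketch; the key range is an assumption):
+ //   Range r("a", "m");
+ //   uint64_t size = 0;
+ //   Status s = db->GetApproximateSizes(
+ //       &r, 1, &size,
+ //       DB::SizeApproximationFlags::INCLUDE_FILES |
+ //           DB::SizeApproximationFlags::INCLUDE_MEMTABLES);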
+
+ // The method is similar to GetApproximateSizes, except it
+ // returns approximate number of records in memtables.
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) = 0;
+ virtual void GetApproximateMemTableStats(const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
+ }
+
+ // Compact the underlying storage for the key range [*begin,*end].
+ // The actual compaction interval might be a superset of [*begin, *end].
+ // In particular, deleted and overwritten versions are discarded,
+ // and the data is rearranged to reduce the cost of operations
+ // needed to access the data. This operation should typically only
+ // be invoked by users who understand the underlying implementation.
+ // This call blocks until the operation completes successfully, fails,
+ // or is aborted (Status::Incomplete). See DisableManualCompaction.
+ //
+ // begin==nullptr is treated as a key before all keys in the database.
+ // end==nullptr is treated as a key after all keys in the database.
+ // Therefore the following call will compact the entire database:
+ // db->CompactRange(options, nullptr, nullptr);
+ // Note that after the entire database is compacted, all data is pushed
+ // down to the last level containing any data. If the total data size after
+ // compaction is reduced, that level might not be appropriate for hosting all
+ // the files. In this case, the client can set options.change_level to true to
+ // move the files back to the minimum level capable of holding the data set,
+ // or to a given level (specified by non-negative options.target_level).
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) = 0;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ const Slice* begin, const Slice* end) {
+ return CompactRange(options, DefaultColumnFamily(), begin, end);
+ }
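+
+ // EXAMPLE (illustrative sketch; compact the entire default column family):
+ //   CompactRangeOptions cro;
+ //   cro.change_level = true;  // allow moving files back to a lower level
+ //   Status s = db->CompactRange(cro, nullptr, nullptr);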
+
+ // Dynamically change column family options or table factory options in a
+ // running DB, for the specified column family. Only options internally
+ // marked as "mutable" can be changed. Options not listed in `opts_map` will
+ // keep their current values. See GetColumnFamilyOptionsFromMap() in
+ // convenience.h for the details of `opts_map`. Not supported in LITE mode.
+ //
+ // USABILITY NOTE: SetOptions is intended only for expert users, and does
+ // not apply the same sanitization to options as the standard DB::Open code
+ // path does. Use with caution.
+ //
+ // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for
+ // reliability, and this is a slow call because a new OPTIONS file is
+ // serialized and persisted for each call. Use only infrequently.
+ //
+ // EXAMPLES:
+ // s = db->SetOptions(cfh, {{"ttl", "36000"}});
+ // s = db->SetOptions(cfh, {{"block_based_table_factory",
+ // "{prepopulate_block_cache=kDisable;}"}});
+ virtual Status SetOptions(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::unordered_map<std::string, std::string>& /*opts_map*/) {
+ return Status::NotSupported("Not implemented");
+ }
+ // Shortcut for SetOptions on the default column family handle.
+ virtual Status SetOptions(
+ const std::unordered_map<std::string, std::string>& new_options) {
+ return SetOptions(DefaultColumnFamily(), new_options);
+ }
+
+ // Like SetOptions but for DBOptions, including the same caveats for
+ // usability, reliability, and performance. See GetDBOptionsFromMap() (and
+ // GetColumnFamilyOptionsFromMap()) in convenience.h for details on
+ // `opts_map`. Not supported in LITE mode.
+ //
+ // EXAMPLES:
+ // s = db->SetDBOptions({{"max_subcompactions", "2"}});
+ // s = db->SetDBOptions({{"stats_dump_period_sec", "0"},
+ // {"stats_persist_period_sec", "0"}});
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& new_options) = 0;
+
+ // CompactFiles() inputs a list of files specified by file numbers and
+ // compacts them to the specified level. A small difference compared to
+ // CompactRange() is that CompactFiles() performs the compaction job
+ // using the CURRENT thread, so it is not considered a "background" job.
+ //
+ // @see GetDataBaseMetaData
+ // @see GetColumnFamilyMetaData
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) = 0;
+
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) {
+ return CompactFiles(compact_options, DefaultColumnFamily(),
+ input_file_names, output_level, output_path_id,
+ output_file_names, compaction_job_info);
+ }
+
+ // This function will wait until all currently running background processes
+ // finish. After it returns, no background process will be run until
+ // ContinueBackgroundWork is called, once for each preceding OK-returning
+ // call to PauseBackgroundWork.
+ virtual Status PauseBackgroundWork() = 0;
+ virtual Status ContinueBackgroundWork() = 0;
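+
+ // EXAMPLE (illustrative sketch; bracket work that must not race with
+ // flushes or compactions):
+ //   if (db->PauseBackgroundWork().ok()) {
+ //     // ... background-sensitive work ...
+ //     db->ContinueBackgroundWork();
+ //   }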
+
+ // This function will enable automatic compactions for the given column
+ // families if they were previously disabled. The function will first set the
+ // disable_auto_compactions option for each column family to 'false', after
+ // which it will schedule a flush/compaction.
+ //
+ // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
+ // does NOT schedule a flush/compaction afterwards, and only changes the
+ // parameter itself within the column family option.
+ //
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
+
+ // After this function call, CompactRange() or CompactFiles() will not
+ // run compactions and fail. Calling this function will tell outstanding
+ // manual compactions to abort and will wait for them to finish or abort
+ // before returning.
+ virtual void DisableManualCompaction() = 0;
+ // Re-enable CompactRange() and CompactFiles() that are disabled by
+ // DisableManualCompaction(). This function must be called as many times
+ // as DisableManualCompaction() has been called in order to re-enable
+ // manual compactions, and must not be called more times than
+ // DisableManualCompaction() has been called.
+ virtual void EnableManualCompaction() = 0;
+
+ // Number of levels used for this DB.
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+ virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+ // Maximum level to which a new compacted memtable is pushed if it
+ // does not create overlap.
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+ virtual int MaxMemCompactionLevel() {
+ return MaxMemCompactionLevel(DefaultColumnFamily());
+ }
+
+ // Number of files in level-0 that would stop writes.
+ virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+ virtual int Level0StopWriteTrigger() {
+ return Level0StopWriteTrigger(DefaultColumnFamily());
+ }
+
+ // Get DB name -- the exact same name that was provided as an argument to
+ // DB::Open()
+ virtual const std::string& GetName() const = 0;
+
+ // Get Env object from the DB
+ virtual Env* GetEnv() const = 0;
+
+ // A shortcut for GetEnv()->GetFileSystem().get(), possibly cached for
+ // efficiency.
+ virtual FileSystem* GetFileSystem() const;
+
+ // Get DB Options that we use. During the process of opening the
+ // column family, the options provided when calling DB::Open() or
+ // DB::CreateColumnFamily() will have been "sanitized" and transformed
+ // in an implementation-defined manner.
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
+ virtual Options GetOptions() const {
+ return GetOptions(DefaultColumnFamily());
+ }
+
+ virtual DBOptions GetDBOptions() const = 0;
+
+ // Flush all mem-table data.
+ // Flush a single column family, even when atomic flush is enabled. To flush
+ // multiple column families, use Flush(options, column_families).
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+ virtual Status Flush(const FlushOptions& options) {
+ return Flush(options, DefaultColumnFamily());
+ }
+ // Flushes multiple column families.
+ // If atomic flush is not enabled, Flush(options, column_families) is
+ // equivalent to calling Flush(options, column_family) multiple times.
+ // If atomic flush is enabled, Flush(options, column_families) will flush all
+ // column families specified in 'column_families' up to the latest sequence
+ // number at the time when flush is requested.
+ // Note that RocksDB 5.15 and earlier may not be able to open a DB written
+ // by later versions with atomic flush enabled.
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) = 0;
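+
+ // EXAMPLE (illustrative sketch; blocking flush of the default column
+ // family):
+ //   FlushOptions fo;
+ //   fo.wait = true;  // block until the memtable data is persisted
+ //   Status s = db->Flush(fo);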
+
+ // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
+ // afterwards.
+ virtual Status FlushWAL(bool /*sync*/) {
+ return Status::NotSupported("FlushWAL not implemented");
+ }
+ // Sync the WAL. Note that Write() followed by SyncWAL() is not exactly the
+ // same as Write() with sync=true: in the latter case the changes won't be
+ // visible until the sync is done.
+ // Currently only works if allow_mmap_writes = false in Options.
+ virtual Status SyncWAL() = 0;
+
+ // Lock the WAL. Also flushes the WAL after locking.
+ virtual Status LockWAL() {
+ return Status::NotSupported("LockWAL not implemented");
+ }
+
+ // Unlock the WAL.
+ virtual Status UnlockWAL() {
+ return Status::NotSupported("UnlockWAL not implemented");
+ }
+
+ // The sequence number of the most recent transaction.
+ virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+ // Prevent file deletions. Compactions will continue to occur,
+ // but no obsolete files will be deleted. Calling this multiple
+ // times has the same effect as calling it once.
+ virtual Status DisableFileDeletions() = 0;
+
+ // Increase the full_history_ts_low of the column family. The new ts_low
+ // value should be newer than the current full_history_ts_low value.
+ // If another thread concurrently updates full_history_ts_low to a higher
+ // timestamp than the requested ts_low, a TryAgain error will be returned.
+ virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) = 0;
+
+ // Get the current full_history_ts_low value.
+ virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) = 0;
+
+ // Allow compactions to delete obsolete files.
+ // If force == true, the call to EnableFileDeletions() will guarantee that
+ // file deletions are enabled after the call, even if DisableFileDeletions()
+ // was called multiple times before.
+ // If force == false, EnableFileDeletions will only enable file deletion
+ // after it's been called at least as many times as DisableFileDeletions(),
+ // enabling the two methods to be called by two threads concurrently without
+ // synchronization -- i.e., file deletions will be enabled only after both
+ // threads call EnableFileDeletions()
+ virtual Status EnableFileDeletions(bool force = true) = 0;
+
+#ifndef ROCKSDB_LITE
+ // Retrieves the creation time of the oldest file in the DB.
+ // This API only works if max_open_files = -1; if it is not, the returned
+ // Status is Status::NotSupported().
+ // The file creation time is set using the env provided to the DB.
+ // If the DB was created by a very old release, then it's possible that
+ // the SST files might not have the file_creation_time property, and even
+ // after moving to a newer release, it's possible that some files never got
+ // compacted and may not have the file_creation_time property. In both
+ // cases, file_creation_time is considered 0, which means this API will
+ // return creation_time = 0, as there wouldn't be a timestamp lower than 0.
+ virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
+
+ // Note: this API is not yet consistent with WritePrepared transactions.
+ //
+ // Sets iter to an iterator that is positioned at a write-batch whose
+ // sequence number range [start_seq, end_seq] covers seq_number. If no such
+ // write-batch exists, then iter is positioned at the next write-batch whose
+ // start_seq > seq_number.
+ //
+ // Returns Status::OK if the iterator is valid.
+ // WAL_ttl_seconds or WAL_size_limit_MB must be set to large values to
+ // use this API; otherwise, the WAL files will get cleared aggressively and
+ // the iterator might keep becoming invalid before an update is read.
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) = 0;
+
+// Windows API macro interference
+#undef DeleteFile
+ // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+ // operate at the proper level of abstraction for a key-value store, and its
+ // contract/restrictions are poorly documented. For example, it returns non-OK
+ // `Status` for non-bottommost files and files undergoing compaction. Since we
+ // do not plan to maintain it, the contract will likely remain underspecified
+ // until its removal. Any user is encouraged to read the implementation
+ // carefully and migrate away from it when possible.
+ //
+ // Delete the file name from the db directory and update the internal state to
+ // reflect that. Supports deletion of SST and log files only. 'name' must be
+ // a path relative to the db directory, e.g., 000001.sst, /archive/000003.log.
+ virtual Status DeleteFile(std::string name) = 0;
+
+ // Obtains a list of all live table (SST) files and how they fit into the
+ // LSM-trees, such as column family, level, key range, etc.
+ // This builds a de-normalized form of GetAllColumnFamilyMetaData().
+ // For information about all files in a DB, use GetLiveFilesStorageInfo().
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* /*metadata*/) {}
+
+ // Return a list of all table (SST) and blob files checksum info.
+ // Note: This function might be of limited use because it cannot be
+ // synchronized with other "live files" APIs. GetLiveFilesStorageInfo()
+ // is recommended instead.
+ virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0;
+
+ // Get information about all live files that make up a DB, for making
+ // live copies (Checkpoint, backups, etc.) or other storage-related purposes.
+ // If creating a live copy, use DisableFileDeletions() before and
+ // EnableFileDeletions() after to prevent deletions.
+ // For LSM-tree metadata, use Get*MetaData() functions instead.
+ virtual Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) = 0;
+
+ // Obtains the LSM-tree meta data of the specified column family of the DB,
+ // including metadata for each live table (SST) file in that column family.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) {}
+
+ // Get the metadata of the default column family.
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
+ GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+ }
+
+ // Obtains the LSM-tree meta data of all column families of the DB, including
+ // metadata for each live table (SST) file and each blob file in the DB.
+ virtual void GetAllColumnFamilyMetaData(
+ std::vector<ColumnFamilyMetaData>* /*metadata*/) {}
+
+ // Retrieve the list of all files in the database except WAL files. The files
+ // are relative to the dbname (or db_paths/cf_paths), not absolute paths.
+ // (Not recommended with db_paths/cf_paths because that information is not
+ // returned.) Despite being relative paths, the file names begin with "/".
+ // The valid size of the manifest file is returned in manifest_file_size.
+ // The manifest file is an ever growing file, but only the portion specified
+ // by manifest_file_size is valid for this snapshot. Setting flush_memtable
+ // to true does Flush before recording the live files (unless DB is
+ // read-only). Setting flush_memtable to false is useful when we don't want
+ // to wait for a flush, which may have to wait for a compaction to
+ // complete, taking an indeterminate amount of time.
+ //
+ // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
+ // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended
+ // instead, because it ensures a single consistent view of all files is
+ // captured in one call.
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) = 0;
+
+ // Retrieve the sorted list of all WAL files, with the earliest file first.
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+ // Retrieve information about the current WAL file.
+ //
+ // Note that the log might have rolled after this call, in which case
+ // current_log_file would not point to the current log file.
+ //
+ // Additionally, for the sake of optimization, current_log_file->StartSequence
+ // is always set to 0.
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) = 0;
+
+ // IngestExternalFile() will load a list of external SST files (1) into the
+ // DB. Two primary modes are supported:
+ // - Duplicate keys in the new files will overwrite existing keys (default)
+ // - Duplicate keys will be skipped (set ingest_behind=true)
+ // In the first mode we will try to find the lowest possible level that
+ // the file can fit in, and ingest the file into this level (2). A file that
+ // has a key range that overlaps with the memtable key range will require us
+ // to flush the memtable first before ingesting the file.
+ // In the second mode we will always ingest into the bottommost level (see
+ // docs for IngestExternalFileOptions::ingest_behind).
+ //
+ // (1) External SST files can be created using SstFileWriter
+ // (2) We will try to ingest the files to the lowest possible level
+ // even if the file compression doesn't match the level compression
+ // (3) If IngestExternalFileOptions->ingest_behind is set to true,
+ // we always ingest at the bottommost level, which should be reserved
+ // for this purpose (see DBOptions::allow_ingest_behind flag).
+ // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
+ // true, then this method can return Status::TryAgain() indicating that
+ // the files cannot be ingested to the bottommost level, and it is the
+ // user's responsibility to clear the bottommost level in the overlapping
+ // range before re-attempting the ingestion.
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) = 0;
+
+ virtual Status IngestExternalFile(
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) {
+ return IngestExternalFile(DefaultColumnFamily(), external_files, options);
+ }
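+
+ // EXAMPLE (illustrative sketch; "file1.sst" stands for a file previously
+ // created with SstFileWriter):
+ //   IngestExternalFileOptions ifo;
+ //   ifo.move_files = true;  // move rather than copy, when possible
+ //   Status s = db->IngestExternalFile({"file1.sst"}, ifo);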
+
+ // IngestExternalFiles() will ingest files for multiple column families, and
+ // record the result atomically to the MANIFEST.
+ // If this function returns OK, all column families' ingestion must succeed.
+ // If this function returns a non-OK status, or the process crashes, then
+ // none of the files will be ingested into the database after recovery.
+ // Note that it is possible for the application to observe a mixed state
+ // during the execution of this function. If the user performs a range scan
+ // over the column families with iterators, an iterator on one column family
+ // may return ingested data, while an iterator on another column family
+ // returns old data. Users can use a snapshot for a consistent view of data.
+ // If your db ingests multiple SST files using this API, i.e. args.size()
+ // > 1, then RocksDB 5.15 and earlier will not be able to open it.
+ //
+ // REQUIRES: each arg corresponds to a different column family: namely, for
+ // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) = 0;
+
+ // CreateColumnFamilyWithImport() will create a new column family with
+ // column_family_name and import external SST files specified in metadata into
+ // this column family.
+ // (1) External SST files can be created using SstFileWriter.
+ // (2) External SST files can be exported from a particular column family in
+ // an existing DB using Checkpoint::ExportColumnFamily.
+ // The option in import_options specifies whether the external files are
+ // copied or moved (default is copy). When the option specifies copy,
+ // managing the files at external_file_path is the caller's responsibility.
+ // When the option specifies a move, the call makes a best effort to delete
+ // the specified files at external_file_path on successful return, logging
+ // any failure to delete rather than returning it in Status. Files are not
+ // modified on any error return, and a best effort is made to remove any
+ // newly-created files.
+ // On error return, the returned column family handle will be nullptr.
+ // The column family will be present on successful return and absent on
+ // error return; it may be present after a crash during this call.
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) = 0;
+
+ // Verify the checksums of files in db. Currently the whole-file checksum of
+ // table files is checked.
+ virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
+ return Status::NotSupported("File verification not supported");
+ }
+
+ // Verify the block checksums of files in db. The block checksums of table
+ // files are checked.
+ virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
+
+ virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+#endif // ROCKSDB_LITE
+
+ // Returns the unique ID, which is read from the IDENTITY file during the
+ // opening of the database, by setting it in the identity argument.
+ // Returns Status::OK if the identity could be set properly.
+ virtual Status GetDbIdentity(std::string& identity) const = 0;
+
+ // Return a unique identifier for each DB object that is opened
+ // This DB session ID should be unique among all open DB instances on all
+ // hosts, and should be unique among re-openings of the same or other DBs.
+ // (Two open DBs can have the same identity from the other function,
+ // GetDbIdentity, when one is physically copied from the other.)
+ virtual Status GetDbSessionId(std::string& session_id) const = 0;
+
+ // Returns default column family handle
+ virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+ virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) = 0;
+ virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+ }
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) = 0;
+
+ virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) {
+ return Status::NotSupported("SuggestCompactRange() is not implemented.");
+ }
+
+ virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("PromoteL0() is not implemented.");
+ }
+
+ // Trace DB operations. Use EndTrace() to stop tracing.
+ virtual Status StartTrace(const TraceOptions& /*options*/,
+ std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartTrace() is not implemented.");
+ }
+
+ virtual Status EndTrace() {
+ return Status::NotSupported("EndTrace() is not implemented.");
+ }
+
+ // IO Tracing operations. Use EndIOTrace() to stop tracing.
+ virtual Status StartIOTrace(const TraceOptions& /*options*/,
+ std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartIOTrace() is not implemented.");
+ }
+
+ virtual Status EndIOTrace() {
+ return Status::NotSupported("EndIOTrace() is not implemented.");
+ }
+
+ // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
+ virtual Status StartBlockCacheTrace(
+ const TraceOptions& /*trace_options*/,
+ std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+ }
+
+ virtual Status StartBlockCacheTrace(
+ const BlockCacheTraceOptions& /*options*/,
+ std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+ }
+
+ virtual Status EndBlockCacheTrace() {
+ return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
+ }
+
+ // Create a default trace replayer.
+ virtual Status NewDefaultReplayer(
+ const std::vector<ColumnFamilyHandle*>& /*handles*/,
+ std::unique_ptr<TraceReader>&& /*reader*/,
+ std::unique_ptr<Replayer>* /*replayer*/) {
+ return Status::NotSupported("NewDefaultReplayer() is not implemented.");
+ }
+
+#endif // ROCKSDB_LITE
+
+ // Needed for StackableDB
+ virtual DB* GetRootDB() { return this; }
+
+ // Given a window [start_time, end_time), set up a StatsHistoryIterator
+ // to access stats history. Note the start_time and end_time are epoch
+ // time measured in seconds, and end_time is an exclusive bound.
+ virtual Status GetStatsHistory(
+ uint64_t /*start_time*/, uint64_t /*end_time*/,
+ std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
+ return Status::NotSupported("GetStatsHistory() is not implemented.");
+ }
+
+#ifndef ROCKSDB_LITE
+ // Make the secondary instance catch up with the primary by tailing and
+ // replaying the MANIFEST and WAL of the primary.
+ // Column families created by the primary after the secondary instance starts
+ // will be ignored unless the secondary instance closes and restarts with the
+ // newly created column families.
+ // Column families that exist before secondary instance starts and dropped by
+ // the primary afterwards will be marked as dropped. However, as long as the
+ // secondary instance does not delete the corresponding column family
+ // handles, the data of the column family is still accessible to the
+ // secondary.
+ virtual Status TryCatchUpWithPrimary() {
+ return Status::NotSupported("Supported only by secondary instance");
+ }
+#endif // !ROCKSDB_LITE
+};
+
+// Overloaded operators for enum class SizeApproximationFlags.
+inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
+ DB::SizeApproximationFlags rhs) {
+ return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) &
+ static_cast<uint8_t>(rhs));
+}
+inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs,
+ DB::SizeApproximationFlags rhs) {
+ return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) |
+ static_cast<uint8_t>(rhs));
+}
+
+inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
+ const Range* ranges, int n,
+ uint64_t* sizes,
+ SizeApproximationFlags include_flags) {
+ SizeApproximationOptions options;
+ options.include_memtables =
+ ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
+ SizeApproximationFlags::NONE);
+ options.include_files =
+ ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
+ SizeApproximationFlags::NONE);
+ return GetApproximateSizes(options, column_family, ranges, n, sizes);
+}
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>());
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+//
+// With this API, we will warn and skip data associated with column families not
+// specified in column_families.
+//
+// @param column_families Descriptors for known column families
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+// @param unknown_cf_opts Options for column families encountered during the
+// repair that were not specified in column_families.
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts);
+
+// @param options These options will be used for the database and for ALL column
+// families encountered during the repair
+Status RepairDB(const std::string& dbname, const Options& options);
+
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h
new file mode 100644
index 000000000..17f4e6bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_bench_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_bench_tool(int argc, char** argv);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h
new file mode 100644
index 000000000..b7d4766a2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct DumpOptions {
+ // Database that will be dumped
+ std::string db_path;
+ // File location that will contain dump output
+ std::string dump_location;
+ // Don't include db information header in the dump
+ bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+ bool Run(const DumpOptions& dump_options,
+ ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+
+struct UndumpOptions {
+ // Database that we will load the dumped file into
+ std::string db_path;
+ // File location of the dumped file that will be loaded
+ std::string dump_location;
+ // Compact the db after loading the dumped file
+ bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+ bool Run(const UndumpOptions& undump_options,
+ ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h
new file mode 100644
index 000000000..7d3d42c9d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_stress_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_stress_tool(int argc, char** argv);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
new file mode 100644
index 000000000..bef60a212
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -0,0 +1,1893 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/functor_wrapper.h"
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#undef LoadLibrary
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
+ __attribute__((__format__(__printf__, format_param, dots_param)))
+#else
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DynamicLibrary;
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+struct DataVerificationInfo;
+class WritableFile;
+class RandomRWFile;
+class MemoryMappedFileBuffer;
+class Directory;
+struct DBOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+class ThreadStatusUpdater;
+struct ThreadStatus;
+class FileSystem;
+class SystemClock;
+struct ConfigOptions;
+
+const size_t kDefaultPageSize = 4 * 1024;
+
+enum class CpuPriority {
+ kIdle = 0,
+ kLow = 1,
+ kNormal = 2,
+ kHigh = 3,
+};
+
+// Options while opening a file to read/write
+struct EnvOptions {
+ // Construct with default Options
+ EnvOptions();
+
+ // Construct from Options
+ explicit EnvOptions(const DBOptions& options);
+
+ // If true, then use mmap to read data.
+ // Not recommended for 32-bit OS.
+ bool use_mmap_reads = false;
+
+ // If true, then use mmap to write data
+ bool use_mmap_writes = true;
+
+ // If true, then use O_DIRECT for reading data
+ bool use_direct_reads = false;
+
+ // If true, then use O_DIRECT for writing data
+ bool use_direct_writes = false;
+
+ // If false, fallocate() calls are bypassed
+ bool allow_fallocate = true;
+
+ // If true, set the FD_CLOEXEC on open fd.
+ bool set_fd_cloexec = true;
+
+ // Allows OS to incrementally sync files to disk while they are being
+ // written, in the background. Issue one request for every bytes_per_sync
+ // written. 0 turns it off.
+ // Default: 0
+ uint64_t bytes_per_sync = 0;
+
+ // When true, guarantees the file has at most `bytes_per_sync` bytes submitted
+ // for writeback at any given time.
+ //
+ // - If `sync_file_range` is supported it achieves this by waiting for any
+ // prior `sync_file_range`s to finish before proceeding. In this way,
+ // processing (compression, etc.) can proceed uninhibited in the gap
+ // between `sync_file_range`s, and we block only when I/O falls behind.
+ // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+ // always blocks, thus preventing the interleaving of I/O and processing.
+ //
+ // Note: Enabling this option does not provide any additional persistence
+ // guarantees, as it may use `sync_file_range`, which does not write out
+ // metadata.
+ //
+ // Default: false
+ bool strict_bytes_per_sync = false;
+
+ // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+ // means that file size won't change as part of preallocation.
+ // If false, preallocation will also change the file size. This option will
+ // improve the performance in workloads where you sync the data on every
+ // write. By default, we set it to true for MANIFEST writes and false for
+ // WAL writes
+ bool fallocate_with_keep_size = true;
+
+ // See DBOptions doc
+ size_t compaction_readahead_size = 0;
+
+ // See DBOptions doc
+ size_t random_access_max_buffer_size = 0;
+
+ // See DBOptions doc
+ size_t writable_file_max_buffer_size = 1024 * 1024;
+
+ // If not nullptr, write rate limiting is enabled for flush and compaction
+ RateLimiter* rate_limiter = nullptr;
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Env : public Customizable {
+ public:
+ static const char* kDefaultName() { return "DefaultEnv"; }
+ struct FileAttributes {
+ // File name
+ std::string name;
+
+ // Size of file in bytes
+ uint64_t size_bytes;
+ };
+
+ Env();
+ // Construct an Env with a separate FileSystem and/or SystemClock
+ // implementation
+ explicit Env(const std::shared_ptr<FileSystem>& fs);
+ Env(const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& clock);
+ // No copying allowed
+ Env(const Env&) = delete;
+ void operator=(const Env&) = delete;
+
+ ~Env() override;
+
+ static const char* Type() { return "Environment"; }
+
+ // Deprecated. Will be removed in a major release. Derived classes
+ // should implement this method.
+ const char* Name() const override { return ""; }
+
+ // Loads the environment specified by the input value into the result
+ // The CreateFromString alternative should be used; this method may be
+ // deprecated in a future release.
+ static Status LoadEnv(const std::string& value, Env** result);
+
+ // Loads the environment specified by the input value into the result
+ // The CreateFromString alternative should be used; this method may be
+ // deprecated in a future release.
+ static Status LoadEnv(const std::string& value, Env** result,
+ std::shared_ptr<Env>* guard);
+
+ // Loads the environment specified by the input value into the result
+ // @see Customizable for a more detailed description of the parameters and
+ // return codes
+ //
+ // @param config_options Controls how the environment is loaded.
+ // @param value the name and associated properties for the environment.
+ // @param result On success, the environment that was loaded.
+ // @param guard If specified and the loaded environment is not static,
+ // this value will contain the loaded environment (guard.get() ==
+ // result).
+ // @return OK If the environment was successfully loaded (and optionally
+ // prepared)
+ // @return not-OK if the load failed.
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value, Env** result);
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value, Env** result,
+ std::shared_ptr<Env>* guard);
+
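+  // A loading sketch (illustrative only): "DefaultEnv" is the registered name
+  // of the built-in Env; a custom Env must be registered (or statically
+  // known) to be loadable by name.
+  //
+  //   ConfigOptions config_options;
+  //   Env* env = nullptr;
+  //   std::shared_ptr<Env> guard;
+  //   Status s =
+  //       Env::CreateFromString(config_options, "DefaultEnv", &env, &guard);
+  //   // On success `env` is usable; `guard` owns it only if it is not static.
+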
+ // Loads the environment specified by the env and fs uri.
+ // If both are specified, an error is returned.
+ // Otherwise, the environment is created by loading (via CreateFromString)
+ // the appropriate env/fs from the corresponding values.
+ static Status CreateFromUri(const ConfigOptions& options,
+ const std::string& env_uri,
+ const std::string& fs_uri, Env** result,
+ std::shared_ptr<Env>* guard);
+
+ // Return a default environment suitable for the current operating
+ // system. Sophisticated users may wish to provide their own Env
+ // implementation instead of relying on this default environment.
+ //
+ // The result of Default() belongs to rocksdb and must never be deleted.
+ static Env* Default();
+
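+  // Typical use (illustrative sketch; the path is a placeholder): install an
+  // Env via DBOptions::env when opening a database.
+  //
+  //   Options options;
+  //   options.env = Env::Default();  // or a custom wrapper around it
+  //   DB* db = nullptr;
+  //   Status s = DB::Open(options, "/tmp/testdb", &db);
+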
+ // See FileSystem::RegisterDbPaths.
+ virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+ return Status::OK();
+ }
+ // See FileSystem::UnregisterDbPaths.
+ virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+ return Status::OK();
+ }
+
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewSequentialFile(const std::string& fname,
+ std::unique_ptr<SequentialFile>* result,
+ const EnvOptions& options) = 0;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& options) = 0;
+ // These values match Linux definition
+ // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+ enum WriteLifeTimeHint {
+ WLTH_NOT_SET = 0, // No hint information set
+ WLTH_NONE, // No hints about write life time
+ WLTH_SHORT, // Data written has a short life time
+ WLTH_MEDIUM, // Data written has a medium life time
+ WLTH_LONG, // Data written has a long life time
+ WLTH_EXTREME, // Data written has an extremely long life time
+ };
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) = 0;
+
+ // Create an object that writes to a file with the specified name.
+ // `WritableFile::Append()`s will append after any existing content. If the
+ // file does not already exist, creates it.
+ //
+ // On success, stores a pointer to the file in *result and returns OK. On
+ // failure stores nullptr in *result and returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status ReopenWritableFile(const std::string& /*fname*/,
+ std::unique_ptr<WritableFile>* /*result*/,
+ const EnvOptions& /*options*/) {
+ return Status::NotSupported("Env::ReopenWritableFile() not supported.");
+ }
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ virtual Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options);
+
+ // Open `fname` for random read and write, if file doesn't exist the file
+ // will be created. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewRandomRWFile(const std::string& /*fname*/,
+ std::unique_ptr<RandomRWFile>* /*result*/,
+ const EnvOptions& /*options*/) {
+ return Status::NotSupported("RandomRWFile is not implemented in this Env");
+ }
+
+ // Opens `fname` as a memory-mapped file for read and write (in-place updates
+ // only, i.e., no appends). On success, stores a raw buffer covering the whole
+ // file in `*result`. The file must exist prior to this call.
+ virtual Status NewMemoryMappedFileBuffer(
+ const std::string& /*fname*/,
+ std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+ return Status::NotSupported(
+ "MemoryMappedFileBuffer is not implemented in this Env");
+ }
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ virtual Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) = 0;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status FileExists(const std::string& fname) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir", and shall never include the
+ // names `.` or `..`.
+  // Original contents of *result are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) = 0;
+
+ // Store in *result the attributes of the children of the specified directory.
+ // In case the implementation lists the directory prior to iterating the files
+ // and files are concurrently deleted, the deleted files will be omitted from
+ // result.
+ // The name attributes are relative to "dir", and shall never include the
+ // names `.` or `..`.
+  // Original contents of *result are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status GetChildrenFileAttributes(const std::string& dir,
+ std::vector<FileAttributes>* result);
+
+ // Delete the named file.
+ virtual Status DeleteFile(const std::string& fname) = 0;
+
+ // Truncate the named file to the specified size.
+ virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) {
+ return Status::NotSupported("Truncate is not supported for this Env");
+ }
+
+ // Create the specified directory. Returns error if directory exists.
+ virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Returns OK if it exists or was
+  // successfully created.
+ virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+ // Delete the specified directory.
+ // Many implementations of this function will only delete a directory if it is
+ // empty.
+ virtual Status DeleteDir(const std::string& dirname) = 0;
+
+ // Store the size of fname in *file_size.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+ // Store the last modification time of fname in *file_mtime.
+ virtual Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) = 0;
+ // Rename file src to target.
+ virtual Status RenameFile(const std::string& src,
+ const std::string& target) = 0;
+
+ // Hard Link file src to target.
+ virtual Status LinkFile(const std::string& /*src*/,
+ const std::string& /*target*/) {
+ return Status::NotSupported("LinkFile is not supported for this Env");
+ }
+
+ virtual Status NumFileLinks(const std::string& /*fname*/,
+ uint64_t* /*count*/) {
+ return Status::NotSupported(
+ "Getting number of file links is not supported for this Env");
+ }
+
+ virtual Status AreFilesSame(const std::string& /*first*/,
+ const std::string& /*second*/, bool* /*res*/) {
+ return Status::NotSupported("AreFilesSame is not supported for this Env");
+ }
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ virtual Status UnlockFile(FileLock* lock) = 0;
+
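+  // Locking sketch (illustrative only; the path is a placeholder and error
+  // handling is elided):
+  //
+  //   FileLock* lock = nullptr;
+  //   Status s = env->LockFile("/tmp/mydb/LOCK", &lock);
+  //   if (s.ok()) {
+  //     // ... this process has exclusive access to the db ...
+  //     s = env->UnlockFile(lock);
+  //   }
+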
+ // Opens `lib_name` as a dynamic library.
+ // If the 'search_path' is specified, breaks the path into its components
+  // based on the appropriate platform separator (":" or ";") and looks for the
+  // library in those directories. If 'search_path' is not specified, uses the
+ // default library path search mechanism (such as LD_LIBRARY_PATH). On
+ // success, stores a dynamic library in `*result`.
+ virtual Status LoadLibrary(const std::string& /*lib_name*/,
+ const std::string& /*search_path */,
+ std::shared_ptr<DynamicLibrary>* /*result*/) {
+ return Status::NotSupported("LoadLibrary is not implemented in this Env");
+ }
+
+ // Priority for scheduling job in thread pool
+ enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL };
+
+ static std::string PriorityToString(Priority priority);
+
+ // Priority for requesting bytes in rate limiter scheduler
+ enum IOPriority {
+ IO_LOW = 0,
+ IO_MID = 1,
+ IO_HIGH = 2,
+ IO_USER = 3,
+ IO_TOTAL = 4
+ };
+
+ // Arrange to run "(*function)(arg)" once in a background thread, in
+ // the thread pool specified by pri. By default, jobs go to the 'LOW'
+ // priority thread pool.
+  //
+ // "function" may run in an unspecified thread. Multiple functions
+ // added to the same Env may run concurrently in different threads.
+ // I.e., the caller may not assume that background work items are
+ // serialized.
+ // When the UnSchedule function is called, the unschedFunction
+ // registered at the time of Schedule is invoked with arg as a parameter.
+ virtual void Schedule(void (*function)(void* arg), void* arg,
+ Priority pri = LOW, void* tag = nullptr,
+ void (*unschedFunction)(void* arg) = nullptr) = 0;
+
+ // Arrange to remove jobs for given arg from the queue_ if they are not
+ // already scheduled. Caller is expected to have exclusive lock on arg.
+ virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; }
+
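+  // Scheduling sketch (illustrative only; `my_arg` is a placeholder): run a
+  // plain function on the LOW pool, tagged so still-queued copies can be
+  // unscheduled later.
+  //
+  //   static void DoWork(void* arg) { /* ... */ }
+  //   env->Schedule(&DoWork, my_arg, Env::LOW, /*tag=*/my_arg);
+  //   // Later: cancel tagged jobs that have not yet started running.
+  //   int removed = env->UnSchedule(my_arg, Env::LOW);
+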
+ // Start a new thread, invoking "function(arg)" within the new thread.
+ // When "function(arg)" returns, the thread will be destroyed.
+ virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+ // Start a new thread, invoking "function(args...)" within the new thread.
+ // When "function(args...)" returns, the thread will be destroyed.
+ template <typename FunctionT, typename... Args>
+ void StartThreadTyped(FunctionT function, Args&&... args) {
+ using FWType = FunctorWrapper<Args...>;
+ StartThread(
+ [](void* arg) {
+ auto* functor = static_cast<FWType*>(arg);
+ functor->invoke();
+ delete functor;
+ },
+ new FWType(std::function<void(Args...)>(function),
+ std::forward<Args>(args)...));
+ }
+
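+  // Typed-thread sketch (illustrative only): the arguments are wrapped in a
+  // functor that is invoked on the new thread and freed when it returns.
+  //
+  //   void Greet(const std::string& who) { printf("hi %s\n", who.c_str()); }
+  //   env->StartThreadTyped(Greet, std::string("world"));
+  //   env->WaitForJoin();
+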
+ // Wait for all threads started by StartThread to terminate.
+ virtual void WaitForJoin() {}
+
+ // Reserve available background threads in the specified thread pool.
+ virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) {
+ return 0;
+ }
+
+ // Release a specific number of reserved threads from the specified thread
+ // pool
+ virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) {
+ return 0;
+ }
+
+ // Get thread pool queue length for specific thread pool.
+ virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
+ return 0;
+ }
+
+ // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Creates and returns a default logger (an instance of EnvLogger) for storing
+ // informational messages. Derived classes can override to provide custom
+ // logger.
+ virtual Status NewLogger(const std::string& fname,
+ std::shared_ptr<Logger>* result);
+
+ // Returns the number of micro-seconds since some fixed point in time.
+ // It is often used as system time such as in GenericRateLimiter
+ // and other places so a port needs to return system time in order to work.
+ virtual uint64_t NowMicros() = 0;
+
+ // Returns the number of nano-seconds since some fixed point in time. Only
+ // useful for computing deltas of time in one run.
+ // Default implementation simply relies on NowMicros.
+ // In platform-specific implementations, NowNanos() should return time points
+ // that are MONOTONIC.
+ virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+ // 0 indicates not supported.
+ virtual uint64_t NowCPUNanos() { return 0; }
+
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
+ virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name as a null-terminated string iff the string
+  // length is < len. The hostname should otherwise be truncated to len.
+ virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+ // Get the current hostname from the given env as a std::string in result.
+  // The result may be truncated if the hostname is too long.
+ virtual Status GetHostNameString(std::string* result);
+
+ // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+ // Only overwrites *unix_time on success.
+ virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+ // Get full directory name for this db.
+ virtual Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) = 0;
+
+ // The number of background worker threads of a specific thread pool
+ // for this environment. 'LOW' is the default pool.
+  // Default number: 1
+ virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+ virtual int GetBackgroundThreads(Priority pri = LOW) = 0;
+
+ virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) {
+ return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported.");
+ }
+
+ // Enlarge number of background worker threads of a specific thread pool
+ // for this environment if it is smaller than specified. 'LOW' is the default
+ // pool.
+ virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+ // Lower IO priority for threads from the specified pool.
+ virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {}
+
+ // Lower CPU priority for threads from the specified pool.
+ virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/,
+ CpuPriority /*pri*/) {
+ return Status::NotSupported(
+ "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported");
+ }
+
+ // Lower CPU priority for threads from the specified pool.
+ virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {}
+
+ // Converts seconds-since-Jan-01-1970 to a printable string
+ virtual std::string TimeToString(uint64_t time) = 0;
+
+ // Generates a human-readable unique ID that can be used to identify a DB.
+ // In built-in implementations, this is an RFC-4122 UUID string, but might
+ // not be in all implementations. Overriding is not recommended.
+  // NOTE: this has not been validated for use in cryptography
+ virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogRead will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for reading log files.
+ virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const;
+
+ // OptimizeForManifestRead will create a new EnvOptions object that is a copy
+ // of the EnvOptions in the parameters, but is optimized for reading manifest
+ // files.
+ virtual EnvOptions OptimizeForManifestRead(
+ const EnvOptions& env_options) const;
+
+ // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+ // the EnvOptions in the parameters, but is optimized for writing log files.
+ // Default implementation returns the copy of the same object.
+ virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const;
+ // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
+ // of the EnvOptions in the parameters, but is optimized for writing manifest
+ // files. Default implementation returns the copy of the same object.
+ virtual EnvOptions OptimizeForManifestWrite(
+ const EnvOptions& env_options) const;
+
+ // OptimizeForCompactionTableWrite will create a new EnvOptions object that is
+ // a copy of the EnvOptions in the parameters, but is optimized for writing
+ // table files.
+ virtual EnvOptions OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for reading
+  // table files.
+ virtual EnvOptions OptimizeForCompactionTableRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const;
+
+ // OptimizeForBlobFileRead will create a new EnvOptions object that
+ // is a copy of the EnvOptions in the parameters, but is optimized for reading
+ // blob files.
+ virtual EnvOptions OptimizeForBlobFileRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const;
+
+ // Returns the status of all threads that belong to the current Env.
+ virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
+ return Status::NotSupported("Env::GetThreadList() not supported.");
+ }
+
+ // Returns the pointer to ThreadStatusUpdater. This function will be
+ // used in RocksDB internally to update thread status and supports
+ // GetThreadList().
+ virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+ return thread_status_updater_;
+ }
+
+ // Returns the ID of the current thread.
+ virtual uint64_t GetThreadID() const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+
+ // Get the amount of free disk space
+ virtual Status GetFreeSpace(const std::string& /*path*/,
+ uint64_t* /*diskfree*/) {
+ return Status::NotSupported("Env::GetFreeSpace() not supported.");
+ }
+
+ // Check whether the specified path is a directory
+ virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) {
+ return Status::NotSupported("Env::IsDirectory() not supported.");
+ }
+
+ virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {}
+
+ // Get the FileSystem implementation this Env was constructed with. It
+ // could be a fully implemented one, or a wrapper class around the Env
+ const std::shared_ptr<FileSystem>& GetFileSystem() const;
+
+ // Get the SystemClock implementation this Env was constructed with. It
+ // could be a fully implemented one, or a wrapper class around the Env
+ const std::shared_ptr<SystemClock>& GetSystemClock() const;
+
+ // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ protected:
+ // The pointer to an internal structure that will update the
+ // status of each thread.
+ ThreadStatusUpdater* thread_status_updater_;
+
+ // Pointer to the underlying FileSystem implementation
+ std::shared_ptr<FileSystem> file_system_;
+
+ // Pointer to the underlying SystemClock implementation
+ std::shared_ptr<SystemClock> system_clock_;
+
+ private:
+ static const size_t kMaxHostNameLen = 256;
+};
+
+// The factory function to construct a ThreadStatusUpdater. Any Env
+// that supports GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+ SequentialFile() {}
+ virtual ~SequentialFile();
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Skip(uint64_t n) = 0;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct I/O.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported(
+ "SequentialFile::InvalidateCache not supported.");
+ }
+
+ // Positioned Read for direct I/O
+ // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+ virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+ Slice* /*result*/, char* /*scratch*/) {
+ return Status::NotSupported(
+ "SequentialFile::PositionedRead() not supported.");
+ }
+
+ // If you're adding methods here, remember to add them to
+ // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead
+struct ReadRequest {
+ // File offset in bytes
+ uint64_t offset;
+
+ // Length to read in bytes. `result` only returns fewer bytes if end of file
+ // is hit (or `status` is not OK).
+ size_t len;
+
+ // A buffer that MultiRead() can optionally place data in. It can
+ // ignore this and allocate its own buffer
+ char* scratch;
+
+ // Output parameter set by MultiRead() to point to the data buffer, and
+ // the number of valid bytes
+ Slice result;
+
+ // Status of read
+ Status status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+ RandomAccessFile() {}
+ virtual ~RandomAccessFile();
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // Safe for concurrent use by multiple threads.
+ // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+
+ // Readahead the file starting from offset by n bytes for caching.
+ virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) {
+ return Status::OK();
+ }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e. it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the returned Status is not ok, the statuses of the
+  // individual requests will be ignored and the returned status will be
+  // assumed for all of them. The function-level return status is only meant
+  // for errors that occur before any specific read request is processed.
+ virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+ assert(reqs != nullptr);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ ReadRequest& req = reqs[i];
+ req.status = Read(req.offset, req.len, &req.result, req.scratch);
+ }
+ return Status::OK();
+ }
+
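+  // MultiRead sketch (illustrative only; `file`, offsets, and sizes are
+  // placeholders). Each request carries its own scratch buffer, result
+  // slice, and status.
+  //
+  //   char buf0[512];
+  //   char buf1[512];
+  //   ReadRequest reqs[2];
+  //   reqs[0].offset = 0;
+  //   reqs[0].len = sizeof(buf0);
+  //   reqs[0].scratch = buf0;
+  //   reqs[1].offset = 4096;
+  //   reqs[1].len = sizeof(buf1);
+  //   reqs[1].scratch = buf1;
+  //   Status s = file->MultiRead(reqs, 2);
+  //   // If s.ok(), check reqs[i].status before consuming reqs[i].result.
+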
+  // Tries to get a unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+ // This function guarantees, for IDs from a given environment, two unique ids
+ // cannot be made equal to each other by adding arbitrary bytes to one of
+ // them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0; // Default implementation to prevent issues with backwards
+ // compatibility.
+ }
+
+ enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+ virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct I/O.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported(
+ "RandomAccessFile::InvalidateCache not supported.");
+ }
+
+ // If you're adding methods here, remember to add them to
+ // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+ WritableFile()
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(false) {}
+
+ explicit WritableFile(const EnvOptions& options)
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+ // No copying allowed
+ WritableFile(const WritableFile&) = delete;
+ void operator=(const WritableFile&) = delete;
+
+ virtual ~WritableFile();
+
+ // Append data to the end of the file
+ // Note: A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ virtual Status Append(const Slice& data) = 0;
+
+ // Append data with verification information.
+ // Note that this API change is experimental and it might be changed in
+ // the future. Currently, RocksDB only generates crc32c based checksum for
+ // the file writes when the checksum handoff option is set.
+ // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+ // WritableFile, the information in DataVerificationInfo can be ignored
+ // (i.e. does not perform checksum verification).
+ virtual Status Append(const Slice& data,
+ const DataVerificationInfo& /* verification_info */) {
+ return Append(data);
+ }
+
+ // PositionedAppend data to the specified offset. The new EOF after append
+ // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence have to always start from the start of
+ // the sector. The implementation thus needs to also rewrite the last
+ // partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after the
+ // write. A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ //
+ // PositionedAppend() can only happen on the page/sector boundaries. For that
+ // reason, if the last write was an incomplete sector we still need to rewind
+ // back to the nearest sector/page and rewrite the portion of it with whatever
+ // we need to add. We need to keep where we stop writing.
+ //
+ // PositionedAppend() can only write whole sectors. For that reason we have to
+ // pad with zeros for the last write and trim the file when closing according
+ // to the position we keep in the previous step.
+ //
+ // PositionedAppend() requires aligned buffer to be passed in. The alignment
+ // required is queried via GetRequiredBufferAlignment()
+ virtual Status PositionedAppend(const Slice& /* data */,
+ uint64_t /* offset */) {
+ return Status::NotSupported(
+ "WritableFile::PositionedAppend() not supported.");
+ }
+
+ // PositionedAppend data with verification information.
+ // Note that this API change is experimental and it might be changed in
+ // the future. Currently, RocksDB only generates crc32c based checksum for
+ // the file writes when the checksum handoff option is set.
+ // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+ // WritableFile, the information in DataVerificationInfo can be ignored
+ // (i.e. does not perform checksum verification).
+ virtual Status PositionedAppend(
+ const Slice& /* data */, uint64_t /* offset */,
+ const DataVerificationInfo& /* verification_info */) {
+ return Status::NotSupported("PositionedAppend");
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+ // with other writes to follow.
+ virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); }
+ virtual Status Close() = 0;
+ virtual Status Flush() = 0;
+ virtual Status Sync() = 0; // sync data
+
+ /*
+ * Sync data and/or metadata as well.
+ * By default, sync only data.
+ * Override this method for environments where we need to sync
+ * metadata as well.
+ */
+ virtual Status Fsync() { return Sync(); }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush().
+ virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct I/O.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ /*
+ * If rate limiting is enabled, change the file-granularity priority used in
+ * rate-limiting writes.
+ *
+ * In the presence of finer-granularity priority such as
+ * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+ * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as
+ * a fallback for Env::IO_TOTAL finer-granularity priority.
+ *
+ * If rate limiting is not enabled, this call has no effect.
+ */
+ virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+ virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+ virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+ write_hint_ = hint;
+ }
+
+ virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+ /*
+ * Get the size of valid data in the file.
+ */
+ virtual uint64_t GetFileSize() { return 0; }
+
+ /*
+ * Get and set the default pre-allocation block size for writes to
+ * this file. If non-zero, then Allocate will be used to extend the
+ * underlying storage of a file (generally via fallocate) if the Env
+ * instance supports it.
+ */
+ virtual void SetPreallocationBlockSize(size_t size) {
+ preallocation_block_size_ = size;
+ }
+
+ virtual void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) {
+ *last_allocated_block = last_preallocated_block_;
+ *block_size = preallocation_block_size_;
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported("WritableFile::InvalidateCache not supported.");
+ }
+
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation does nothing.
+ virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
+ if (strict_bytes_per_sync_) {
+ return Sync();
+ }
+ return Status::OK();
+ }
+
+ // PrepareWrite performs any necessary preparation for a write
+ // before the write actually occurs. This allows for pre-allocation
+ // of space on devices where it can result in less file
+ // fragmentation and/or less waste from over-zealous filesystem
+ // pre-allocation.
+ virtual void PrepareWrite(size_t offset, size_t len) {
+ if (preallocation_block_size_ == 0) {
+ return;
+ }
+ // If this write would cross one or more preallocation blocks,
+ // determine what the last preallocation block necessary to
+ // cover this write would be and Allocate to that point.
+ const auto block_size = preallocation_block_size_;
+ size_t new_last_preallocated_block =
+ (offset + len + block_size - 1) / block_size;
+ if (new_last_preallocated_block > last_preallocated_block_) {
+ size_t num_spanned_blocks =
+ new_last_preallocated_block - last_preallocated_block_;
+ // TODO: Don't ignore errors from allocate
+ Allocate(block_size * last_preallocated_block_,
+ block_size * num_spanned_blocks)
+ .PermitUncheckedError();
+ last_preallocated_block_ = new_last_preallocated_block;
+ }
+ }
+
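+  // Worked example (illustrative): with a preallocation block size of 4096
+  // bytes, a write of len = 100 at offset = 8100 must cover bytes [0, 8200),
+  // so new_last_preallocated_block = (8100 + 100 + 4095) / 4096 = 3. If no
+  // blocks were preallocated yet, this calls Allocate(0, 3 * 4096).
+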
+ // Pre-allocates space for a file.
+ virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) {
+ return Status::OK();
+ }
+
+ // If you're adding methods here, remember to add them to
+ // WritableFileWrapper too.
+
+ protected:
+ size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+ size_t last_preallocated_block_;
+ size_t preallocation_block_size_;
+
+ protected:
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+ RandomRWFile() {}
+ // No copying allowed
+ RandomRWFile(const RandomRWFile&) = delete;
+ RandomRWFile& operator=(const RandomRWFile&) = delete;
+
+ virtual ~RandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O.
+  // If true, you must pass an aligned buffer to Write().
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+ // Pass aligned buffer when use_direct_io() returns true.
+ virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+ // Read up to `n` bytes starting from offset `offset` and store them in
+  // *result; `scratch` must be at least `n` bytes.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // Returns Status::OK() on success.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+
+ virtual Status Flush() = 0;
+
+ virtual Status Sync() = 0;
+
+ virtual Status Fsync() { return Sync(); }
+
+ virtual Status Close() = 0;
+
+ // If you're adding methods here, remember to add them to
+ // RandomRWFileWrapper too.
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class MemoryMappedFileBuffer {
+ public:
+ MemoryMappedFileBuffer(void* _base, size_t _length)
+ : base_(_base), length_(_length) {}
+
+ virtual ~MemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice, so copying is disallowed. The class
+  // could be made movable instead if that is ever desired.
+ MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete;
+ MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete;
+
+ void* GetBase() const { return base_; }
+ size_t GetLen() const { return length_; }
+
+ protected:
+ void* base_;
+ const size_t length_;
+};
+
+// A Directory object represents a collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+ virtual ~Directory() {}
+ // Fsync directory. Can be called concurrently from multiple threads.
+ virtual Status Fsync() = 0;
+ // Close directory.
+ virtual Status Close() { return Status::NotSupported("Close"); }
+
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0;
+ }
+
+ // If you're adding methods here, remember to add them to
+ // DirectoryWrapper too.
+};
+
+enum InfoLogLevel : unsigned char {
+ DEBUG_LEVEL = 0,
+ INFO_LEVEL,
+ WARN_LEVEL,
+ ERROR_LEVEL,
+ FATAL_LEVEL,
+ HEADER_LEVEL,
+ NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Logger {
+ public:
+ size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)();
+
+ explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+ : closed_(false), log_level_(log_level) {}
+ // No copying allowed
+ Logger(const Logger&) = delete;
+ void operator=(const Logger&) = delete;
+
+ virtual ~Logger();
+
+  // Close the log file. Must be called before the destructor. If the return
+  // status is NotSupported(), it means the implementation does cleanup in
+  // the destructor.
+ virtual Status Close();
+
+ // Write a header to the log file with the specified format
+  // It is recommended, but not enforced, that you log all header information
+  // at the start of the application.
+ virtual void LogHeader(const char* format, va_list ap) {
+ // Default implementation does a simple INFO level log write.
+ // Please override as per the logger class requirement.
+ Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+ }
+
+ // Write an entry to the log file with the specified format.
+ //
+ // Users who override the `Logv()` overload taking `InfoLogLevel` do not need
+ // to implement this, unless they explicitly invoke it in
+ // `Logv(InfoLogLevel, ...)`.
+ virtual void Logv(const char* /* format */, va_list /* ap */) {
+ assert(false);
+ }
+
+ // Write an entry to the log file with the specified log level
+ // and format. Any log with level under the internal log level
+ // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+ // printed.
+ virtual void Logv(const InfoLogLevel log_level, const char* format,
+ va_list ap);
+
+ virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
+ // Flush to the OS buffers
+ virtual void Flush() {}
+ virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+ virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+ log_level_ = log_level;
+ }
+
+ // If you're adding methods here, remember to add them to LoggerWrapper too.
+
+ protected:
+ virtual Status CloseImpl();
+ bool closed_;
+
+ private:
+ InfoLogLevel log_level_;
+};
+
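+// A minimal Logger sketch (illustrative only): a subclass that overrides the
+// leveled Logv() is responsible for honoring the configured log level.
+//
+//   class StderrLogger : public Logger {
+//    public:
+//     using Logger::Logv;  // keep the (format, ap) overload visible
+//     void Logv(const InfoLogLevel log_level, const char* format,
+//               va_list ap) override {
+//       if (log_level < GetInfoLogLevel()) {
+//         return;
+//       }
+//       vfprintf(stderr, format, ap);
+//       fprintf(stderr, "\n");
+//     }
+//   };
+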
+// Identifies a locked file. Except in custom Env/Filesystem implementations,
+// the lifetime of a FileLock object should be managed only by LockFile() and
+// UnlockFile().
+class FileLock {
+ public:
+ FileLock() {}
+ virtual ~FileLock();
+
+ private:
+ // No copying allowed
+ FileLock(const FileLock&) = delete;
+ void operator=(const FileLock&) = delete;
+};
+
+class DynamicLibrary {
+ public:
+ virtual ~DynamicLibrary() {}
+
+ // Returns the name of the dynamic library.
+ virtual const char* Name() const = 0;
+
+ // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the status of loading the symbol.
+ template <typename T>
+ Status LoadFunction(const std::string& sym_name, std::function<T>* function) {
+ assert(nullptr != function);
+ void* ptr = nullptr;
+ Status s = LoadSymbol(sym_name, &ptr);
+ *function = reinterpret_cast<T*>(ptr);
+ return s;
+ }
+ // Loads and returns the symbol for sym_name from the library.
+ virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
+};
+
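+// Dynamic-library sketch (illustrative only; the library and symbol names
+// are hypothetical):
+//
+//   std::shared_ptr<DynamicLibrary> lib;
+//   Status s = env->LoadLibrary("my_plugin", /*search_path=*/"", &lib);
+//   std::function<int(int)> fn;
+//   if (s.ok()) {
+//     s = lib->LoadFunction<int(int)>("my_symbol", &fn);
+//   }
+//   if (s.ok()) {
+//     int r = fn(42);
+//   }
+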
+extern void LogFlush(const std::shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+ const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// A set of log functions with different log levels.
+extern void Header(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+ const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A set of log functions with different log levels.
+extern void Header(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
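+// Logging sketch (illustrative only; the path and `num_cfs` are
+// placeholders). The format strings follow printf rules, which
+// ROCKSDB_PRINTF_FORMAT_ATTR lets compilers check at build time.
+//
+//   std::shared_ptr<Logger> log;
+//   Env::Default()->NewLogger("/tmp/example.log", &log);
+//   Info(log, "opened %d column families", num_cfs);
+//   Warn(log, "unexpected value: %s", "details");
+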
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+ const std::string& fname,
+ bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+ std::string* data);
+
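+// Round-trip sketch (illustrative only; the path is a placeholder):
+//
+//   Status s = WriteStringToFile(Env::Default(), Slice("payload"),
+//                                "/tmp/example_file", /*should_sync=*/true);
+//   std::string contents;
+//   if (s.ok()) {
+//     s = ReadFileToString(Env::Default(), "/tmp/example_file", &contents);
+//   }
+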
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::SequentialFileWrapper {
+// public:
+// MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target):
+// ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {}
+// Status Read(size_t n, Slice* result, char* scratch) override {
+// cout << "Doing a read of size " << n << "!" << endl;
+// return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result,
+// scratch);
+// }
+// // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) You don't have to override and forward all methods - the Wrapper
+//     forwards everything you're not explicitly overriding.
+// (b) You don't need to update the wrapper when more methods are added to the
+//     rocksdb class, unless you actually want to override the behavior
+//     (and unless the rocksdb developers forgot to update the *Wrapper class).
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+ // The Target struct allows an Env to be stored as a raw (Env*) or
+ // std::shared_ptr<Env>. By using this struct, the wrapping/calling
+ // class does not need to worry about the ownership/lifetime of the
+ // wrapped target env. If the guard is set, then the Env will point
+ // to the guard.get().
+ struct Target {
+ Env* env; // The raw Env
+ std::shared_ptr<Env> guard; // The guarded Env
+
+ // Creates a Target without assuming ownership of the target Env
+ explicit Target(Env* t) : env(t) {}
+
+ // Creates a Target from the guarded env, assuming ownership
+ explicit Target(std::unique_ptr<Env>&& t) : guard(t.release()) {
+ env = guard.get();
+ }
+
+ // Creates a Target from the guarded env, assuming ownership
+ explicit Target(const std::shared_ptr<Env>& t) : guard(t) {
+ env = guard.get();
+ }
+
+ // Makes sure the raw Env is not nullptr
+ void Prepare() {
+ if (guard.get() != nullptr) {
+ env = guard.get();
+ } else if (env == nullptr) {
+ env = Env::Default();
+ }
+ }
+ };
+
+ // Initialize an EnvWrapper that delegates all calls to *t
+ explicit EnvWrapper(Env* t);
+ explicit EnvWrapper(std::unique_ptr<Env>&& t);
+ explicit EnvWrapper(const std::shared_ptr<Env>& t);
+ ~EnvWrapper() override;
+
+ // Return the target to which this Env forwards all calls
+ Env* target() const { return target_.env; }
+
+ // Deprecated. Will be removed in a major release. Derived classes
+ // should implement this method.
+ const char* Name() const override { return target_.env->Name(); }
+
+ // The following text is boilerplate that forwards all methods to target()
+ Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+ return target_.env->RegisterDbPaths(paths);
+ }
+
+ Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+ return target_.env->UnregisterDbPaths(paths);
+ }
+
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) override {
+ return target_.env->NewSequentialFile(f, r, options);
+ }
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) override {
+ return target_.env->NewRandomAccessFile(f, r, options);
+ }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override {
+ return target_.env->NewWritableFile(f, r, options);
+ }
+ Status ReopenWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ return target_.env->ReopenWritableFile(fname, result, options);
+ }
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override {
+ return target_.env->ReuseWritableFile(fname, old_fname, r, options);
+ }
+ Status NewRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& options) override {
+ return target_.env->NewRandomRWFile(fname, result, options);
+ }
+ Status NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+ return target_.env->NewMemoryMappedFileBuffer(fname, result);
+ }
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ return target_.env->NewDirectory(name, result);
+ }
+ Status FileExists(const std::string& f) override {
+ return target_.env->FileExists(f);
+ }
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* r) override {
+ return target_.env->GetChildren(dir, r);
+ }
+ Status GetChildrenFileAttributes(
+ const std::string& dir, std::vector<FileAttributes>* result) override {
+ return target_.env->GetChildrenFileAttributes(dir, result);
+ }
+ Status DeleteFile(const std::string& f) override {
+ return target_.env->DeleteFile(f);
+ }
+ Status Truncate(const std::string& fname, size_t size) override {
+ return target_.env->Truncate(fname, size);
+ }
+ Status CreateDir(const std::string& d) override {
+ return target_.env->CreateDir(d);
+ }
+ Status CreateDirIfMissing(const std::string& d) override {
+ return target_.env->CreateDirIfMissing(d);
+ }
+ Status DeleteDir(const std::string& d) override {
+ return target_.env->DeleteDir(d);
+ }
+ Status GetFileSize(const std::string& f, uint64_t* s) override {
+ return target_.env->GetFileSize(f, s);
+ }
+
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override {
+ return target_.env->GetFileModificationTime(fname, file_mtime);
+ }
+
+ Status RenameFile(const std::string& s, const std::string& t) override {
+ return target_.env->RenameFile(s, t);
+ }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ return target_.env->LinkFile(s, t);
+ }
+
+ Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+ return target_.env->NumFileLinks(fname, count);
+ }
+
+ Status AreFilesSame(const std::string& first, const std::string& second,
+ bool* res) override {
+ return target_.env->AreFilesSame(first, second, res);
+ }
+
+ Status LockFile(const std::string& f, FileLock** l) override {
+ return target_.env->LockFile(f, l);
+ }
+
+ Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); }
+
+ Status IsDirectory(const std::string& path, bool* is_dir) override {
+ return target_.env->IsDirectory(path, is_dir);
+ }
+
+ Status LoadLibrary(const std::string& lib_name,
+ const std::string& search_path,
+ std::shared_ptr<DynamicLibrary>* result) override {
+ return target_.env->LoadLibrary(lib_name, search_path, result);
+ }
+
+ void Schedule(void (*f)(void* arg), void* a, Priority pri,
+ void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+ return target_.env->Schedule(f, a, pri, tag, u);
+ }
+
+ int UnSchedule(void* tag, Priority pri) override {
+ return target_.env->UnSchedule(tag, pri);
+ }
+
+ void StartThread(void (*f)(void*), void* a) override {
+ return target_.env->StartThread(f, a);
+ }
+ void WaitForJoin() override { return target_.env->WaitForJoin(); }
+ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+ return target_.env->GetThreadPoolQueueLen(pri);
+ }
+
+ int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
+ return target_.env->ReserveThreads(threads_to_be_reserved, pri);
+ }
+
+ int ReleaseThreads(int threads_to_be_released, Priority pri) override {
+ return target_.env->ReleaseThreads(threads_to_be_released, pri);
+ }
+
+ Status GetTestDirectory(std::string* path) override {
+ return target_.env->GetTestDirectory(path);
+ }
+ Status NewLogger(const std::string& fname,
+ std::shared_ptr<Logger>* result) override {
+ return target_.env->NewLogger(fname, result);
+ }
+ uint64_t NowMicros() override { return target_.env->NowMicros(); }
+ uint64_t NowNanos() override { return target_.env->NowNanos(); }
+ uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); }
+
+ void SleepForMicroseconds(int micros) override {
+ target_.env->SleepForMicroseconds(micros);
+ }
+ Status GetHostName(char* name, uint64_t len) override {
+ return target_.env->GetHostName(name, len);
+ }
+ Status GetCurrentTime(int64_t* unix_time) override {
+ return target_.env->GetCurrentTime(unix_time);
+ }
+ Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) override {
+ return target_.env->GetAbsolutePath(db_path, output_path);
+ }
+ void SetBackgroundThreads(int num, Priority pri) override {
+ return target_.env->SetBackgroundThreads(num, pri);
+ }
+ int GetBackgroundThreads(Priority pri) override {
+ return target_.env->GetBackgroundThreads(pri);
+ }
+
+ Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+ return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+ }
+
+ void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+ return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+ }
+
+ void LowerThreadPoolIOPriority(Priority pool) override {
+ target_.env->LowerThreadPoolIOPriority(pool);
+ }
+
+ void LowerThreadPoolCPUPriority(Priority pool) override {
+ target_.env->LowerThreadPoolCPUPriority(pool);
+ }
+
+ Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+ return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+ }
+
+ std::string TimeToString(uint64_t time) override {
+ return target_.env->TimeToString(time);
+ }
+
+ Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+ return target_.env->GetThreadList(thread_list);
+ }
+
+ ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+ return target_.env->GetThreadStatusUpdater();
+ }
+
+ uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+ std::string GenerateUniqueId() override {
+ return target_.env->GenerateUniqueId();
+ }
+
+ EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+ return target_.env->OptimizeForLogRead(env_options);
+ }
+ EnvOptions OptimizeForManifestRead(
+ const EnvOptions& env_options) const override {
+ return target_.env->OptimizeForManifestRead(env_options);
+ }
+ EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const override {
+ return target_.env->OptimizeForLogWrite(env_options, db_options);
+ }
+ EnvOptions OptimizeForManifestWrite(
+ const EnvOptions& env_options) const override {
+ return target_.env->OptimizeForManifestWrite(env_options);
+ }
+ EnvOptions OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& immutable_ops) const override {
+ return target_.env->OptimizeForCompactionTableWrite(env_options,
+ immutable_ops);
+ }
+ EnvOptions OptimizeForCompactionTableRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const override {
+ return target_.env->OptimizeForCompactionTableRead(env_options, db_options);
+ }
+ EnvOptions OptimizeForBlobFileRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const override {
+ return target_.env->OptimizeForBlobFileRead(env_options, db_options);
+ }
+ Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+ return target_.env->GetFreeSpace(path, diskfree);
+ }
+ void SanitizeEnvOptions(EnvOptions* env_opts) const override {
+ target_.env->SanitizeEnvOptions(env_opts);
+ }
+ Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+ std::string SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const override;
+#endif // ROCKSDB_LITE
+
+ private:
+ Target target_;
+};
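+
+// Example: a minimal sketch of the typical EnvWrapper pattern, where a
+// subclass intercepts a single operation and forwards everything else. The
+// class name and behavior are illustrative only.
+//
+//   class NoRenameEnv : public EnvWrapper {
+//    public:
+//     explicit NoRenameEnv(Env* base) : EnvWrapper(base) {}
+//     const char* Name() const override { return "NoRenameEnv"; }
+//     Status RenameFile(const std::string& s, const std::string& t) override {
+//       return Status::NotSupported("renames disabled");
+//     }
+//   };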
+
+class SequentialFileWrapper : public SequentialFile {
+ public:
+ explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ return target_->Read(n, result, scratch);
+ }
+ Status Skip(uint64_t n) override { return target_->Skip(n); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+ char* scratch) override {
+ return target_->PositionedRead(offset, n, result, scratch);
+ }
+
+ private:
+ SequentialFile* target_;
+};
+
+class RandomAccessFileWrapper : public RandomAccessFile {
+ public:
+ explicit RandomAccessFileWrapper(RandomAccessFile* target)
+ : target_(target) {}
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+ Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+ return target_->MultiRead(reqs, num_reqs);
+ }
+ Status Prefetch(uint64_t offset, size_t n) override {
+ return target_->Prefetch(offset, n);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+ void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ private:
+ RandomAccessFile* target_;
+};
+
+class WritableFileWrapper : public WritableFile {
+ public:
+ explicit WritableFileWrapper(WritableFile* t) : target_(t) {}
+
+ Status Append(const Slice& data) override { return target_->Append(data); }
+ Status Append(const Slice& data,
+ const DataVerificationInfo& verification_info) override {
+ return target_->Append(data, verification_info);
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ return target_->PositionedAppend(data, offset);
+ }
+ Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& verification_info) override {
+ return target_->PositionedAppend(data, offset, verification_info);
+ }
+ Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+ Status Close() override { return target_->Close(); }
+ Status Flush() override { return target_->Flush(); }
+ Status Sync() override { return target_->Sync(); }
+ Status Fsync() override { return target_->Fsync(); }
+ bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+
+ void SetIOPriority(Env::IOPriority pri) override {
+ target_->SetIOPriority(pri);
+ }
+
+ Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
+
+ void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+ target_->SetWriteLifeTimeHint(hint);
+ }
+
+ Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+ return target_->GetWriteLifeTimeHint();
+ }
+
+ uint64_t GetFileSize() override { return target_->GetFileSize(); }
+
+ void SetPreallocationBlockSize(size_t size) override {
+ target_->SetPreallocationBlockSize(size);
+ }
+
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ target_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ return target_->RangeSync(offset, nbytes);
+ }
+
+ void PrepareWrite(size_t offset, size_t len) override {
+ target_->PrepareWrite(offset, len);
+ }
+
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return target_->Allocate(offset, len);
+ }
+
+ private:
+ WritableFile* target_;
+};
+
+class RandomRWFileWrapper : public RandomRWFile {
+ public:
+ explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {}
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status Write(uint64_t offset, const Slice& data) override {
+ return target_->Write(offset, data);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+ Status Flush() override { return target_->Flush(); }
+ Status Sync() override { return target_->Sync(); }
+ Status Fsync() override { return target_->Fsync(); }
+ Status Close() override { return target_->Close(); }
+
+ private:
+ RandomRWFile* target_;
+};
+
+class DirectoryWrapper : public Directory {
+ public:
+ explicit DirectoryWrapper(Directory* target) : target_(target) {}
+
+ Status Fsync() override { return target_->Fsync(); }
+ Status Close() override { return target_->Close(); }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ private:
+ Directory* target_;
+};
+
+class LoggerWrapper : public Logger {
+ public:
+ explicit LoggerWrapper(Logger* target) : target_(target) {}
+
+ Status Close() override { return target_->Close(); }
+ void LogHeader(const char* format, va_list ap) override {
+ return target_->LogHeader(format, ap);
+ }
+ void Logv(const char* format, va_list ap) override {
+ return target_->Logv(format, ap);
+ }
+ void Logv(const InfoLogLevel log_level, const char* format,
+ va_list ap) override {
+ return target_->Logv(log_level, format, ap);
+ }
+ size_t GetLogFileSize() const override { return target_->GetLogFileSize(); }
+ void Flush() override { return target_->Flush(); }
+ InfoLogLevel GetInfoLogLevel() const override {
+ return target_->GetInfoLogLevel();
+ }
+ void SetInfoLogLevel(const InfoLogLevel log_level) override {
+ return target_->SetInfoLogLevel(log_level);
+ }
+
+ private:
+ Logger* target_;
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
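+
+// Example: a minimal sketch of opening a DB backed by NewMemEnv; error
+// handling is omitted.
+//
+//   std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default()));
+//   Options options;
+//   options.env = mem_env.get();
+//   options.create_if_missing = true;
+//   DB* db = nullptr;
+//   Status s = DB::Open(options, "/dir/db", &db);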
+
+// Returns a new environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+// This is a factory method for TimedEnv defined in utilities/env_timed.cc.
+Env* NewTimedEnv(Env* base_env);
+
+// Returns an instance of logger that can be used for storing informational
+// messages.
+// This is a factory method for EnvLogger declared in logging/env_logging.h
+Status NewEnvLogger(const std::string& fname, Env* env,
+ std::shared_ptr<Logger>* result);
+
+// Creates a new Env based on Env::Default() but modified to use the specified
+// FileSystem.
+std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs);
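+
+// Example: a minimal sketch of plugging a custom FileSystem into DBOptions
+// via NewCompositeEnv. Here `my_fs` stands in for any FileSystem
+// implementation.
+//
+//   std::shared_ptr<FileSystem> my_fs = FileSystem::Default();
+//   std::unique_ptr<Env> env = NewCompositeEnv(my_fs);
+//   DBOptions db_options;
+//   db_options.env = env.get();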
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h
new file mode 100644
index 000000000..282db6ed4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env_encryption.h
@@ -0,0 +1,465 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE)
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EncryptionProvider;
+
+struct ConfigOptions;
+
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env,
+ const std::shared_ptr<EncryptionProvider>& provider);
+std::shared_ptr<FileSystem> NewEncryptedFS(
+ const std::shared_ptr<FileSystem>& base_fs,
+ const std::shared_ptr<EncryptionProvider>& provider);
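+
+// Example: a minimal sketch wiring the pieces declared in this header
+// together. ROT13 is suitable only for tests; a real deployment would supply
+// its own BlockCipher or EncryptionProvider.
+//
+//   std::shared_ptr<BlockCipher> cipher = BlockCipher::NewROT13Cipher(32);
+//   std::shared_ptr<EncryptionProvider> provider =
+//       EncryptionProvider::NewCTRProvider(cipher);
+//   Env* encrypted_env = NewEncryptedEnv(Env::Default(), provider);
+//   // Use encrypted_env as DBOptions::env when opening the DB.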
+
+// BlockAccessCipherStream is the base class for any cipher stream that
+// supports random access at block level (without requiring data from other
+// blocks). E.g. CTR (Counter operation mode) supports this requirement.
+class BlockAccessCipherStream {
+ public:
+ virtual ~BlockAccessCipherStream() {}
+
+ // BlockSize returns the size of each block supported by this cipher stream.
+ virtual size_t BlockSize() = 0;
+
+ // Encrypt one or more (partial) blocks of data at the file offset.
+ // Length of data is given in dataSize.
+ virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ // Decrypt one or more (partial) blocks of data at the file offset.
+ // Length of data is given in dataSize.
+ virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ protected:
+ // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+ virtual void AllocateScratch(std::string&) = 0;
+
+ // Encrypt a block of data at the given block index.
+ // Length of data is equal to BlockSize().
+ virtual Status EncryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) = 0;
+
+ // Decrypt a block of data at the given block index.
+ // Length of data is equal to BlockSize().
+ virtual Status DecryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) = 0;
+};
+
+// BlockCipher
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class BlockCipher : public Customizable {
+ public:
+ virtual ~BlockCipher() {}
+
+ // Creates a new BlockCipher from the input config_options and value
+ // The value describes the type of provider (and potentially optional
+ // configuration parameters) used to create this provider.
+ // For example, if the value is "ROT13", a ROT13BlockCipher is created.
+ //
+ // @param config_options Options to control how this cipher is created
+ // and initialized.
+ // @param value The value might be:
+ // - ROT13 Create a ROT13 Cipher
+ // - ROT13:nn Create a ROT13 Cipher with block size of nn
+ // @param result The new cipher object
+ // @return OK if the cipher was successfully created
+ // @return NotFound if an invalid name was specified in the value
+ // @return InvalidArgument if the options were not valid
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<BlockCipher>* result);
+
+ static const char* Type() { return "BlockCipher"; }
+ // Short-cut method to create a ROT13 BlockCipher.
+ // This cipher is only suitable for test purposes and should not be used in
+ // production!!!
+ static std::shared_ptr<BlockCipher> NewROT13Cipher(size_t block_size);
+
+ // BlockSize returns the size of each block supported by this cipher stream.
+ virtual size_t BlockSize() = 0;
+
+ // Encrypt a block of data.
+ // Length of data is equal to BlockSize().
+ virtual Status Encrypt(char* data) = 0;
+
+ // Decrypt a block of data.
+ // Length of data is equal to BlockSize().
+ virtual Status Decrypt(char* data) = 0;
+};
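+
+// Example: a minimal sketch of creating a cipher from a descriptive string,
+// following the CreateFromString contract above.
+//
+//   ConfigOptions config_options;
+//   std::shared_ptr<BlockCipher> cipher;
+//   Status s =
+//       BlockCipher::CreateFromString(config_options, "ROT13:32", &cipher);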
+
+// The encryption provider is used to create a cipher stream for a specific
+// file. The returned cipher stream will be used for actual
+// encryption/decryption actions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class EncryptionProvider : public Customizable {
+ public:
+ virtual ~EncryptionProvider() {}
+
+ // Creates a new EncryptionProvider from the input config_options and value
+ // The value describes the type of provider (and potentially optional
+ // configuration parameters) used to create this provider.
+ // For example, if the value is "CTR", a CTREncryptionProvider will be
+ // created. If the value ends with "://test" (e.g. "CTR://test"), the
+ // provider will be initialized in "TEST" mode prior to being returned.
+ //
+ // @param config_options Options to control how this provider is created
+ // and initialized.
+ // @param value The value might be:
+ // - CTR Create a CTR provider
+ // - CTR://test Create a CTR provider and initialize it for tests.
+ // @param result The new provider object
+ // @return OK if the provider was successfully created
+ // @return NotFound if an invalid name was specified in the value
+ // @return InvalidArgument if the options were not valid
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<EncryptionProvider>* result);
+
+ static const char* Type() { return "EncryptionProvider"; }
+
+ // Short-cut method to create a CTR-provider
+ static std::shared_ptr<EncryptionProvider> NewCTRProvider(
+ const std::shared_ptr<BlockCipher>& cipher);
+
+ // GetPrefixLength returns the length of the prefix that is added to every
+ // file and used for storing encryption options. For optimal performance, the
+ // prefix length should be a multiple of the page size.
+ virtual size_t GetPrefixLength() const = 0;
+
+ // CreateNewPrefix initializes an allocated block of prefix memory
+ // for a new file.
+ virtual Status CreateNewPrefix(const std::string& fname, char* prefix,
+ size_t prefixLength) const = 0;
+
+ // Method to add a new cipher key for use by the EncryptionProvider.
+ // @param descriptor Descriptor for this key.
+ // @param cipher The cryptographic key to use
+ // @param len The length of the cipher key
+ // @param for_write If true, this cipher should be used for writing files.
+ // If false, this cipher should only be used for reading
+ // files
+ // @return OK if the cipher was successfully added to the provider, non-OK
+ // otherwise
+ virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+ size_t len, bool for_write) = 0;
+
+ // CreateCipherStream creates a block access cipher stream for a file with
+ // the given name and options.
+ virtual Status CreateCipherStream(
+ const std::string& fname, const EnvOptions& options, Slice& prefix,
+ std::unique_ptr<BlockAccessCipherStream>* result) = 0;
+
+ // Returns a string representing an encryption marker prefix for this
+ // provider. If a marker is provided, this marker can be used to tell whether
+ // or not a file is encrypted by this provider. The marker will also be part
+ // of any encryption prefix for this provider.
+ virtual std::string GetMarker() const { return ""; }
+};
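+
+// Example: a minimal sketch of loading a provider from a string and
+// registering a key with it; the descriptor and key bytes here are
+// placeholders.
+//
+//   ConfigOptions config_options;
+//   std::shared_ptr<EncryptionProvider> provider;
+//   Status s = EncryptionProvider::CreateFromString(config_options, "CTR",
+//                                                   &provider);
+//   if (s.ok()) {
+//     const char key[32] = {0};  // placeholder key material
+//     s = provider->AddCipher("", key, sizeof(key), /*for_write=*/true);
+//   }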
+
+class EncryptedSequentialFile : public FSSequentialFile {
+ protected:
+ std::unique_ptr<FSSequentialFile> file_;
+ std::unique_ptr<BlockAccessCipherStream> stream_;
+ uint64_t offset_;
+ size_t prefixLength_;
+
+ public:
+ // Default ctor. Given underlying sequential file is supposed to be at
+ // offset == prefixLength.
+ EncryptedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+ std::unique_ptr<BlockAccessCipherStream>&& s,
+ size_t prefixLength)
+ : file_(std::move(f)),
+ stream_(std::move(s)),
+ offset_(prefixLength),
+ prefixLength_(prefixLength) {}
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override;
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+ // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ IOStatus Skip(uint64_t n) override;
+
+ // Indicates to the upper layers whether the current SequentialFile
+ // implementation uses direct IO.
+ bool use_direct_io() const override;
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ size_t GetRequiredBufferAlignment() const override;
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ // Positioned Read for direct I/O
+ // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class EncryptedRandomAccessFile : public FSRandomAccessFile {
+ protected:
+ std::unique_ptr<FSRandomAccessFile> file_;
+ std::unique_ptr<BlockAccessCipherStream> stream_;
+ size_t prefixLength_;
+
+ public:
+ EncryptedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+ std::unique_ptr<BlockAccessCipherStream>&& s,
+ size_t prefixLength)
+ : file_(std::move(f)),
+ stream_(std::move(s)),
+ prefixLength_(prefixLength) {}
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // Safe for concurrent use by multiple threads.
+ // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ // Readahead the file starting from offset by n bytes for caching.
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Tries to get a unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+ // This function guarantees, for IDs from a given environment, two unique ids
+ // cannot be made equal to each other by adding arbitrary bytes to one of
+ // them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ size_t GetUniqueId(char* id, size_t max_size) const override;
+
+ void Hint(AccessPattern pattern) override;
+
+ // Indicates to the upper layers whether the current RandomAccessFile
+ // implementation uses direct IO.
+ bool use_direct_io() const override;
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ size_t GetRequiredBufferAlignment() const override;
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class EncryptedWritableFile : public FSWritableFile {
+ protected:
+ std::unique_ptr<FSWritableFile> file_;
+ std::unique_ptr<BlockAccessCipherStream> stream_;
+ size_t prefixLength_;
+
+ public:
+ // Default ctor. Prefix is assumed to be written already.
+ EncryptedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+ std::unique_ptr<BlockAccessCipherStream>&& s,
+ size_t prefixLength)
+ : file_(std::move(f)),
+ stream_(std::move(s)),
+ prefixLength_(prefixLength) {}
+
+ using FSWritableFile::Append;
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ using FSWritableFile::PositionedAppend;
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush().
+ bool IsSyncThreadSafe() const override;
+
+ // Indicates to the upper layers whether the current WritableFile
+ // implementation uses direct IO.
+ bool use_direct_io() const override;
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ size_t GetRequiredBufferAlignment() const override;
+
+ // Get the size of valid data in the file.
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+ // size due to whole pages writes. The behavior is undefined if called
+ // with other writes to follow.
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation does nothing.
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // PrepareWrite performs any necessary preparation for a write
+ // before the write actually occurs. This allows for pre-allocation
+ // of space on devices where it can result in less file
+ // fragmentation and/or less waste from over-zealous filesystem
+ // pre-allocation.
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ void SetPreallocationBlockSize(size_t size) override;
+
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override;
+
+ // Pre-allocates space for a file.
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+// A file abstraction for random reading and writing.
+class EncryptedRandomRWFile : public FSRandomRWFile {
+ protected:
+ std::unique_ptr<FSRandomRWFile> file_;
+ std::unique_ptr<BlockAccessCipherStream> stream_;
+ size_t prefixLength_;
+
+ public:
+ EncryptedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+ std::unique_ptr<BlockAccessCipherStream>&& s,
+ size_t prefixLength)
+ : file_(std::move(f)),
+ stream_(std::move(s)),
+ prefixLength_(prefixLength) {}
+
+ // Indicates if the class makes use of direct I/O
+ // If false you must pass aligned buffer to Write()
+ bool use_direct_io() const override;
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ size_t GetRequiredBufferAlignment() const override;
+
+ // Write bytes in `data` at offset `offset`. Returns IOStatus::OK() on
+ // success.
+ // Pass aligned buffer when use_direct_io() returns true.
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Read up to `n` bytes starting from offset `offset` and store them in
+ // *result; the `scratch` buffer must be at least `n` bytes in size.
+ // Returns IOStatus::OK() on success.
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class EncryptedFileSystem : public FileSystemWrapper {
+ public:
+ explicit EncryptedFileSystem(const std::shared_ptr<FileSystem>& base)
+ : FileSystemWrapper(base) {}
+ // Method to add a new cipher key for use by the EncryptionProvider.
+ // @param descriptor Descriptor for this key.
+ // @param cipher The cryptographic key to use
+ // @param len The length of the cipher key
+ // @param for_write If true, this cipher should be used for writing files.
+ // If false, this cipher should only be used for reading
+ // files
+ // @return OK if the cipher was successfully added to the provider, non-OK
+ // otherwise
+ virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+ size_t len, bool for_write) = 0;
+ static const char* kClassName() { return "EncryptedFileSystem"; }
+ bool IsInstanceOf(const std::string& name) const override {
+ if (name == kClassName()) {
+ return true;
+ } else {
+ return FileSystemWrapper::IsInstanceOf(name);
+ }
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h
new file mode 100644
index 000000000..b59395255
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/experimental.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above conditions is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+ int target_level = 1);
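+
+// Example: a minimal sketch, assuming keys were inserted in sorted order so
+// that the L0 files have disjoint ranges and L1 is empty.
+//
+//   Status s = experimental::PromoteL0(db, db->DefaultColumnFamily(),
+//                                      /*target_level=*/1);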
+
+struct UpdateManifestForFilesStateOptions {
+ // When true, read current file temperatures from the FileSystem and update
+ // the DB manifest when a temperature other than Unknown is reported and is
+ // inconsistent with the manifest.
+ bool update_temperatures = true;
+
+ // TODO: new_checksums: to update files to latest file checksum algorithm
+};
+
+// Utility for updating manifest of DB directory (not open) for current state
+// of files on filesystem. See UpdateManifestForFilesStateOptions.
+//
+// To minimize interference with ongoing DB operations, only the following
+// guarantee is provided, assuming no IO error encountered:
+// * Only files live in DB at start AND end of call to
+// UpdateManifestForFilesState() are guaranteed to be updated (as needed) in
+// manifest.
+// * For example, new files after start of call to
+// UpdateManifestForFilesState() might not be updated, but that is not
+// typically required to achieve goal of manifest consistency/completeness
+// (because current DB configuration would ensure new files get the desired
+// consistent metadata).
+Status UpdateManifestForFilesState(
+ const DBOptions& db_opts, const std::string& db_name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const UpdateManifestForFilesStateOptions& opts = {});
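+
+// Example: a minimal sketch run against a DB directory that is not open. The
+// options and column family list must match how the DB was created.
+//
+//   std::vector<ColumnFamilyDescriptor> cfs = {
+//       {kDefaultColumnFamilyName, ColumnFamilyOptions()}};
+//   Status s = experimental::UpdateManifestForFilesState(
+//       DBOptions(), "/dir/db", cfs);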
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h
new file mode 100644
index 000000000..758bae4ac
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_checksum.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The unknown file checksum.
+constexpr char kUnknownFileChecksum[] = "";
+// The unknown sst file checksum function name.
+constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// The standard DB file checksum function name.
+// This is the name of the checksum function returned by
+// GetFileChecksumGenCrc32cFactory();
+constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+
+struct FileChecksumGenContext {
+ std::string file_name;
+ // The name of the requested checksum generator.
+ // Checksum factories may use or ignore requested_checksum_func_name,
+ // and checksum factories written before this field was available are still
+ // compatible.
+ std::string requested_checksum_func_name;
+};
+
+// FileChecksumGenerator is the class that generates the checksum value
+// for each file when the file is written to the file system.
+// Implementations may assume that
+// * Finalize is called at most once during the life of the object
+// * All calls to Update come before Finalize
+// * All calls to GetChecksum come after Finalize
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenerator {
+ public:
+ virtual ~FileChecksumGenerator() {}
+
+ // Update the current result after processing the data. For different
+ // checksum functions, the intermediate results may be stored and used in
+ // Update to incorporate the new data.
+ virtual void Update(const char* data, size_t n) = 0;
+
+ // Generate the final result once no further data will be added.
+ virtual void Finalize() = 0;
+
+ // Get the checksum. The result should not be the empty string and may
+ // include arbitrary bytes, including non-printable characters.
+ virtual std::string GetChecksum() const = 0;
+
+ // Returns a name that identifies the current file checksum function.
+ virtual const char* Name() const = 0;
+};
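+
+// Example: a minimal sketch of a custom generator that sums bytes; a real
+// implementation would use a proper checksum such as crc32c.
+//
+//   class ByteSumChecksumGen : public FileChecksumGenerator {
+//    public:
+//     void Update(const char* data, size_t n) override {
+//       for (size_t i = 0; i < n; ++i) {
+//         sum_ += static_cast<unsigned char>(data[i]);
+//       }
+//     }
+//     void Finalize() override { result_ = std::to_string(sum_); }
+//     std::string GetChecksum() const override { return result_; }
+//     const char* Name() const override { return "ByteSumChecksumGen"; }
+//
+//    private:
+//     uint64_t sum_ = 0;
+//     std::string result_;
+//   };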
+
+// Create the FileChecksumGenerator object for each SST file.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenFactory : public Customizable {
+ public:
+ ~FileChecksumGenFactory() override {}
+ static const char* Type() { return "FileChecksumGenFactory"; }
+ static Status CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<FileChecksumGenFactory>* result);
+
+ // Create a new FileChecksumGenerator.
+ virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+ const FileChecksumGenContext& context) = 0;
+
+ // Return the name of this FileChecksumGenFactory.
+ const char* Name() const override = 0;
+};
+
+// FileChecksumList stores the checksum information of a list of files (e.g.,
+// SST files). The FileChecksumList can be used to store the checksum
+// information of all SST files obtained from the MANIFEST, i.e., the checksum
+// information of all valid SST files of a DB instance. It can also be used
+// to store the checksum information of a list of SST files to be ingested.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumList {
+ public:
+ virtual ~FileChecksumList() {}
+
+ // Clean the previously stored file checksum information.
+ virtual void reset() = 0;
+
+ // Get the number of checksums in the checksum list
+ virtual size_t size() const = 0;
+
+ // Return all the file checksum information stored in the list. For each
+ // entry, file_numbers holds the file number (the key), while checksums and
+ // checksum_func_names hold the corresponding checksum value and checksum
+ // function name.
+ virtual Status GetAllFileChecksums(
+ std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+ std::vector<std::string>* checksum_func_names) = 0;
+
+ // Given the file_number, searches whether the file checksum information is
+ // stored.
+ virtual Status SearchOneFileChecksum(uint64_t file_number,
+ std::string* checksum,
+ std::string* checksum_func_name) = 0;
+
+ // Insert the checksum information of one file to the FileChecksumList.
+ virtual Status InsertOneFileChecksum(
+ uint64_t file_number, const std::string& checksum,
+ const std::string& checksum_func_name) = 0;
+
+ // Remove the checksum information of one SST file.
+ virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0;
+};
+
+// Create a new file checksum list.
+extern FileChecksumList* NewFileChecksumList();
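+
+// Example: a minimal sketch of tracking checksums for one file; the checksum
+// bytes and function name are placeholders.
+//
+//   std::unique_ptr<FileChecksumList> list(NewFileChecksumList());
+//   Status s = list->InsertOneFileChecksum(
+//       /*file_number=*/1, "\x01\x02\x03\x04", "FileChecksumCrc32c");
+//   std::string checksum, func_name;
+//   s = list->SearchOneFileChecksum(1, &checksum, &func_name);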
+
+// Return a shared_ptr of the builtin Crc32c based file checksum generator
+// factory object, which can be shared to create the Crc32c based checksum
+// generator object.
+// Note: this implementation is compatible with many other crc32c checksum
+// implementations and uses big-endian encoding of the result, unlike most
+// other crc32c checksums in RocksDB, which alter the result with
+// crc32c::Mask and use little-endian encoding.
+extern std::shared_ptr<FileChecksumGenFactory>
+GetFileChecksumGenCrc32cFactory();
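+
+// Example: a minimal sketch of enabling whole-file checksums for newly
+// written SST files through DBOptions::file_checksum_gen_factory.
+//
+//   Options options;
+//   options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();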
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h
new file mode 100644
index 000000000..91ad47218
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_system.h
@@ -0,0 +1,1849 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A FileSystem is an interface used by the rocksdb implementation to access
+// storage functionality like the filesystem etc. Callers
+// may wish to provide a custom FileSystem object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All FileSystem implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+//
+// WARNING: Since this is a new interface, it is expected that there will be
+// some changes as storage systems are ported over.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <chrono>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileLock;
+class FSDirectory;
+class FSRandomAccessFile;
+class FSRandomRWFile;
+class FSSequentialFile;
+class FSWritableFile;
+class Logger;
+class Slice;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+struct ConfigOptions;
+
+using AccessPattern = RandomAccessFile::AccessPattern;
+using FileAttributes = Env::FileAttributes;
+
+// DEPRECATED
+// Priority of an IO request. This is a hint and does not guarantee any
+// particular QoS.
+// IO_LOW - Typically background reads/writes such as compaction/flush
+// IO_HIGH - Typically user reads/synchronous WAL writes
+enum class IOPriority : uint8_t {
+ kIOLow,
+ kIOHigh,
+ kIOTotal,
+};
+
+// Type of the data being read/written. It can be passed down as a flag
+// for the FileSystem implementation to optionally handle different types in
+// different ways.
+enum class IOType : uint8_t {
+ kData,
+ kFilter,
+ kIndex,
+ kMetadata,
+ kWAL,
+ kManifest,
+ kLog,
+ kUnknown,
+ kInvalid,
+};
+
+// Per-request options that can be passed down to the FileSystem
+// implementation. These are hints and are not necessarily guaranteed to be
+// honored. More hints can be added here in the future to indicate things like
+// storage media (HDD/SSD) to be used, replication level etc.
+struct IOOptions {
+ // Timeout for the operation in microseconds
+ std::chrono::microseconds timeout;
+
+ // DEPRECATED
+ // Priority - high or low
+ IOPriority prio;
+
+ // Priority used to charge rate limiter configured in file system level (if
+ // any)
+ // Limitation: right now RocksDB internal does not consider this
+ // rate_limiter_priority
+ Env::IOPriority rate_limiter_priority;
+
+ // Type of data being read/written
+ IOType type;
+
+ // EXPERIMENTAL
+ // An option map that's opaque to RocksDB. It can be used to implement a
+ // custom contract between a FileSystem user and the provider. This is only
+ // useful in cases where a RocksDB user directly uses the FileSystem or file
+ // object for their own purposes, and wants to pass extra options to APIs
+ // such as NewRandomAccessFile and NewWritableFile.
+ std::unordered_map<std::string, std::string> property_bag;
+
+ // Force directory fsync, some file systems like btrfs may skip directory
+ // fsync, set this to force the fsync
+ bool force_dir_fsync;
+
+ // Can be used by underlying file systems to skip recursing through sub
+ // directories and list only files in GetChildren API.
+ bool do_not_recurse;
+
+ IOOptions() : IOOptions(false) {}
+
+ explicit IOOptions(bool force_dir_fsync_)
+ : timeout(std::chrono::microseconds::zero()),
+ prio(IOPriority::kIOLow),
+ rate_limiter_priority(Env::IO_TOTAL),
+ type(IOType::kUnknown),
+ force_dir_fsync(force_dir_fsync_),
+ do_not_recurse(false) {}
+};
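+
+// Example: a minimal sketch of passing per-request hints down to a
+// FileSystem call; `fs` stands in for any FileSystem instance and the path
+// is illustrative.
+//
+//   IOOptions io_opts;
+//   io_opts.timeout = std::chrono::milliseconds(10);
+//   io_opts.type = IOType::kManifest;
+//   IOStatus s = fs->FileExists("/dir/MANIFEST-000001", io_opts, nullptr);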
+
+struct DirFsyncOptions {
+ enum FsyncReason : uint8_t {
+ kNewFileSynced,
+ kFileRenamed,
+ kDirRenamed,
+ kFileDeleted,
+ kDefault,
+ } reason;
+
+ std::string renamed_new_name; // for kFileRenamed
+ // add other options for other FsyncReason
+
+ DirFsyncOptions();
+
+ explicit DirFsyncOptions(std::string file_renamed_new_name);
+
+ explicit DirFsyncOptions(FsyncReason fsync_reason);
+};
+
+// File scope options that control how a file is opened/created and accessed
+// while it's open. We may add more options here in the future such as
+// redundancy level, media to use etc.
+struct FileOptions : EnvOptions {
+ // Embedded IOOptions to control the parameters for any IOs that need
+ // to be issued for the file open/creation
+ IOOptions io_options;
+
+ // EXPERIMENTAL
+ // The feature is in development and is subject to change.
+ // When creating a new file, set the temperature of the file so that
+ // underlying file systems can put it with appropriate storage media and/or
+ // coding.
+ Temperature temperature = Temperature::kUnknown;
+
+ // The checksum type that is used to calculate the checksum value for
+ // handoff during file writes.
+ ChecksumType handoff_checksum_type;
+
+ FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+ FileOptions(const DBOptions& opts)
+ : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+ FileOptions(const EnvOptions& opts)
+ : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+ FileOptions(const FileOptions& opts)
+ : EnvOptions(opts),
+ io_options(opts.io_options),
+ temperature(opts.temperature),
+ handoff_checksum_type(opts.handoff_checksum_type) {}
+
+ FileOptions& operator=(const FileOptions&) = default;
+};
+
+// A structure to pass back some debugging information from the FileSystem
+// implementation to RocksDB in case of an IO error
+struct IODebugContext {
+ // file_path to be filled in by RocksDB in case of an error
+ std::string file_path;
+
+ // A map of counter names to values - set by the FileSystem implementation
+ std::map<std::string, uint64_t> counters;
+
+ // To be set by the FileSystem implementation
+ std::string msg;
+
+ // To be set by the underlying FileSystem implementation.
+ std::string request_id;
+
+ // In order to log the required information in IO tracing for different
+ // operations, each bit in trace_data indicates whether the corresponding
+ // field of IODebugContext will be added to the trace. For example, if
+ // trace_data == 1, the bit at position 0 is set, so TraceData::kRequestID
+ // (request_id) will be logged in the trace record.
+ //
+ enum TraceData : char {
+ // The value of each enum represents the bitwise position for
+ // that information in trace_data which will be used by IOTracer for
+ // tracing. Make sure to add them sequentially.
+ kRequestID = 0,
+ };
+ uint64_t trace_data = 0;
+
+ IODebugContext() {}
+
+ void AddCounter(std::string& name, uint64_t value) {
+ counters.emplace(name, value);
+ }
+
+ // Called by underlying file system to set request_id and log request_id in
+ // IOTracing.
+ void SetRequestId(const std::string& _request_id) {
+ request_id = _request_id;
+ trace_data |= (1 << TraceData::kRequestID);
+ }
+
+ std::string ToString() {
+ std::ostringstream ss;
+ ss << file_path << ", ";
+ for (const auto& counter : counters) {
+ ss << counter.first << " = " << counter.second << ",";
+ }
+ ss << msg;
+ return ss.str();
+ }
+};
+
+// A function pointer type for custom destruction of void pointer passed to
+// ReadAsync API. RocksDB/caller is responsible for deleting the void pointer
+// allocated by FS in ReadAsync API.
+using IOHandleDeleter = std::function<void(void*)>;
+
+// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
+// FSRandomRWFile, and FSDirectory classes define the interface between
+// RocksDB and storage systems, such as Posix filesystems,
+// remote filesystems etc.
+// The interface allows for fine grained control of individual IO operations,
+// such as setting a timeout, prioritization, hints on data placement,
+// different handling based on type of IO etc.
+// This is accomplished by passing an instance of IOOptions to every
+// API call that can potentially perform IO. Additionally, each such API is
+// passed a pointer to a IODebugContext structure that can be used by the
+// storage system to include troubleshooting information. The return values
+// of the APIs are of type IOStatus, which can indicate an error
+// code/sub-code, as well as metadata about the error such as its scope and
+// whether it's retryable.
+// NewCompositeEnv can be used to create an Env with a custom FileSystem for
+// DBOptions::env.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileSystem : public Customizable {
+ public:
+ FileSystem();
+
+ // No copying allowed
+ FileSystem(const FileSystem&) = delete;
+
+ virtual ~FileSystem();
+
+ static const char* Type() { return "FileSystem"; }
+ static const char* kDefaultName() { return "DefaultFileSystem"; }
+
+ // Loads the FileSystem specified by the input value into the result
+ // The CreateFromString alternative should be used; this method may be
+ // deprecated in a future release.
+ static Status Load(const std::string& value,
+ std::shared_ptr<FileSystem>* result);
+
+ // Loads the FileSystem specified by the input value into the result
+ // @see Customizable for a more detailed description of the parameters and
+ // return codes
+ // @param config_options Controls how the FileSystem is loaded
+ // @param value The name and optional properties describing the file system
+ // to load.
+ // @param result On success, returns the loaded FileSystem
+ // @return OK if the FileSystem was successfully loaded.
+ // @return not-OK if the load failed.
+ static Status CreateFromString(const ConfigOptions& options,
+ const std::string& value,
+ std::shared_ptr<FileSystem>* result);
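+
+// Example: a minimal sketch of loading the built-in default file system by
+// name; custom implementations registered with RocksDB can be loaded the
+// same way.
+//
+//   ConfigOptions config_options;
+//   std::shared_ptr<FileSystem> fs;
+//   Status s = FileSystem::CreateFromString(
+//       config_options, FileSystem::kDefaultName(), &fs);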
+
+ // Return a default FileSystem suitable for the current operating
+ // system.
+ static std::shared_ptr<FileSystem> Default();
+
+ // Handles the event when a new DB or a new ColumnFamily starts using the
+ // specified data paths.
+ //
+ // The data paths might be shared by different DBs or ColumnFamilies,
+ // so RegisterDbPaths might be called with the same data paths.
+ // For example, when CreateColumnFamily is called multiple times with the same
+ // data path, RegisterDbPaths will also be called with the same data path.
+ //
+ // If the return status is ok, then the paths must later be passed to
+ // UnregisterDbPaths correspondingly;
+ // otherwise this method should have no side effect, and UnregisterDbPaths
+ // does not need to be called for the paths.
+ //
+ // Different implementations may take different actions.
+ // By default, it's a no-op and returns Status::OK.
+ virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+ return Status::OK();
+ }
+ // Handles the event a DB or a ColumnFamily stops using the specified data
+ // paths.
+ //
+ // It should be called corresponding to each successful RegisterDbPaths.
+ //
+ // Different implementations may take different actions.
+ // By default, it's a no-op and returns Status::OK.
+ virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+ return Status::OK();
+ }
+
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewSequentialFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) = 0;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual IOStatus NewRandomAccessFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) = 0;
+ // These values match Linux definition
+ // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+ enum WriteLifeTimeHint {
+ kWLTHNotSet = 0, // No hint information set
+ kWLTHNone, // No hints about write life time
+ kWLTHShort, // Data written has a short life time
+ kWLTHMedium, // Data written has a medium life time
+ kWLTHLong, // Data written has a long life time
+ kWLTHExtreme, // Data written has an extremely long life time
+ };
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) = 0;
+
+ // Create an object that writes to a file with the specified name.
+ // `FSWritableFile::Append()`s will append after any existing content. If the
+ // file does not already exist, creates it.
+ //
+ // On success, stores a pointer to the file in *result and returns OK. On
+ // failure stores nullptr in *result and returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus ReopenWritableFile(
+ const std::string& /*fname*/, const FileOptions& /*options*/,
+ std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("ReopenWritableFile");
+ }
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ virtual IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg);
+
+ // Open `fname` for random read and write, if file doesn't exist the file
+ // will be created. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
+ const FileOptions& /*options*/,
+ std::unique_ptr<FSRandomRWFile>* /*result*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "RandomRWFile is not implemented in this FileSystem");
+ }
+
+ // Opens `fname` as a memory-mapped file for read and write (in-place updates
+ // only, i.e., no appends). On success, stores a raw buffer covering the whole
+ // file in `*result`. The file must exist prior to this call.
+ virtual IOStatus NewMemoryMappedFileBuffer(
+ const std::string& /*fname*/,
+ std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+ return IOStatus::NotSupported(
+ "MemoryMappedFileBuffer is not implemented in this FileSystem");
+ }
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ virtual IOStatus NewDirectory(const std::string& name,
+ const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) = 0;
+
+ // Returns OK if the named file exists.
+  // NotFound if the named file does not exist, the calling process does
+  // not have permission to determine whether this file exists, or if the
+  // path is invalid.
+  // IOError if an I/O error was encountered.
+ virtual IOStatus FileExists(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *results are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) = 0;
+
+ // Store in *result the attributes of the children of the specified directory.
+ // In case the implementation lists the directory prior to iterating the files
+ // and files are concurrently deleted, the deleted files will be omitted from
+ // result.
+  // The name attributes are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  // NotFound if "dir" does not exist, the calling process does not have
+  // permission to access "dir", or if "dir" is invalid.
+  // IOError if an I/O error was encountered.
+ virtual IOStatus GetChildrenFileAttributes(
+ const std::string& dir, const IOOptions& options,
+ std::vector<FileAttributes>* result, IODebugContext* dbg) {
+ assert(result != nullptr);
+ std::vector<std::string> child_fnames;
+ IOStatus s = GetChildren(dir, options, &child_fnames, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ result->resize(child_fnames.size());
+ size_t result_size = 0;
+ for (size_t i = 0; i < child_fnames.size(); ++i) {
+ const std::string path = dir + "/" + child_fnames[i];
+ if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes,
+ dbg))
+ .ok()) {
+ if (FileExists(path, options, dbg).IsNotFound()) {
+ // The file may have been deleted since we listed the directory
+ continue;
+ }
+ return s;
+ }
+ (*result)[result_size].name = std::move(child_fnames[i]);
+ result_size++;
+ }
+ result->resize(result_size);
+ return IOStatus::OK();
+ }
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+ // Delete the named file.
+ virtual IOStatus DeleteFile(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Truncate the named file to the specified size.
+ virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "Truncate is not supported for this FileSystem");
+ }
+
+  // Create the specified directory. Returns an error if the directory exists.
+ virtual IOStatus CreateDir(const std::string& dirname,
+ const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Creates the directory if missing. Returns OK if it exists, or if it was
+  // successfully created.
+ virtual IOStatus CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Delete the specified directory.
+ virtual IOStatus DeleteDir(const std::string& dirname,
+ const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // Store the size of fname in *file_size.
+ virtual IOStatus GetFileSize(const std::string& fname,
+ const IOOptions& options, uint64_t* file_size,
+ IODebugContext* dbg) = 0;
+
+ // Store the last modification time of fname in *file_mtime.
+ virtual IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) = 0;
+ // Rename file src to target.
+ virtual IOStatus RenameFile(const std::string& src, const std::string& target,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+  // Hard link file src to target.
+ virtual IOStatus LinkFile(const std::string& /*src*/,
+ const std::string& /*target*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "LinkFile is not supported for this FileSystem");
+ }
+
+  // Store the number of hard links to fname in *count.
+  virtual IOStatus NumFileLinks(const std::string& /*fname*/,
+ const IOOptions& /*options*/,
+ uint64_t* /*count*/, IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "Getting number of file links is not supported for this FileSystem");
+ }
+
+  // Store in *res whether the two named paths refer to the same file.
+  virtual IOStatus AreFilesSame(const std::string& /*first*/,
+ const std::string& /*second*/,
+ const IOOptions& /*options*/, bool* /*res*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "AreFilesSame is not supported for this FileSystem");
+ }
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
+ FileLock** lock, IODebugContext* dbg) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+ IODebugContext* dbg) = 0;
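+
+  // Typical usage (a sketch; `fs`, `options`, and the lock path are
+  // illustrative assumptions):
+  //
+  //   FileLock* lock = nullptr;
+  //   IOStatus s = fs->LockFile("/path/to/db/LOCK", options, &lock, nullptr);
+  //   if (s.ok()) {
+  //     // ... exclusive access to the db ...
+  //     s = fs->UnlockFile(lock, options, nullptr);
+  //   }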
+
+ // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+ IODebugContext* dbg) = 0;
+
+  // Creates and returns a default logger (an instance of EnvLogger) for
+  // storing informational messages. Derived classes can override to provide a
+  // custom logger.
+ virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg);
+
+ // Get full directory name for this db.
+ virtual IOStatus GetAbsolutePath(const std::string& db_path,
+ const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) = 0;
+
+ // Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions
+ // copy constructor
+ virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {}
+
+ // OptimizeForLogRead will create a new FileOptions object that is a copy of
+ // the FileOptions in the parameters, but is optimized for reading log files.
+ virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
+
+ // OptimizeForManifestRead will create a new FileOptions object that is a copy
+ // of the FileOptions in the parameters, but is optimized for reading manifest
+ // files.
+ virtual FileOptions OptimizeForManifestRead(
+ const FileOptions& file_options) const;
+
+ // OptimizeForLogWrite will create a new FileOptions object that is a copy of
+ // the FileOptions in the parameters, but is optimized for writing log files.
+ // Default implementation returns the copy of the same object.
+ virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const;
+
+ // OptimizeForManifestWrite will create a new FileOptions object that is a
+ // copy of the FileOptions in the parameters, but is optimized for writing
+ // manifest files. Default implementation returns the copy of the same
+ // object.
+ virtual FileOptions OptimizeForManifestWrite(
+ const FileOptions& file_options) const;
+
+ // OptimizeForCompactionTableWrite will create a new FileOptions object that
+ // is a copy of the FileOptions in the parameters, but is optimized for
+ // writing table files.
+ virtual FileOptions OptimizeForCompactionTableWrite(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& immutable_ops) const;
+
+ // OptimizeForCompactionTableRead will create a new FileOptions object that
+ // is a copy of the FileOptions in the parameters, but is optimized for
+ // reading table files.
+ virtual FileOptions OptimizeForCompactionTableRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const;
+
+ // OptimizeForBlobFileRead will create a new FileOptions object that
+ // is a copy of the FileOptions in the parameters, but is optimized for
+ // reading blob files.
+ virtual FileOptions OptimizeForBlobFileRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+
+ // Get the amount of free disk space
+ virtual IOStatus GetFreeSpace(const std::string& /*path*/,
+ const IOOptions& /*options*/,
+ uint64_t* /*diskfree*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("GetFreeSpace");
+ }
+
+  // Stores in *is_dir whether the given path is a directory.
+  virtual IOStatus IsDirectory(const std::string& /*path*/,
+                               const IOOptions& options, bool* is_dir,
+                               IODebugContext* /*dbg*/) = 0;
+
+ // EXPERIMENTAL
+ // Poll for completion of read IO requests. The Poll() method should call the
+ // callback functions to indicate completion of read requests.
+ // Underlying FS is required to support Poll API. Poll implementation should
+ // ensure that the callback gets called at IO completion, and return only
+ // after the callback has been called.
+  // If Poll returns partial results for any reads, it is the caller's
+  // responsibility to call Read or ReadAsync in order to get the remaining
+  // bytes.
+  //
+  // Default implementation is to return IOStatus::OK.
+ virtual IOStatus Poll(std::vector<void*>& /*io_handles*/,
+ size_t /*min_completions*/) {
+ return IOStatus::OK();
+ }
+
+ // EXPERIMENTAL
+ // Abort the read IO requests submitted asynchronously. Underlying FS is
+  // required to support the AbortIO API. The AbortIO implementation should
+  // ensure that all the read requests related to io_handles are aborted and
+  // that the callback is not called for these io_handles.
+ //
+ // Default implementation is to return IOStatus::OK.
+ virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
+ return IOStatus::OK();
+ }
+
+ // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ private:
+ void operator=(const FileSystem&);
+};
+
+// A file abstraction for reading sequentially through a file
+class FSSequentialFile {
+ public:
+ FSSequentialFile() {}
+
+ virtual ~FSSequentialFile() {}
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // REQUIRES: External synchronization
+ virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) = 0;
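+
+  // A sketch of draining a file with Read (the buffer size and the `file` and
+  // `options` names are illustrative assumptions):
+  //
+  //   char scratch[4096];
+  //   Slice chunk;
+  //   IOStatus s;
+  //   do {
+  //     s = file->Read(sizeof(scratch), options, &chunk, scratch, nullptr);
+  //     // ... process chunk.data() / chunk.size() ...
+  //   } while (s.ok() && chunk.size() == sizeof(scratch));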
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ virtual IOStatus Skip(uint64_t n) = 0;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // Positioned Read for direct I/O
+  // If Direct I/O is enabled, offset, n, and scratch should be properly
+  // aligned.
+ virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+ const IOOptions& /*options*/,
+ Slice* /*result*/, char* /*scratch*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("PositionedRead");
+ }
+
+ // EXPERIMENTAL
+ // When available, returns the actual temperature for the file. This is
+ // useful in case some outside process moves a file from one tier to another,
+ // though the temperature is generally expected not to change while a file is
+ // open.
+ virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+ // If you're adding methods here, remember to add them to
+ // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead and asynchronous Read APIs.
+struct FSReadRequest {
+ // Input parameter that represents the file offset in bytes.
+ uint64_t offset;
+
+ // Input parameter that represents the length to read in bytes. `result` only
+ // returns fewer bytes if end of file is hit (or `status` is not OK).
+ size_t len;
+
+ // A buffer that MultiRead() can optionally place data in. It can
+ // ignore this and allocate its own buffer.
+  // Scratch stays valid until the IO is completed.
+ //
+  // In case of asynchronous reads, it's an output parameter and it will be
+  // maintained until the callback has been called. Scratch is allocated by
+  // RocksDB and will be passed to the underlying FileSystem.
+ char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer and
+  // indicate the number of valid bytes.
+  //
+  // In case of asynchronous reads, this output parameter is set by the async
+  // read APIs to point to the data buffer and indicate the number of valid
+  // bytes.
+  // The result Slice should point into scratch, i.e., the data should
+  // always be read into scratch.
+ Slice result;
+
+ // Output parameter set by underlying FileSystem that represents status of
+ // read request.
+ IOStatus status;
+};
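+
+// A sketch of preparing requests for FSRandomAccessFile::MultiRead below
+// (offsets, sizes, and the `file`/`options` names are illustrative
+// assumptions; each request needs its own scratch space):
+//
+//   char buf0[512];
+//   char buf1[512];
+//   FSReadRequest reqs[2];
+//   reqs[0].offset = 0;
+//   reqs[0].len = sizeof(buf0);
+//   reqs[0].scratch = buf0;
+//   reqs[1].offset = 4096;
+//   reqs[1].len = sizeof(buf1);
+//   reqs[1].scratch = buf1;
+//   IOStatus s = file->MultiRead(reqs, 2, options, nullptr);
+//   if (s.ok()) {
+//     // Each reqs[i].status and reqs[i].result holds a per-request outcome.
+//   }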
+
+// A file abstraction for randomly reading the contents of a file.
+class FSRandomAccessFile {
+ public:
+ FSRandomAccessFile() {}
+
+ virtual ~FSRandomAccessFile() {}
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // Safe for concurrent use by multiple threads.
+  // If Direct I/O is enabled, offset, n, and scratch should be aligned
+  // properly.
+ virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const = 0;
+
+ // Readahead the file starting from offset by n bytes for caching.
+ // If it's not implemented (default: `NotSupported`), RocksDB will create
+ // internal prefetch buffer to improve read performance.
+ virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("Prefetch");
+ }
+
+ // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e., it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping but can be in any order. If the function's return status
+  // is not OK, the status of the individual requests will be ignored and the
+  // return status will be assumed to apply to all read requests. The function
+  // return status is only meant for errors that occur before processing
+  // individual read requests.
+ virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) {
+ assert(reqs != nullptr);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ FSReadRequest& req = reqs[i];
+ req.status =
+ Read(req.offset, req.len, options, &req.result, req.scratch, dbg);
+ }
+ return IOStatus::OK();
+ }
+
+  // Tries to get a unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+  // This function guarantees that, for IDs from a given environment, two
+  // unique IDs cannot be made equal to each other by adding arbitrary bytes to
+  // one of them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0; // Default implementation to prevent issues with backwards
+ // compatibility.
+  }
+
+ enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed };
+
+ virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // EXPERIMENTAL
+  // This API reads the requested data in FSReadRequest asynchronously. This is
+  // an asynchronous call, i.e., it should return after submitting the request.
+ //
+  // When the read request is completed, the callback function specified in cb
+  // should be called with arguments cb_arg and the FSReadRequest, with the
+  // result and status fields updated by the FileSystem.
+  // cb_arg should be used by the callback to track the original request
+  // submitted.
+ //
+ // This API should also populate io_handle which should be used by
+ // underlying FileSystem to store the context in order to distinguish the read
+ // requests at their side and provide the custom deletion function in del_fn.
+ // RocksDB guarantees that the del_fn for io_handle will be called after
+ // receiving the callback. Furthermore, RocksDB guarantees that if it calls
+ // the Poll API for this io_handle, del_fn will be called after the Poll
+ // returns. RocksDB is responsible for managing the lifetime of io_handle.
+ //
+  // req contains the request offset and size passed as input parameters of
+  // the read request; the result and status fields are output parameters set
+  // by the underlying FileSystem. The data should always be read into the
+  // scratch field.
+ //
+ // Default implementation is to read the data synchronously.
+ virtual IOStatus ReadAsync(
+ FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+ void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) {
+ req.status =
+ Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg);
+ cb(req, cb_arg);
+ return IOStatus::OK();
+ }
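+
+  // A sketch of submitting an asynchronous read and waiting on it (the
+  // `fs`, `file`, `opts`, `buf`, and `len` names are illustrative
+  // assumptions):
+  //
+  //   FSReadRequest req;
+  //   req.offset = 0;
+  //   req.len = len;
+  //   req.scratch = buf;
+  //   void* io_handle = nullptr;
+  //   IOHandleDeleter del_fn = nullptr;
+  //   IOStatus s = file->ReadAsync(
+  //       req, opts,
+  //       [](const FSReadRequest& r, void* /*cb_arg*/) {
+  //         // r.status and r.result are populated at completion.
+  //       },
+  //       /*cb_arg=*/nullptr, &io_handle, &del_fn, /*dbg=*/nullptr);
+  //   if (s.ok() && io_handle != nullptr) {
+  //     std::vector<void*> handles{io_handle};
+  //     fs->Poll(handles, 1).PermitUncheckedError();  // wait for completion
+  //     del_fn(io_handle);
+  //   }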
+
+ // EXPERIMENTAL
+ // When available, returns the actual temperature for the file. This is
+ // useful in case some outside process moves a file from one tier to another,
+ // though the temperature is generally expected not to change while a file is
+ // open.
+ virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+ // If you're adding methods here, remember to add them to
+ // RandomAccessFileWrapper too.
+};
+
+// A data structure that carries data verification information, which is
+// used together with the data being written to a file.
+struct DataVerificationInfo {
+ // checksum of the data being written.
+ Slice checksum;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class FSWritableFile {
+ public:
+ FSWritableFile()
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(false) {}
+
+ explicit FSWritableFile(const FileOptions& options)
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+
+ virtual ~FSWritableFile() {}
+
+  // Append data to the end of the file.
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+ virtual IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Append data with verification information.
+ // Note that this API change is experimental and it might be changed in
+ // the future. Currently, RocksDB only generates crc32c based checksum for
+ // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently
+  // ChecksumType::kCRC32C by default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. the checksum verification is not performed).
+ virtual IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& /* verification_info */,
+ IODebugContext* dbg) {
+ return Append(data, options, dbg);
+ }
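+
+  // A sketch of the checksum handoff (hedged: `ComputeCrc32c` is a
+  // placeholder for whatever crc32c routine the caller uses, and the checksum
+  // encoding must match what the FSWritableFile implementation expects):
+  //
+  //   uint32_t crc = ComputeCrc32c(data.data(), data.size());
+  //   char crc_buf[sizeof(crc)];
+  //   memcpy(crc_buf, &crc, sizeof(crc));
+  //   DataVerificationInfo v_info;
+  //   v_info.checksum = Slice(crc_buf, sizeof(crc_buf));
+  //   IOStatus s = file->Append(data, options, v_info, dbg);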
+
+  // PositionedAppend data at the specified offset. The new EOF after the
+  // append must be larger than the previous EOF. This is to be used when
+  // writes are not backed by OS buffers and hence have to always start from
+  // the start of the sector. The implementation thus needs to also rewrite the
+  // last partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+ //
+ // PositionedAppend() can only happen on the page/sector boundaries. For that
+ // reason, if the last write was an incomplete sector we still need to rewind
+ // back to the nearest sector/page and rewrite the portion of it with whatever
+ // we need to add. We need to keep where we stop writing.
+ //
+ // PositionedAppend() can only write whole sectors. For that reason we have to
+ // pad with zeros for the last write and trim the file when closing according
+ // to the position we keep in the previous step.
+ //
+ // PositionedAppend() requires aligned buffer to be passed in. The alignment
+ // required is queried via GetRequiredBufferAlignment()
+ virtual IOStatus PositionedAppend(const Slice& /* data */,
+ uint64_t /* offset */,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("PositionedAppend");
+ }
+
+ // PositionedAppend data with verification information.
+ // Note that this API change is experimental and it might be changed in
+ // the future. Currently, RocksDB only generates crc32c based checksum for
+ // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently
+  // ChecksumType::kCRC32C by default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. the checksum verification is not performed).
+ virtual IOStatus PositionedAppend(
+ const Slice& /* data */, uint64_t /* offset */,
+ const IOOptions& /*options*/,
+ const DataVerificationInfo& /* verification_info */,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("PositionedAppend");
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if further
+  // writes follow this call.
+ virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+ }
+ virtual IOStatus Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) = 0;
+
+ virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+ virtual IOStatus Sync(const IOOptions& options,
+ IODebugContext* dbg) = 0; // sync data
+
+ /*
+ * Sync data and/or metadata as well.
+ * By default, sync only data.
+ * Override this method for environments where we need to sync
+ * metadata as well.
+ */
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+ return Sync(options, dbg);
+ }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush().
+ virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+ write_hint_ = hint;
+ }
+
+ /*
+ * If rate limiting is enabled, change the file-granularity priority used in
+ * rate-limiting writes.
+ *
+ * In the presence of finer-granularity priority such as
+ * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+ * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as
+ * a fallback for Env::IO_TOTAL finer-granularity priority.
+ *
+ * If rate limiting is not enabled, this call has no effect.
+ */
+ virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+ virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+ virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+ /*
+ * Get the size of valid data in the file.
+ */
+ virtual uint64_t GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return 0;
+ }
+
+ /*
+ * Get and set the default pre-allocation block size for writes to
+ * this file. If non-zero, then Allocate will be used to extend the
+ * underlying storage of a file (generally via fallocate) if the Env
+ * instance supports it.
+ */
+ virtual void SetPreallocationBlockSize(size_t size) {
+ preallocation_block_size_ = size;
+ }
+
+ virtual void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) {
+ *last_allocated_block = last_preallocated_block_;
+ *block_size = preallocation_block_size_;
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation does nothing.
+ virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+ const IOOptions& options, IODebugContext* dbg) {
+ if (strict_bytes_per_sync_) {
+ return Sync(options, dbg);
+ }
+ return IOStatus::OK();
+ }
+
+ // PrepareWrite performs any necessary preparation for a write
+ // before the write actually occurs. This allows for pre-allocation
+ // of space on devices where it can result in less file
+ // fragmentation and/or less waste from over-zealous filesystem
+ // pre-allocation.
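+  //
+  // For example (illustrative numbers): with a preallocation block size of
+  // 4 MB and last_preallocated_block_ == 1, a write of 1 MB at offset 6 MB
+  // yields new_last_preallocated_block == ceil(7 MB / 4 MB) == 2, so
+  // Allocate(4 MB, 4 MB, ...) is called and the preallocated region is
+  // extended to 8 MB.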
+ virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) {
+ if (preallocation_block_size_ == 0) {
+ return;
+ }
+ // If this write would cross one or more preallocation blocks,
+ // determine what the last preallocation block necessary to
+ // cover this write would be and Allocate to that point.
+ const auto block_size = preallocation_block_size_;
+ size_t new_last_preallocated_block =
+ (offset + len + block_size - 1) / block_size;
+ if (new_last_preallocated_block > last_preallocated_block_) {
+ size_t num_spanned_blocks =
+ new_last_preallocated_block - last_preallocated_block_;
+ Allocate(block_size * last_preallocated_block_,
+ block_size * num_spanned_blocks, options, dbg)
+ .PermitUncheckedError();
+ last_preallocated_block_ = new_last_preallocated_block;
+ }
+ }
+
+ // Pre-allocates space for a file.
+ virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+ }
+
+ // If you're adding methods here, remember to add them to
+ // WritableFileWrapper too.
+
+ protected:
+ size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+ size_t last_preallocated_block_;
+ size_t preallocation_block_size_;
+ // No copying allowed
+ FSWritableFile(const FSWritableFile&);
+ void operator=(const FSWritableFile&);
+
+ protected:
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class FSRandomRWFile {
+ public:
+ FSRandomRWFile() {}
+
+ virtual ~FSRandomRWFile() {}
+
+ // Indicates if the class makes use of direct I/O
+ // If false you must pass aligned buffer to Write()
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+ virtual IOStatus Write(uint64_t offset, const Slice& data,
+ const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` buffer must be at least `n` bytes in size.
+ //
+ // After call, result->size() < n only if end of file has been
+ // reached (or non-OK status). Read might fail if called again after
+ // first result->size() < n.
+ //
+ // Returns Status::OK() on success.
+ virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const = 0;
+
+ virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+ return Sync(options, dbg);
+ }
+
+ virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // EXPERIMENTAL
+ // When available, returns the actual temperature for the file. This is
+ // useful in case some outside process moves a file from one tier to another,
+ // though the temperature is generally expected not to change while a file is
+ // open.
+ virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+ // If you're adding methods here, remember to add them to
+ // RandomRWFileWrapper too.
+
+ // No copying allowed
+  FSRandomRWFile(const FSRandomRWFile&) = delete;
+  FSRandomRWFile& operator=(const FSRandomRWFile&) = delete;
+};
+
+// FSMemoryMappedFileBuffer object represents a memory-mapped file's raw
+// buffer. Subclasses should release the mapping upon destruction.
+class FSMemoryMappedFileBuffer {
+ public:
+ FSMemoryMappedFileBuffer(void* _base, size_t _length)
+ : base_(_base), length_(_length) {}
+
+ virtual ~FSMemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice, so copying is disallowed. This class
+  // could be made movable if desired.
+ FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete;
+ FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete;
+
+ void* GetBase() const { return base_; }
+ size_t GetLen() const { return length_; }
+
+ protected:
+ void* base_;
+ const size_t length_;
+};
+
+// FSDirectory object represents a collection of files and implements
+// filesystem operations that can be executed on directories.
+class FSDirectory {
+ public:
+ virtual ~FSDirectory() {}
+ // Fsync directory. Can be called concurrently from multiple threads.
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // FsyncWithDirOptions after renaming a file. Depending on the filesystem, it
+  // may fsync the directory or just the renamed file (e.g. btrfs). By default,
+  // it just calls directory fsync.
+ virtual IOStatus FsyncWithDirOptions(
+ const IOOptions& options, IODebugContext* dbg,
+ const DirFsyncOptions& /*dir_fsync_options*/) {
+ return Fsync(options, dbg);
+ }
+
+ // Close directory
+ virtual IOStatus Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("Close");
+ }
+
+  // For documentation, refer to FSRandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0;
+ }
+
+ // If you're adding methods here, remember to add them to
+ // DirectoryWrapper too.
+};
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::FSSequentialFileWrapper {
+// public:
+// MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target):
+// ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {}
+//     IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+//                   char* scratch, IODebugContext* dbg) override {
+// cout << "Doing a read of size " << n << "!" << endl;
+// return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options,
+// result,
+// scratch, dbg);
+// }
+// // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) you don't have to override and forward all methods - the Wrapper
+// forwards everything you're not explicitly overriding, and
+// (b) you don't need to update the wrapper when more methods are added to the
+// rocksdb class, unless you actually want to override the behavior
+// (and unless the rocksdb developers forgot to update the *Wrapper class).
+
+// An implementation of FileSystem that forwards all calls to another
+// FileSystem. May be useful to clients who wish to override just part of the
+// functionality of another FileSystem.
+class FileSystemWrapper : public FileSystem {
+ public:
+  // Initialize a FileSystemWrapper that delegates all calls to *t
+ explicit FileSystemWrapper(const std::shared_ptr<FileSystem>& t);
+ ~FileSystemWrapper() override {}
+
+  // Return the target to which this FileSystem forwards all calls
+ FileSystem* target() const { return target_.get(); }
+
+ // The following text is boilerplate that forwards all methods to target()
+ IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewSequentialFile(f, file_opts, r, dbg);
+ }
+ IOStatus NewRandomAccessFile(const std::string& f,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewRandomAccessFile(f, file_opts, r, dbg);
+ }
+ IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewWritableFile(f, file_opts, r, dbg);
+ }
+ IOStatus ReopenWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ return target_->ReopenWritableFile(fname, file_opts, result, dbg);
+ }
+ IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override {
+ return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg);
+ }
+ IOStatus NewRandomRWFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override {
+ return target_->NewRandomRWFile(fname, file_opts, result, dbg);
+ }
+ IOStatus NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+ return target_->NewMemoryMappedFileBuffer(fname, result);
+ }
+ IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override {
+ return target_->NewDirectory(name, io_opts, result, dbg);
+ }
+ IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+ IODebugContext* dbg) override {
+ return target_->FileExists(f, io_opts, dbg);
+ }
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override {
+ return target_->GetChildren(dir, io_opts, r, dbg);
+ }
+ IOStatus GetChildrenFileAttributes(const std::string& dir,
+ const IOOptions& options,
+ std::vector<FileAttributes>* result,
+ IODebugContext* dbg) override {
+ return target_->GetChildrenFileAttributes(dir, options, result, dbg);
+ }
+ IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->DeleteFile(f, options, dbg);
+ }
+ IOStatus Truncate(const std::string& fname, size_t size,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Truncate(fname, size, options, dbg);
+ }
+ IOStatus CreateDir(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->CreateDir(d, options, dbg);
+ }
+ IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->CreateDirIfMissing(d, options, dbg);
+ }
+ IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->DeleteDir(d, options, dbg);
+ }
+ IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+ uint64_t* s, IODebugContext* dbg) override {
+ return target_->GetFileSize(f, options, s, dbg);
+ }
+
+ IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) override {
+ return target_->GetFileModificationTime(fname, options, file_mtime, dbg);
+ }
+
+ IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) override {
+ return target_->GetAbsolutePath(db_path, options, output_path, dbg);
+ }
+
+ IOStatus RenameFile(const std::string& s, const std::string& t,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->RenameFile(s, t, options, dbg);
+ }
+
+ IOStatus LinkFile(const std::string& s, const std::string& t,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->LinkFile(s, t, options, dbg);
+ }
+
+ IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
+ uint64_t* count, IODebugContext* dbg) override {
+ return target_->NumFileLinks(fname, options, count, dbg);
+ }
+
+ IOStatus AreFilesSame(const std::string& first, const std::string& second,
+ const IOOptions& options, bool* res,
+ IODebugContext* dbg) override {
+ return target_->AreFilesSame(first, second, options, res, dbg);
+ }
+
+ IOStatus LockFile(const std::string& f, const IOOptions& options,
+ FileLock** l, IODebugContext* dbg) override {
+ return target_->LockFile(f, options, l, dbg);
+ }
+
+ IOStatus UnlockFile(FileLock* l, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->UnlockFile(l, options, dbg);
+ }
+
+ IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+ IODebugContext* dbg) override {
+ return target_->GetTestDirectory(options, path, dbg);
+ }
+ IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) override {
+ return target_->NewLogger(fname, options, result, dbg);
+ }
+
+ void SanitizeFileOptions(FileOptions* opts) const override {
+ target_->SanitizeFileOptions(opts);
+ }
+
+ FileOptions OptimizeForLogRead(
+ const FileOptions& file_options) const override {
+ return target_->OptimizeForLogRead(file_options);
+ }
+ FileOptions OptimizeForManifestRead(
+ const FileOptions& file_options) const override {
+ return target_->OptimizeForManifestRead(file_options);
+ }
+ FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const override {
+ return target_->OptimizeForLogWrite(file_options, db_options);
+ }
+ FileOptions OptimizeForManifestWrite(
+ const FileOptions& file_options) const override {
+ return target_->OptimizeForManifestWrite(file_options);
+ }
+ FileOptions OptimizeForCompactionTableWrite(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& immutable_ops) const override {
+ return target_->OptimizeForCompactionTableWrite(file_options,
+ immutable_ops);
+ }
+ FileOptions OptimizeForCompactionTableRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const override {
+ return target_->OptimizeForCompactionTableRead(file_options, db_options);
+ }
+ FileOptions OptimizeForBlobFileRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const override {
+ return target_->OptimizeForBlobFileRead(file_options, db_options);
+ }
+ IOStatus GetFreeSpace(const std::string& path, const IOOptions& options,
+ uint64_t* diskfree, IODebugContext* dbg) override {
+ return target_->GetFreeSpace(path, options, diskfree, dbg);
+ }
+ IOStatus IsDirectory(const std::string& path, const IOOptions& options,
+ bool* is_dir, IODebugContext* dbg) override {
+ return target_->IsDirectory(path, options, is_dir, dbg);
+ }
+
+ const Customizable* Inner() const override { return target_.get(); }
+ Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+ std::string SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const override;
+#endif // ROCKSDB_LITE
+
+  IOStatus Poll(std::vector<void*>& io_handles,
+                size_t min_completions) override {
+ return target_->Poll(io_handles, min_completions);
+ }
+
+  IOStatus AbortIO(std::vector<void*>& io_handles) override {
+ return target_->AbortIO(io_handles);
+ }
+
+ protected:
+ std::shared_ptr<FileSystem> target_;
+};
+
+class FSSequentialFileWrapper : public FSSequentialFile {
+ public:
+  // Creates a FileWrapper around the input File object without
+  // taking ownership of the object
+ explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {}
+
+ FSSequentialFile* target() const { return target_; }
+
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override {
+ return target_->Read(n, options, result, scratch, dbg);
+ }
+ IOStatus Skip(uint64_t n) override { return target_->Skip(n); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ return target_->PositionedRead(offset, n, options, result, scratch, dbg);
+ }
+ Temperature GetTemperature() const override {
+ return target_->GetTemperature();
+ }
+
+ private:
+ FSSequentialFile* target_;
+};
+
+class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper {
+ public:
+ // Creates a FileWrapper around the input File object and takes
+ // ownership of the object
+ explicit FSSequentialFileOwnerWrapper(std::unique_ptr<FSSequentialFile>&& t)
+ : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+ std::unique_ptr<FSSequentialFile> guard_;
+};
+
+class FSRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  // Creates a FileWrapper around the input File object without
+  // taking ownership of the object
+ explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {}
+
+ FSRandomAccessFile* target() const { return target_; }
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ return target_->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->MultiRead(reqs, num_reqs, options, dbg);
+ }
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Prefetch(offset, n, options, dbg);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+  }
+ void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ IODebugContext* dbg) override {
+ return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+ }
+ Temperature GetTemperature() const override {
+ return target_->GetTemperature();
+ }
+
+ private:
+ FSRandomAccessFile* target_;
+};
+
+class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper {
+ public:
+ // Creates a FileWrapper around the input File object and takes
+ // ownership of the object
+ explicit FSRandomAccessFileOwnerWrapper(
+ std::unique_ptr<FSRandomAccessFile>&& t)
+ : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+ std::unique_ptr<FSRandomAccessFile> guard_;
+};
+
+class FSWritableFileWrapper : public FSWritableFile {
+ public:
+  // Creates a FileWrapper around the input File object without
+  // taking ownership of the object
+ explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {}
+
+ FSWritableFile* target() const { return target_; }
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Append(data, options, dbg);
+ }
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override {
+ return target_->Append(data, options, verification_info, dbg);
+ }
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->PositionedAppend(data, offset, options, dbg);
+ }
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override {
+ return target_->PositionedAppend(data, offset, options, verification_info,
+ dbg);
+ }
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Truncate(size, options, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Close(options, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+ bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+
+ void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+ target_->SetWriteLifeTimeHint(hint);
+ }
+
+ Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+ return target_->GetWriteLifeTimeHint();
+ }
+
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->GetFileSize(options, dbg);
+ }
+
+ void SetPreallocationBlockSize(size_t size) override {
+ target_->SetPreallocationBlockSize(size);
+ }
+
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ target_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->RangeSync(offset, nbytes, options, dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ target_->PrepareWrite(offset, len, options, dbg);
+ }
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Allocate(offset, len, options, dbg);
+ }
+
+ private:
+ FSWritableFile* target_;
+};
+
+class FSWritableFileOwnerWrapper : public FSWritableFileWrapper {
+ public:
+ // Creates a FileWrapper around the input File object and takes
+ // ownership of the object
+ explicit FSWritableFileOwnerWrapper(std::unique_ptr<FSWritableFile>&& t)
+ : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+ std::unique_ptr<FSWritableFile> guard_;
+};
+
+class FSRandomRWFileWrapper : public FSRandomRWFile {
+ public:
+  // Creates a FileWrapper around the input File object without
+  // taking ownership of the object
+ explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {}
+
+ FSRandomRWFile* target() const { return target_; }
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Write(offset, data, options, dbg);
+ }
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ return target_->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Close(options, dbg);
+ }
+ Temperature GetTemperature() const override {
+ return target_->GetTemperature();
+ }
+
+ private:
+ FSRandomRWFile* target_;
+};
+
+class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper {
+ public:
+ // Creates a FileWrapper around the input File object and takes
+ // ownership of the object
+ explicit FSRandomRWFileOwnerWrapper(std::unique_ptr<FSRandomRWFile>&& t)
+ : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+ std::unique_ptr<FSRandomRWFile> guard_;
+};
+
+class FSDirectoryWrapper : public FSDirectory {
+ public:
+ // Creates a FileWrapper around the input File object and takes
+ // ownership of the object
+ explicit FSDirectoryWrapper(std::unique_ptr<FSDirectory>&& t)
+ : guard_(std::move(t)) {
+ target_ = guard_.get();
+ }
+
+  // Creates a FileWrapper around the input File object without
+  // taking ownership of the object
+ explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {}
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+
+ IOStatus FsyncWithDirOptions(
+ const IOOptions& options, IODebugContext* dbg,
+ const DirFsyncOptions& dir_fsync_options) override {
+ return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options);
+ }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Close(options, dbg);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ private:
+ std::unique_ptr<FSDirectory> guard_;
+ FSDirectory* target_;
+};
+
+// A utility routine: write "data" to the named file.
+extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
+ const std::string& fname,
+ bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
+ std::string* data);
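+
+// Example (a minimal sketch; "fs" is assumed to be a valid FileSystem*):
+//   IOStatus s = WriteStringToFile(fs, Slice("hello"), "/tmp/example", true);
+//   std::string contents;
+//   if (s.ok()) {
+//     s = ReadFileToString(fs, "/tmp/example", &contents);
+//   }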
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h
new file mode 100644
index 000000000..954d15b4a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/filter_policy.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys. These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct BlockBasedTableOptions;
+struct ConfigOptions;
+
+// As of RocksDB 7.0, the details of these classes are internal
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// Contextual information passed to BloomFilterPolicy at filter building time.
+// Used in overriding FilterPolicy::GetBuilderWithContext(). References other
+// structs because this is expected to be a temporary, stack-allocated object.
+struct FilterBuildingContext {
+ // This constructor is for internal use only and subject to change.
+ FilterBuildingContext(const BlockBasedTableOptions& table_options);
+
+ // Options for the table being built
+ const BlockBasedTableOptions& table_options;
+
+ // BEGIN from (DB|ColumnFamily)Options in effect at table creation time
+ CompactionStyle compaction_style = kCompactionStyleLevel;
+
+ // Number of LSM levels, or -1 if unknown
+ int num_levels = -1;
+
+ // An optional logger for reporting errors, warnings, etc.
+ Logger* info_log = nullptr;
+ // END from (DB|ColumnFamily)Options
+
+ // Name of the column family for the table (or empty string if unknown)
+ // TODO: consider changing to Slice
+ std::string column_family_name;
+
+ // The table level at time of constructing the SST file, or -1 if unknown
+ // or N/A as in SstFileWriter. (The table file could later be used at a
+ // different level.)
+ int level_at_creation = -1;
+
+ // True if known to be going into bottommost sorted run for applicable
+ // key range (which might not even be last level with data). False
+ // otherwise.
+ bool is_bottommost = false;
+
+ // Reason for creating the file with the filter
+ TableFileCreationReason reason = TableFileCreationReason::kMisc;
+};
+
+// Determines what kind of filter (if any) to generate in SST files, and under
+// which conditions. API users can create custom filter policies that
+// defer to other built-in policies (see NewBloomFilterPolicy and
+// NewRibbonFilterPolicy) based on the context provided to
+// GetBuilderWithContext.
+class FilterPolicy : public Customizable {
+ public:
+ virtual ~FilterPolicy();
+ static const char* Type() { return "FilterPolicy"; }
+
+ // The name used for identifying whether a filter on disk is readable
+ // by this FilterPolicy. If this FilterPolicy is part of a family that
+  // can read each other's filters, such as the built-in BloomFilterPolicy and
+ // RibbonFilterPolicy, the CompatibilityName is a shared family name,
+ // while kinds of filters in the family can have distinct Customizable
+ // Names. This function is pure virtual so that wrappers around built-in
+ // policies are prompted to defer to CompatibilityName() of the wrapped
+ // policy, which is important for compatibility.
+ //
+ // For custom filter policies that are not part of a read-compatible
+ // family (rare), implementations may return Name().
+ virtual const char* CompatibilityName() const = 0;
+
+  // Creates a new FilterPolicy based on the input value string and returns the
+  // result. The value might be an ID, an ID with properties, or an old-style
+  // policy string that describes the FilterPolicy being created.
+  // For Bloom filters, value may be a ":"-delimited value of the form:
+  //   "bloomfilter:[bits_per_key]",
+  //   e.g. "bloomfilter:4"
+  // The above string is equivalent to calling NewBloomFilterPolicy(4).
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<const FilterPolicy>* result);
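+  //
+  // Example (a sketch based on the format described above):
+  //   ConfigOptions config_options;
+  //   std::shared_ptr<const FilterPolicy> policy;
+  //   Status s = FilterPolicy::CreateFromString(config_options,
+  //                                             "bloomfilter:4", &policy);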
+
+ // Return a new FilterBitsBuilder for constructing full or partitioned
+ // filter blocks, or return nullptr to indicate "no filter". Custom
+ // implementations should defer to a built-in FilterPolicy to get a
+ // new FilterBitsBuilder, but the FilterBuildingContext can be used
+ // to decide which built-in FilterPolicy to defer to.
+ virtual FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext&) const = 0;
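+  //
+  // Example (a sketch of a custom policy deferring by context; "bloom_" and
+  // "ribbon_" are hypothetical members holding built-in policies):
+  //   FilterBitsBuilder* GetBuilderWithContext(
+  //       const FilterBuildingContext& ctx) const override {
+  //     return ctx.is_bottommost ? ribbon_->GetBuilderWithContext(ctx)
+  //                              : bloom_->GetBuilderWithContext(ctx);
+  //   }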
+
+ // Return a new FilterBitsReader for full or partitioned filter blocks.
+ // Caller retains ownership of any buffer pointed to by the input Slice.
+ // Custom implementation should defer to GetFilterBitsReader on any
+ // built-in FilterPolicy, which can read filters generated by any other
+ // built-in FilterPolicy.
+ virtual FilterBitsReader* GetFilterBitsReader(
+ const Slice& /*contents*/) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key. See
+// https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+//
+// bits_per_key: average bits allocated per key in bloom filter. A good
+// choice is 9.9, which yields a filter with ~ 1% false positive rate.
+// When format_version < 5, the value will be rounded to the nearest
+// integer. Recommend using no more than three decimal digits after the
+// decimal point, as in 6.667.
+//
+// To avoid configurations that are unlikely to produce good filtering
+// value for the CPU overhead, bits_per_key < 0.5 is rounded down to 0.0
+// which means "generate no filter", and 0.5 <= bits_per_key < 1.0 is
+// rounded up to 1.0, for a 62% FP rate.
+//
+// The caller is responsible for eventually deleting the result, though
+// this is typically handled automatically with BlockBasedTableOptions:
+// table_options.filter_policy.reset(NewBloomFilterPolicy(...));
+//
+// As of RocksDB 7.0, the use_block_based_builder parameter is ignored.
+// (The old, inefficient block-based filter is no longer accessible in
+// the public API.)
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys. For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(
+ double bits_per_key, bool IGNORED_use_block_based_builder = false);
+
+// A new Bloom alternative that saves about 30% space compared to
+// Bloom filters, with similar query times but roughly 3-4x CPU time
+// and 3x temporary space usage during construction. For example, if
+// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same
+// 0.95% FP rate as Bloom filter but only using about 7 bits per key.
+//
+// The space savings of Ribbon filters make sense for lower (higher
+// numbered; larger; longer-lived) levels of the LSM, whereas the speed of
+// Bloom filters makes sense for the highest levels of the LSM. Setting
+// bloom_before_level allows for this design with Level and Universal
+// compaction styles. For example, bloom_before_level=1 means that Bloom
+// filters will be used in level 0, including flushes, and Ribbon
+// filters elsewhere, including FIFO compaction and external SST files.
+// For this option, memtable flushes are considered level -1 (so that
+// flushes can be distinguished from intra-L0 compaction).
+// bloom_before_level=0 (default) -> Generate Bloom filters only for
+// flushes under Level and Universal compaction styles.
+// bloom_before_level=-1 -> Always generate Ribbon filters (except in
+// some extreme or exceptional cases).
+//
+// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier
+// versions reading the data will behave as if no filter was used
+// (degraded performance until compaction rebuilds filters). All
+// built-in FilterPolicies (Bloom or Ribbon) are able to read other
+// kinds of built-in filters.
+//
+// Note: the current Ribbon filter schema uses some extra resources
+// when constructing very large filters. For example, for 100 million
+// keys in a single filter (one SST file without partitioned filters),
+// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom.
+// However, the savings in filter space from just ~60 open SST files
+// makes up for the additional temporary memory use.
+//
+// Also consider using optimize_filters_for_memory to save filter
+// memory.
+extern const FilterPolicy* NewRibbonFilterPolicy(
+ double bloom_equivalent_bits_per_key, int bloom_before_level = 0);
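+
+// Example (a sketch, mirroring the filter_policy reset shown above):
+//   BlockBasedTableOptions table_options;
+//   table_options.filter_policy.reset(
+//       NewRibbonFilterPolicy(/*bloom_equivalent_bits_per_key=*/10,
+//                             /*bloom_before_level=*/1));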
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h
new file mode 100644
index 000000000..7a5dd957e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class BlockBuilder;
+struct ConfigOptions;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FlushBlockPolicy {
+ public:
+  // Keeps track of the key/value sequence and returns a boolean value to
+  // determine whether the table builder should flush the current data block.
+ virtual bool Update(const Slice& key, const Slice& value) = 0;
+
+ virtual ~FlushBlockPolicy() {}
+};
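+
+// Example (a sketch of a hypothetical custom policy that flushes the current
+// block after every n entries; not part of this header):
+//   class FlushEveryNKeysPolicy : public FlushBlockPolicy {
+//    public:
+//     explicit FlushEveryNKeysPolicy(size_t n) : n_(n) {}
+//     bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+//       return ++count_ % n_ == 0;
+//     }
+//
+//    private:
+//     const size_t n_;
+//     size_t count_ = 0;
+//   };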
+
+class FlushBlockPolicyFactory : public Customizable {
+ public:
+ static const char* Type() { return "FlushBlockPolicyFactory"; }
+
+ // Creates a FlushBlockPolicyFactory based on the input value.
+ // By default, this method can create EveryKey or BySize PolicyFactory,
+  // which take no config_options.
+ static Status CreateFromString(
+ const ConfigOptions& config_options, const std::string& value,
+ std::shared_ptr<FlushBlockPolicyFactory>* result);
+
+ // Return a new block flush policy that flushes data blocks by data size.
+ // FlushBlockPolicy may need to access the metadata of the data block
+ // builder to determine when to flush the blocks.
+ //
+ // Callers must delete the result after any database that is using the
+ // result has been closed.
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const = 0;
+
+ virtual ~FlushBlockPolicyFactory() {}
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ FlushBlockBySizePolicyFactory();
+
+ static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const override;
+
+ static FlushBlockPolicy* NewFlushBlockPolicy(
+ const uint64_t size, const int deviation,
+ const BlockBuilder& data_block_builder);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/functor_wrapper.h b/src/rocksdb/include/rocksdb/functor_wrapper.h
new file mode 100644
index 000000000..17b021bf7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/functor_wrapper.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+template <std::size_t...>
+struct IndexSequence {};
+
+template <std::size_t N, std::size_t... Next>
+struct IndexSequenceHelper
+ : public IndexSequenceHelper<N - 1U, N - 1U, Next...> {};
+
+template <std::size_t... Next>
+struct IndexSequenceHelper<0U, Next...> {
+ using type = IndexSequence<Next...>;
+};
+
+template <std::size_t N>
+using make_index_sequence = typename IndexSequenceHelper<N>::type;
+
+template <typename Function, typename Tuple, size_t... I>
+void call(Function f, Tuple t, IndexSequence<I...>) {
+ f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+void call(Function f, Tuple t) {
+ static constexpr auto size = std::tuple_size<Tuple>::value;
+ call(f, t, make_index_sequence<size>{});
+}
+} // namespace detail
+
+template <typename... Args>
+class FunctorWrapper {
+ public:
+ explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+ : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
+
+ void invoke() { detail::call(functor_, args_); }
+
+ private:
+ std::function<void(Args...)> functor_;
+ std::tuple<Args...> args_;
+};
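+
+// Example (a minimal sketch):
+//   FunctorWrapper<int, std::string> w(
+//       [](int n, std::string s) { printf("%d %s\n", n, s.c_str()); },
+//       42, std::string("hello"));
+//   w.invoke();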
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h
new file mode 100644
index 000000000..0bf5e939a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/io_status.h
@@ -0,0 +1,244 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// An IOStatus encapsulates the result of an operation. It may indicate
+// success, or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on an IOStatus without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same IOStatus must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+
+#include "status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IOStatus : public Status {
+ public:
+ using Code = Status::Code;
+ using SubCode = Status::SubCode;
+
+ enum IOErrorScope : unsigned char {
+ kIOErrorScopeFileSystem,
+ kIOErrorScopeFile,
+ kIOErrorScopeRange,
+ kIOErrorScopeMax,
+ };
+
+ // Create a success status.
+ IOStatus() : IOStatus(kOk, kNone) {}
+ ~IOStatus() {}
+
+ // Copy the specified status.
+ IOStatus(const IOStatus& s);
+ IOStatus& operator=(const IOStatus& s);
+ IOStatus(IOStatus&& s) noexcept;
+ IOStatus& operator=(IOStatus&& s) noexcept;
+ bool operator==(const IOStatus& rhs) const;
+ bool operator!=(const IOStatus& rhs) const;
+
+ void SetRetryable(bool retryable) { retryable_ = retryable; }
+ void SetDataLoss(bool data_loss) { data_loss_ = data_loss; }
+ void SetScope(IOErrorScope scope) {
+ scope_ = static_cast<unsigned char>(scope);
+ }
+
+ bool GetRetryable() const { return retryable_; }
+ bool GetDataLoss() const { return data_loss_; }
+ IOErrorScope GetScope() const { return static_cast<IOErrorScope>(scope_); }
+
+ // Return a success status.
+ static IOStatus OK() { return IOStatus(); }
+
+ static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kNotSupported, msg, msg2);
+ }
+ static IOStatus NotSupported(SubCode msg = kNone) {
+ return IOStatus(kNotSupported, msg);
+ }
+
+ // Return error status of an appropriate type.
+ static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kNotFound, msg, msg2);
+ }
+  // Fast path for not found without malloc.
+ static IOStatus NotFound(SubCode msg = kNone) {
+ return IOStatus(kNotFound, msg);
+ }
+
+ static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kCorruption, msg, msg2);
+ }
+ static IOStatus Corruption(SubCode msg = kNone) {
+ return IOStatus(kCorruption, msg);
+ }
+
+ static IOStatus InvalidArgument(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return IOStatus(kInvalidArgument, msg, msg2);
+ }
+ static IOStatus InvalidArgument(SubCode msg = kNone) {
+ return IOStatus(kInvalidArgument, msg);
+ }
+
+ static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, msg, msg2);
+ }
+ static IOStatus IOError(SubCode msg = kNone) {
+ return IOStatus(kIOError, msg);
+ }
+
+ static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); }
+ static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kBusy, msg, msg2);
+ }
+
+ static IOStatus TimedOut(SubCode msg = kNone) {
+ return IOStatus(kTimedOut, msg);
+ }
+ static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kTimedOut, msg, msg2);
+ }
+
+ static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); }
+ static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, kNoSpace, msg, msg2);
+ }
+
+ static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); }
+ static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, kPathNotFound, msg, msg2);
+ }
+
+ static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); }
+ static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, kIOFenced, msg, msg2);
+ }
+
+ static IOStatus Aborted(SubCode msg = kNone) {
+ return IOStatus(kAborted, msg);
+ }
+ static IOStatus Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kAborted, msg, msg2);
+ }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ // std::string ToString() const;
+
+ private:
+ friend IOStatus status_to_io_status(Status&&);
+
+ explicit IOStatus(Code _code, SubCode _subcode = kNone)
+ : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {}
+
+ IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2);
+ IOStatus(Code _code, const Slice& msg, const Slice& msg2)
+ : IOStatus(_code, kNone, msg, msg2) {}
+};
+
+inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg,
+ const Slice& msg2)
+ : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {
+ assert(code_ != kOk);
+ assert(subcode_ != kMaxSubCode);
+ const size_t len1 = msg.size();
+ const size_t len2 = msg2.size();
+ const size_t size = len1 + (len2 ? (2 + len2) : 0);
+ char* const result = new char[size + 1]; // +1 for null terminator
+ memcpy(result, msg.data(), len1);
+ if (len2) {
+ result[len1] = ':';
+ result[len1 + 1] = ' ';
+ memcpy(result + len1 + 2, msg2.data(), len2);
+ }
+ result[size] = '\0'; // null terminator for C style string
+ state_.reset(result);
+}
+
+inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ s.checked_ = true;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline IOStatus& IOStatus::operator=(const IOStatus& s) {
+ // The following condition catches both aliasing (when this == &s),
+ // and the common case where both s and *this are ok.
+ if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ s.checked_ = true;
+ checked_ = false;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ code_ = s.code_;
+ subcode_ = s.subcode_;
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+ }
+ return *this;
+}
+
+inline IOStatus::IOStatus(IOStatus&& s) noexcept : IOStatus() {
+ *this = std::move(s);
+}
+
+inline IOStatus& IOStatus::operator=(IOStatus&& s) noexcept {
+ if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ s.checked_ = true;
+ checked_ = false;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ code_ = std::move(s.code_);
+ s.code_ = kOk;
+ subcode_ = std::move(s.subcode_);
+ s.subcode_ = kNone;
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ s.scope_ = kIOErrorScopeFileSystem;
+ state_ = std::move(s.state_);
+ }
+ return *this;
+}
+
+inline bool IOStatus::operator==(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ checked_ = true;
+ rhs.checked_ = true;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ return (code_ == rhs.code_);
+}
+
+inline bool IOStatus::operator!=(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ checked_ = true;
+ rhs.checked_ = true;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ return !(*this == rhs);
+}
+
+inline IOStatus status_to_io_status(Status&& status) {
+ IOStatus io_s;
+ Status& s = io_s;
+ s = std::move(status);
+ return io_s;
+}
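+
+// Example (a sketch): converting a Status into an IOStatus and tagging it:
+//   IOStatus io_s = status_to_io_status(Status::IOError("write failed"));
+//   io_s.SetRetryable(true);
+//   io_s.SetScope(IOStatus::kIOErrorScopeFile);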
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h
new file mode 100644
index 000000000..559d44c57
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iostats_context.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+// A thread local context for gathering io-stats efficiently and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL: the IO statistics for tiered storage. Each field matches an
+// item in the Temperature class.
+struct FileIOByTemperature {
+  // the number of bytes read from Temperature::kHot files
+  uint64_t hot_file_bytes_read;
+  // the number of bytes read from Temperature::kWarm files
+  uint64_t warm_file_bytes_read;
+  // the number of bytes read from Temperature::kCold files
+  uint64_t cold_file_bytes_read;
+  // total number of reads from Temperature::kHot files
+  uint64_t hot_file_read_count;
+  // total number of reads from Temperature::kWarm files
+  uint64_t warm_file_read_count;
+  // total number of reads from Temperature::kCold files
+  uint64_t cold_file_read_count;
+ // reset all the statistics to 0.
+ void Reset() {
+ hot_file_bytes_read = 0;
+ warm_file_bytes_read = 0;
+ cold_file_bytes_read = 0;
+ hot_file_read_count = 0;
+ warm_file_read_count = 0;
+ cold_file_read_count = 0;
+ }
+};
+
+struct IOStatsContext {
+ // reset all io-stats counter to zero
+ void Reset();
+
+ std::string ToString(bool exclude_zero_counters = false) const;
+
+ // the thread pool id
+ uint64_t thread_pool_id;
+
+  // number of bytes that have been written.
+  uint64_t bytes_written;
+  // number of bytes that have been read.
+  uint64_t bytes_read;
+
+ // time spent in open() and fopen().
+ uint64_t open_nanos;
+ // time spent in fallocate().
+ uint64_t allocate_nanos;
+ // time spent in write() and pwrite().
+ uint64_t write_nanos;
+ // time spent in read() and pread()
+ uint64_t read_nanos;
+ // time spent in sync_file_range().
+ uint64_t range_sync_nanos;
+ // time spent in fsync
+ uint64_t fsync_nanos;
+ // time spent in preparing write (fallocate etc).
+ uint64_t prepare_write_nanos;
+ // time spent in Logger::Logv().
+ uint64_t logger_nanos;
+ // CPU time spent in write() and pwrite()
+ uint64_t cpu_write_nanos;
+ // CPU time spent in read() and pread()
+ uint64_t cpu_read_nanos;
+
+ FileIOByTemperature file_io_stats_by_temperature;
+
+  // Whether iostats follows PerfLevel is not consistent: timer counters
+  // follow it, but BackupEngine relies on the counter metrics to always be
+  // there. Here we create a backdoor option to disable some counters, so that
+  // existing stats are not polluted by file operations, such as logging, by
+  // turning this off.
+ bool disable_iostats = false;
+};
+
+// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
+// non-thread-local IOStatsContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be a no-op.
+// Otherwise, a pointer to a thread-local IOStatsContext object will be
+// returned.
+//
+// This function never returns nullptr.
+IOStatsContext* get_iostats_context();
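+
+// Example (a sketch): measuring bytes written by a block of code:
+//   IOStatsContext* stats = get_iostats_context();
+//   stats->Reset();
+//   ... perform file writes through RocksDB ...
+//   uint64_t written = stats->bytes_written;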
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h
new file mode 100644
index 000000000..9d4c9f73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iterator.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/cleanable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator : public Cleanable {
+ public:
+ Iterator() {}
+ // No copying allowed
+ Iterator(const Iterator&) = delete;
+ void operator=(const Iterator&) = delete;
+
+ virtual ~Iterator() {}
+
+ // An iterator is either positioned at a key/value pair, or
+ // not valid. This method returns true iff the iterator is valid.
+ // Always returns false if !status().ok().
+ virtual bool Valid() const = 0;
+
+ // Position at the first key in the source. The iterator is Valid()
+ // after this call iff the source is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last key in the source. The iterator is
+ // Valid() after this call iff the source is not empty.
+ virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ // All Seek*() methods clear any error status() that the iterator had prior to
+ // the call; after the seek, status() indicates only the error (if any) that
+ // happened during the seek, not any past errors.
+ // Target does not contain timestamp.
+ virtual void Seek(const Slice& target) = 0;
+
+  // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ // Target does not contain timestamp.
+ virtual void SeekForPrev(const Slice& target) = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Moves to the previous entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the first entry in source.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of the
+ // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev
+ // operation).
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return the value for the current entry. If the entry is a plain key-value,
+ // return the value as-is; if it is a wide-column entity, return the value of
+ // the default anonymous column (see kDefaultWideColumnName) if any, or an
+ // empty value otherwise. The underlying storage for the returned slice is
+ // valid only until the next modification of the iterator (i.e. the next
+ // SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+ // REQUIRES: Valid()
+ virtual Slice value() const = 0;
+
+ // Return the wide columns for the current entry. If the entry is a
+ // wide-column entity, return it as-is; if it is a plain key-value, return it
+ // as an entity with a single anonymous column (see kDefaultWideColumnName)
+ // which contains the value. The underlying storage for the returned
+ // structure is valid only until the next modification of the iterator (i.e.
+ // the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+ // REQUIRES: Valid()
+ virtual const WideColumns& columns() const {
+ assert(false);
+ return kNoWideColumns;
+ }
+
+ // If an error has occurred, return it. Else return an ok status.
+ // If non-blocking IO is requested and this operation cannot be
+ // satisfied without doing some IO, then this returns Status::Incomplete().
+ virtual Status status() const = 0;
+
+ // If supported, renew the iterator to represent the latest state. The
+ // iterator will be invalidated after the call. Not supported if
+ // ReadOptions.snapshot is given when creating the iterator.
+ virtual Status Refresh() {
+ return Status::NotSupported("Refresh() is not supported");
+ }
+
+ // Property "rocksdb.iterator.is-key-pinned":
+ // If returning "1", this means that the Slice returned by key() is valid
+ // as long as the iterator is not deleted.
+ // It is guaranteed to always return "1" if
+ // - Iterator created with ReadOptions::pin_data = true
+ // - DB tables were created with
+ // BlockBasedTableOptions::use_delta_encoding = false.
+ // Property "rocksdb.iterator.super-version-number":
+ // LSM version used by the iterator. The same format as DB Property
+ // kCurrentSuperVersionNumber. See its comment for more information.
+ // Property "rocksdb.iterator.internal-key":
+ // Get the user-key portion of the internal key at which the iteration
+ // stopped.
+ virtual Status GetProperty(std::string prop_name, std::string* prop);
+
+ virtual Slice timestamp() const {
+ assert(false);
+ return Slice();
+ }
+};
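+
+// Example (a sketch; "it" is assumed to come from DB::NewIterator(), and
+// "Process" is a hypothetical helper):
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     Process(it->key(), it->value());
+//   }
+//   assert(it->status().ok());  // check for errors encountered during scan
+//   delete it;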
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h
new file mode 100644
index 000000000..7408cbc87
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/ldb_tool.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An interface for converting a slice to a readable string
+class SliceFormatter {
+ public:
+ virtual ~SliceFormatter() {}
+ virtual std::string Format(const Slice& s) const = 0;
+};
+
+// Options for customizing ldb tool (beyond the DB Options)
+struct LDBOptions {
+ // Create LDBOptions with default values for all fields
+ LDBOptions();
+
+ // Key formatter that converts a slice to a readable string.
+ // Default: Slice::ToString()
+ std::shared_ptr<SliceFormatter> key_formatter;
+
+ std::string print_help_header = "ldb - RocksDB Tool";
+};
+
+class LDBTool {
+ public:
+ void Run(
+ int argc, char** argv, Options db_options = Options(),
+ const LDBOptions& ldb_options = LDBOptions(),
+ const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
+};
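+
+// Example (a sketch of a minimal ldb-style main()):
+//   int main(int argc, char** argv) {
+//     ROCKSDB_NAMESPACE::LDBTool tool;
+//     tool.Run(argc, argv);
+//     return 0;
+//   }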
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h
new file mode 100644
index 000000000..8644fcf3f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/listener.h
@@ -0,0 +1,847 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using TablePropertiesCollection =
+ std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+class DB;
+class ColumnFamilyHandle;
+class Status;
+struct CompactionJobStats;
+
+struct FileCreationBriefInfo {
+ FileCreationBriefInfo() = default;
+ FileCreationBriefInfo(const std::string& _db_name,
+ const std::string& _cf_name,
+ const std::string& _file_path, int _job_id)
+ : db_name(_db_name),
+ cf_name(_cf_name),
+ file_path(_file_path),
+ job_id(_job_id) {}
+ // the name of the database where the file was created.
+ std::string db_name;
+ // the name of the column family where the file was created.
+ std::string cf_name;
+ // the path to the created file.
+ std::string file_path;
+ // the id of the job (which could be flush or compaction) that
+ // created the file.
+ int job_id = 0;
+};
+
+struct TableFileCreationBriefInfo : public FileCreationBriefInfo {
+ // reason of creating the table.
+ TableFileCreationReason reason;
+};
+
+struct TableFileCreationInfo : public TableFileCreationBriefInfo {
+ TableFileCreationInfo() = default;
+  explicit TableFileCreationInfo(TableProperties&& prop)
+      : table_properties(std::move(prop)) {}
+ // the size of the file.
+ uint64_t file_size;
+ // Detailed properties of the created file.
+ TableProperties table_properties;
+ // The status indicating whether the creation was successful or not.
+ Status status;
+ // The checksum of the table file being created
+ std::string file_checksum;
+ // The checksum function name of checksum generator used for this table file
+ std::string file_checksum_func_name;
+};
+
+struct BlobFileCreationBriefInfo : public FileCreationBriefInfo {
+ BlobFileCreationBriefInfo(const std::string& _db_name,
+ const std::string& _cf_name,
+ const std::string& _file_path, int _job_id,
+ BlobFileCreationReason _reason)
+ : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id),
+ reason(_reason) {}
+ // reason of creating the blob file.
+ BlobFileCreationReason reason;
+};
+
+struct BlobFileCreationInfo : public BlobFileCreationBriefInfo {
+ BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name,
+ const std::string& _file_path, int _job_id,
+ BlobFileCreationReason _reason,
+ uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+ Status _status, const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name)
+ : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id,
+ _reason),
+ total_blob_count(_total_blob_count),
+ total_blob_bytes(_total_blob_bytes),
+ status(_status),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name) {}
+
+  // the number of blobs in the file.
+  uint64_t total_blob_count;
+  // the total blob bytes in the file.
+  uint64_t total_blob_bytes;
+ // The status indicating whether the creation was successful or not.
+ Status status;
+ // The checksum of the blob file being created.
+ std::string file_checksum;
+ // The checksum function name of checksum generator used for this blob file.
+ std::string file_checksum_func_name;
+};
+
+enum class CompactionReason : int {
+ kUnknown = 0,
+ // [Level] number of L0 files > level0_file_num_compaction_trigger
+ kLevelL0FilesNum,
+ // [Level] total size of level > MaxBytesForLevel()
+ kLevelMaxLevelSize,
+ // [Universal] Compacting for size amplification
+ kUniversalSizeAmplification,
+ // [Universal] Compacting for size ratio
+ kUniversalSizeRatio,
+ // [Universal] number of sorted runs > level0_file_num_compaction_trigger
+ kUniversalSortedRunNum,
+ // [FIFO] total size > max_table_files_size
+ kFIFOMaxSize,
+ // [FIFO] reduce number of files.
+ kFIFOReduceNumFiles,
+ // [FIFO] files with creation time < (current_time - interval)
+ kFIFOTtl,
+ // Manual compaction
+ kManualCompaction,
+ // DB::SuggestCompactRange() marked files for compaction
+ kFilesMarkedForCompaction,
+ // [Level] Automatic compaction within bottommost level to cleanup duplicate
+ // versions of same user key, usually due to a released snapshot.
+ kBottommostFiles,
+ // Compaction based on TTL
+ kTtl,
+ // According to the comments in flush_job.cc, RocksDB treats flush as
+ // a level 0 compaction in internal stats.
+ kFlush,
+ // Compaction caused by external sst file ingestion
+ kExternalSstIngestion,
+ // Compaction due to SST file being too old
+ kPeriodicCompaction,
+ // Compaction in order to move files to temperature
+ kChangeTemperature,
+ // Compaction scheduled to force garbage collection of blob files
+ kForcedBlobGC,
+  // A special TTL compaction for the RoundRobin policy, which is basically
+  // the same as kLevelMaxLevelSize, but the goal is to compact TTLed files.
+ kRoundRobinTtl,
+ // total number of compaction reasons, new reasons must be added above this.
+ kNumOfReasons,
+};
+
+enum class FlushReason : int {
+ kOthers = 0x00,
+ kGetLiveFiles = 0x01,
+ kShutDown = 0x02,
+ kExternalFileIngestion = 0x03,
+ kManualCompaction = 0x04,
+ kWriteBufferManager = 0x05,
+ kWriteBufferFull = 0x06,
+ kTest = 0x07,
+ kDeleteFiles = 0x08,
+ kAutoCompaction = 0x09,
+ kManualFlush = 0x0a,
+ kErrorRecovery = 0xb,
+  // When the flush reason is set to kErrorRecoveryRetryFlush, SwitchMemtable
+ // will not be called to avoid many small immutable memtables.
+ kErrorRecoveryRetryFlush = 0xc,
+ kWalFull = 0xd,
+};
+
+// TODO: In the future, BackgroundErrorReason will only be used to indicate
+// why the BG Error is happening (e.g., flush, compaction). We may introduce
+// other data structure to indicate other essential information such as
+// the file type (e.g., Manifest, SST) and special context.
+enum class BackgroundErrorReason {
+ kFlush,
+ kCompaction,
+ kWriteCallback,
+ kMemTable,
+ kManifestWrite,
+ kFlushNoWAL,
+ kManifestWriteNoWAL,
+};
+
+enum class WriteStallCondition {
+ kNormal,
+ kDelayed,
+ kStopped,
+};
+
+struct WriteStallInfo {
+ // the name of the column family
+ std::string cf_name;
+ // state of the write controller
+ struct {
+ WriteStallCondition cur;
+ WriteStallCondition prev;
+ } condition;
+};
+
+#ifndef ROCKSDB_LITE
+
+struct FileDeletionInfo {
+ FileDeletionInfo() = default;
+
+ FileDeletionInfo(const std::string& _db_name, const std::string& _file_path,
+ int _job_id, Status _status)
+ : db_name(_db_name),
+ file_path(_file_path),
+ job_id(_job_id),
+ status(_status) {}
+ // The name of the database where the file was deleted.
+ std::string db_name;
+ // The path to the deleted file.
+ std::string file_path;
+ // The id of the job which deleted the file.
+ int job_id = 0;
+ // The status indicating whether the deletion was successful or not.
+ Status status;
+};
+
+struct TableFileDeletionInfo : public FileDeletionInfo {};
+
+struct BlobFileDeletionInfo : public FileDeletionInfo {
+ BlobFileDeletionInfo(const std::string& _db_name,
+ const std::string& _file_path, int _job_id,
+ Status _status)
+ : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {}
+};
+
+enum class FileOperationType {
+ kRead,
+ kWrite,
+ kTruncate,
+ kClose,
+ kFlush,
+ kSync,
+ kFsync,
+ kRangeSync,
+ kAppend,
+ kPositionedAppend,
+ kOpen
+};
+
+struct FileOperationInfo {
+ using Duration = std::chrono::nanoseconds;
+ using SteadyTimePoint =
+ std::chrono::time_point<std::chrono::steady_clock, Duration>;
+ using SystemTimePoint =
+ std::chrono::time_point<std::chrono::system_clock, Duration>;
+ using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
+ using FinishTimePoint = SteadyTimePoint;
+
+ FileOperationType type;
+ const std::string& path;
+  // RocksDB tries to provide file temperature information, but it is not
+  // guaranteed.
+ Temperature temperature;
+ uint64_t offset;
+ size_t length;
+ const Duration duration;
+ const SystemTimePoint& start_ts;
+ Status status;
+
+ FileOperationInfo(const FileOperationType _type, const std::string& _path,
+ const StartTimePoint& _start_ts,
+ const FinishTimePoint& _finish_ts, const Status& _status,
+ const Temperature _temperature = Temperature::kUnknown)
+ : type(_type),
+ path(_path),
+ temperature(_temperature),
+ duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
+ _finish_ts - _start_ts.second)),
+ start_ts(_start_ts.first),
+ status(_status) {}
+ static StartTimePoint StartNow() {
+ return std::make_pair<SystemTimePoint, SteadyTimePoint>(
+ std::chrono::system_clock::now(), std::chrono::steady_clock::now());
+ }
+ static FinishTimePoint FinishNow() {
+ return std::chrono::steady_clock::now();
+ }
+};
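+
+// Example (a sketch): timing a file operation for listener reporting. Here
+// "file", "data", "opts", and "file_name" are assumed to be in scope:
+//   auto start_ts = FileOperationInfo::StartNow();
+//   IOStatus s = file->Append(data, opts, nullptr);
+//   auto finish_ts = FileOperationInfo::FinishNow();
+//   FileOperationInfo info(FileOperationType::kAppend, file_name, start_ts,
+//                          finish_ts, s);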
+
+struct BlobFileInfo {
+ BlobFileInfo(const std::string& _blob_file_path,
+ const uint64_t _blob_file_number)
+ : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {}
+
+ std::string blob_file_path;
+ uint64_t blob_file_number;
+};
+
+struct BlobFileAdditionInfo : public BlobFileInfo {
+ BlobFileAdditionInfo(const std::string& _blob_file_path,
+ const uint64_t _blob_file_number,
+ const uint64_t _total_blob_count,
+ const uint64_t _total_blob_bytes)
+ : BlobFileInfo(_blob_file_path, _blob_file_number),
+ total_blob_count(_total_blob_count),
+ total_blob_bytes(_total_blob_bytes) {}
+ uint64_t total_blob_count;
+ uint64_t total_blob_bytes;
+};
+
+struct BlobFileGarbageInfo : public BlobFileInfo {
+ BlobFileGarbageInfo(const std::string& _blob_file_path,
+ const uint64_t _blob_file_number,
+ const uint64_t _garbage_blob_count,
+ const uint64_t _garbage_blob_bytes)
+ : BlobFileInfo(_blob_file_path, _blob_file_number),
+ garbage_blob_count(_garbage_blob_count),
+ garbage_blob_bytes(_garbage_blob_bytes) {}
+ uint64_t garbage_blob_count;
+ uint64_t garbage_blob_bytes;
+};
+
+struct FlushJobInfo {
+ // the id of the column family
+ uint32_t cf_id;
+ // the name of the column family
+ std::string cf_name;
+ // the path to the newly created file
+ std::string file_path;
+ // the file number of the newly created file
+ uint64_t file_number;
+ // the oldest blob file referenced by the newly created file
+ uint64_t oldest_blob_file_number;
+ // the id of the thread that completed this flush job.
+ uint64_t thread_id;
+ // the job id, which is unique in the same thread.
+ int job_id;
+  // If true, then rocksdb is currently slowing down all writes to prevent
+  // creating too many Level 0 files, as compaction seems unable to keep up
+  // with the write request speed. This indicates that there are
+  // too many files in Level 0.
+ bool triggered_writes_slowdown;
+ // If true, then rocksdb is currently blocking any writes to prevent
+ // creating more L0 files. This indicates that there are too many
+ // files in level 0. Compactions should try to compact L0 files down
+ // to lower levels as soon as possible.
+ bool triggered_writes_stop;
+ // The smallest sequence number in the newly created file
+ SequenceNumber smallest_seqno;
+ // The largest sequence number in the newly created file
+ SequenceNumber largest_seqno;
+ // Table properties of the table being flushed
+ TableProperties table_properties;
+
+ FlushReason flush_reason;
+
+ // Compression algorithm used for blob output files
+ CompressionType blob_compression_type;
+
+ // Information about blob files created during flush in Integrated BlobDB.
+ std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+};
+
+struct CompactionFileInfo {
+ // The level of the file.
+ int level;
+
+ // The file number of the file.
+ uint64_t file_number;
+
+ // The file number of the oldest blob file this SST file references.
+ uint64_t oldest_blob_file_number;
+};
+
+struct SubcompactionJobInfo {
+ ~SubcompactionJobInfo() { status.PermitUncheckedError(); }
+ // the id of the column family where the compaction happened.
+ uint32_t cf_id;
+ // the name of the column family where the compaction happened.
+ std::string cf_name;
+ // the status indicating whether the compaction was successful or not.
+ Status status;
+ // the id of the thread that completed this compaction job.
+ uint64_t thread_id;
+ // the job id, which is unique in the same thread.
+ int job_id;
+
+ // sub-compaction job id, which is only unique within the same compaction, so
+ // use both 'job_id' and 'subcompaction_job_id' to identify a subcompaction
+ // within an instance.
+  // For a non-subcompaction job, it is set to -1.
+ int subcompaction_job_id;
+ // the smallest input level of the compaction.
+ int base_input_level;
+ // the output level of the compaction.
+ int output_level;
+
+ // Reason to run the compaction
+ CompactionReason compaction_reason;
+
+ // Compression algorithm used for output files
+ CompressionType compression;
+
+ // Statistics and other additional details on the compaction
+ CompactionJobStats stats;
+
+ // Compression algorithm used for blob output files.
+ CompressionType blob_compression_type;
+};
+
+struct CompactionJobInfo {
+ ~CompactionJobInfo() { status.PermitUncheckedError(); }
+ // the id of the column family where the compaction happened.
+ uint32_t cf_id;
+ // the name of the column family where the compaction happened.
+ std::string cf_name;
+ // the status indicating whether the compaction was successful or not.
+ Status status;
+ // the id of the thread that completed this compaction job.
+ uint64_t thread_id;
+ // the job id, which is unique in the same thread.
+ int job_id;
+
+ // the smallest input level of the compaction.
+ int base_input_level;
+ // the output level of the compaction.
+ int output_level;
+
+ // The following variables contain information about compaction inputs
+ // and outputs. A file may appear in both the input and output lists
+ // if it was simply moved to a different level. The order of elements
+ // is the same across input_files and input_file_infos; similarly, it is
+ // the same across output_files and output_file_infos.
+
+ // The names of the compaction input files.
+ std::vector<std::string> input_files;
+
+ // Additional information about the compaction input files.
+ std::vector<CompactionFileInfo> input_file_infos;
+
+ // The names of the compaction output files.
+ std::vector<std::string> output_files;
+
+ // Additional information about the compaction output files.
+ std::vector<CompactionFileInfo> output_file_infos;
+
+ // Table properties for input and output tables.
+ // The map is keyed by values from input_files and output_files.
+ TablePropertiesCollection table_properties;
+
+ // Reason to run the compaction
+ CompactionReason compaction_reason;
+
+ // Compression algorithm used for output files
+ CompressionType compression;
+
+ // Statistics and other additional details on the compaction
+ CompactionJobStats stats;
+
+ // Compression algorithm used for blob output files.
+ CompressionType blob_compression_type;
+
+ // Information about blob files created during compaction in Integrated
+ // BlobDB.
+ std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+
+ // Information about blob files deleted during compaction in Integrated
+ // BlobDB.
+ std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+};
+
+struct MemTableInfo {
+  // the name of the column family to which the memtable belongs
+ std::string cf_name;
+ // Sequence number of the first element that was inserted
+ // into the memtable.
+ SequenceNumber first_seqno;
+ // Sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ SequenceNumber earliest_seqno;
+ // Total number of entries in memtable
+ uint64_t num_entries;
+ // Total number of deletes in memtable
+ uint64_t num_deletes;
+};
+
+struct ExternalFileIngestionInfo {
+ // the name of the column family
+ std::string cf_name;
+ // Path of the file outside the DB
+ std::string external_file_path;
+ // Path of the file inside the DB
+ std::string internal_file_path;
+ // The global sequence number assigned to keys in this file
+ SequenceNumber global_seqno;
+ // Table properties of the table being flushed
+ TableProperties table_properties;
+};
+
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+ // The original error that triggered the recovery
+ Status old_bg_error;
+
+ // The final bg_error after all recovery attempts. Status::OK() means
+ // the recovery was successful and the database is fully operational.
+ Status new_bg_error;
+};
+
+struct IOErrorInfo {
+ IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
+ const std::string& _file_path, size_t _length, uint64_t _offset)
+ : io_status(_io_status),
+ operation(_operation),
+ file_path(_file_path),
+ length(_length),
+ offset(_offset) {}
+
+ IOStatus io_status;
+ FileOperationType operation;
+ std::string file_path;
+ size_t length;
+ uint64_t offset;
+};
+
+// EventListener class contains a set of callback functions that will
+// be called when a specific RocksDB event happens, such as a flush. It can
+// be used as a building block for developing custom features such as
+// stats-collector or external compaction algorithm.
+//
+// IMPORTANT
+// Because compaction is needed to resolve a "writes stopped" condition,
+// calling or waiting for any blocking DB write function (no_slowdown=false)
+// from a compaction-related listener callback can hang RocksDB. For DB
+// writes from a callback we recommend a WriteBatch and no_slowdown=true,
+// because the WriteBatch can accumulate writes for later in case DB::Write
+// returns Status::Incomplete. Similarly, calling CompactRange or similar
+// could hang by waiting for a background worker that is occupied until the
+// callback returns.
+//
+// Otherwise, callback functions should not run for an extended period of
+// time before the function returns, because this will slow RocksDB.
+//
+// [Threading] All EventListener callbacks will be called on the actual
+// thread that is involved in that specific event. For example, it is
+// the RocksDB background flush thread that does the actual flush that
+// calls EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlock and performance issue when using EventListener callback
+// in a complex way.
+//
+// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into
+// RocksDB, because RocksDB is not exception-safe. This could cause undefined
+// behavior including data loss, unreported corruption, deadlocks, and more.
+class EventListener : public Customizable {
+ public:
+ static const char* Type() { return "EventListener"; }
+ static Status CreateFromString(const ConfigOptions& options,
+ const std::string& id,
+ std::shared_ptr<EventListener>* result);
+ const char* Name() const override {
+ // Since EventListeners did not have a name previously, we will assume
+ // an empty name. Instances should override this method.
+ return "";
+ }
+ // A callback function to RocksDB which will be called whenever a
+ // registered RocksDB flushes a file. The default implementation is
+ // no-op.
+ //
+  // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnFlushCompleted(DB* /*db*/,
+ const FlushJobInfo& /*flush_job_info*/) {}
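+  //
+  // Example (a sketch of a minimal listener; hypothetical, not part of this
+  // header):
+  //   class FlushLogger : public EventListener {
+  //    public:
+  //     const char* Name() const override { return "FlushLogger"; }
+  //     void OnFlushCompleted(DB* /*db*/,
+  //                           const FlushJobInfo& info) override {
+  //       // e.g. record info.cf_name and info.file_path somewhere
+  //     }
+  //   };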
+
+ // A callback function to RocksDB which will be called before a
+ // RocksDB starts to flush memtables. The default implementation is
+ // no-op.
+ //
+  // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnFlushBegin(DB* /*db*/,
+ const FlushJobInfo& /*flush_job_info*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a SST file is deleted. Different from OnCompactionCompleted and
+ // OnFlushCompleted, this callback is designed for external logging
+  // services and thus only provides string parameters instead
+  // of a pointer to DB. Applications that build logic based
+  // on file creations and deletions are suggested to implement
+ // OnFlushCompleted and OnCompactionCompleted.
+ //
+ // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+ virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
+
+ // A callback function to RocksDB which will be called before a
+ // RocksDB starts to compact. The default implementation is
+ // no-op.
+ //
+  // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a registered RocksDB compacts a file. The default implementation
+ // is a no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ //
+ // @param db a pointer to the rocksdb instance which just compacted
+ // a file.
+ // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+ // after this function is returned, and must be copied if it is needed
+ // outside of this function.
+ virtual void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& /*ci*/) {}
+
+ // A callback function to RocksDB which will be called before a sub-compaction
+  // begins. If a compaction is split into 2 sub-compactions, it will trigger one
+ // `OnCompactionBegin()` first, then two `OnSubcompactionBegin()`.
+ // If compaction is not split, it will still trigger one
+ // `OnSubcompactionBegin()`, as internally, compaction is always handled by
+ // sub-compaction. The default implementation is a no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ //
+ // @param si a reference to a SubcompactionJobInfo struct. It contains a
+ // `sub_job_id` which is only unique within the enclosing compaction (which
+ // can be identified by `job_id`). 'si' is released after this function
+ // returns, and must be copied if it is needed outside this function.
+ // Note: `table_properties` is not set for sub-compactions; that information
+ // can be obtained from `OnCompactionBegin()`.
+ virtual void OnSubcompactionBegin(const SubcompactionJobInfo& /*si*/) {}
+
+ // A callback function to RocksDB which will be called whenever a
+ // sub-compaction completes. As with `OnSubcompactionBegin()`, if a
+ // compaction is split into 2 sub-compactions, it will be triggered twice.
+ // If a compaction is not split, it will still be triggered once.
+ // The default implementation is a no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ //
+ // @param si a reference to a SubcompactionJobInfo struct. It contains a
+ // `sub_job_id` which is only unique within the enclosing compaction (which
+ // can be identified by `job_id`). 'si' is released after this function
+ // returns, and must be copied if it is needed outside this function.
+ // Note: `table_properties` is not set for sub-compactions; that information
+ // can be obtained from `OnCompactionCompleted()`.
+ virtual void OnSubcompactionCompleted(const SubcompactionJobInfo& /*si*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // an SST file is created. Unlike OnCompactionCompleted and
+ // OnFlushCompleted, this callback is designed for an external logging
+ // service and thus only provides string parameters instead
+ // of a pointer to the DB. Applications that build logic based
+ // on file creations and deletions are advised to implement
+ // OnFlushCompleted and OnCompactionCompleted instead.
+ //
+ // Historically it was only called if the file was successfully created;
+ // now it is also called on failure. Users can check info.status
+ // to see whether the creation succeeded.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // an SST file is created. It will be followed by OnTableFileCreated after
+ // the creation finishes.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // a memtable is made immutable.
+ //
+ // Note that this function must not run for an extended period of time
+ // before returning. Otherwise, RocksDB may be blocked.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // a column family handle is deleted.
+ //
+ // Note that this function must not run for an extended period of time
+ // before returning. Otherwise, RocksDB may be blocked.
+ // @param handle is a pointer to the column family handle to be deleted
+ // which will become a dangling pointer after the deletion.
+ virtual void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* /*handle*/) {}
+
+ // A callback function for RocksDB which will be called after an external
+ // file is ingested using IngestExternalFile.
+ //
+ // Note that this function will run on the same thread as
+ // IngestExternalFile(); if this function blocks, IngestExternalFile()
+ // will be blocked from finishing.
+ virtual void OnExternalFileIngested(
+ DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before setting the
+ // background error status to a non-OK value. The new background error status
+ // is provided in `bg_error` and can be modified by the callback. E.g., a
+ // callback can suppress errors by resetting it to Status::OK(), thus
+ // preventing the database from entering read-only mode. We do not provide any
+ // guarantee when failed flushes/compactions will be rescheduled if the user
+ // suppresses an error.
+ //
+ // Note that this function can run on the same threads as flush, compaction,
+ // and user writes. So, it is extremely important not to perform heavy
+ // computations or blocking calls in this function.
+ virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
+ Status* /* bg_error */) {}
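+
+ // For illustration, an error-suppression sketch (whether suppressing a
+ // given error is safe is application-specific; this is a sketch, not a
+ // recommendation):
+ //
+ //   void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ //                          Status* bg_error) override {
+ //     if (bg_error->IsNoSpace()) {
+ //       // Keep the DB writable; rely on external cleanup and monitoring.
+ //       *bg_error = Status::OK();
+ //     }
+ //   }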
+
+ // A callback function for RocksDB which will be called whenever a change
+ // of superversion triggers a change of the stall conditions.
+ //
+ // Note that this function must not run for an extended period of time
+ // before returning. Otherwise, RocksDB may be blocked.
+ virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called whenever a file read
+ // operation finishes.
+ virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file write
+ // operation finishes.
+ virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file flush
+ // operation finishes.
+ virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file sync
+ // operation finishes.
+ virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file
+ // rangeSync operation finishes.
+ virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file
+ // truncate operation finishes.
+ virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file close
+ // operation finishes.
+ virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
+
+ // If true, the OnFile*Finish callbacks will be invoked; if false, they
+ // will not be called.
+ virtual bool ShouldBeNotifiedOnFileIO() { return false; }
+
+ // A callback function for RocksDB which will be called just before
+ // starting the automatic recovery process for recoverable background
+ // errors, such as NoSpace(). The callback can suppress the automatic
+ // recovery by setting *auto_recovery to false. The database will then
+ // have to be transitioned out of read-only mode by calling DB::Resume().
+ virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+ Status /* bg_error */,
+ bool* /* auto_recovery */) {}
+
+ // DEPRECATED
+ // A callback function for RocksDB which will be called once the database
+ // is recovered from read-only mode after an error. When this is called, it
+ // means normal writes to the database can be issued and the user can
+ // initiate any further recovery actions needed.
+ virtual void OnErrorRecoveryCompleted(Status old_bg_error) {
+ old_bg_error.PermitUncheckedError();
+ }
+
+ // A callback function for RocksDB which will be called once the recovery
+ // attempt from a background retryable error is completed. The recovery
+ // may have been successful or not. In either case, the callback is called
+ // with the old and new error. If info.new_bg_error is Status::OK(), that
+ // means the recovery succeeded.
+ virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+ }
+
+ // A callback function for RocksDB which will be called before
+ // a blob file is created. It will be followed by OnBlobFileCreated after
+ // the creation finishes.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnBlobFileCreationStarted(
+ const BlobFileCreationBriefInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a blob file is created.
+ // It will be called whether the file is successfully created or not. Users
+ // can check info.status to see whether it succeeded.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a blob file is deleted.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make copies of the
+ // passed values.
+ virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called whenever an IO error
+ // happens. ShouldBeNotifiedOnFileIO should return true for this callback
+ // to be invoked.
+ virtual void OnIOError(const IOErrorInfo& /*info*/) {}
+
+ ~EventListener() override {}
+};
+
+#else
+
+class EventListener {};
+struct FlushJobInfo {};
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h
new file mode 100644
index 000000000..5cb799e42
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memory_allocator.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator is an interface that a client can implement to supply custom
+// memory allocation and deallocation methods. See rocksdb/cache.h for more
+// information.
+// All methods should be thread-safe.
+class MemoryAllocator : public Customizable {
+ public:
+ static const char* Type() { return "MemoryAllocator"; }
+ static Status CreateFromString(const ConfigOptions& options,
+ const std::string& value,
+ std::shared_ptr<MemoryAllocator>* result);
+
+ // Allocate a block of at least size bytes. Has to be thread-safe.
+ virtual void* Allocate(size_t size) = 0;
+
+ // Deallocate a previously allocated block. Has to be thread-safe.
+ virtual void Deallocate(void* p) = 0;
+
+ // Returns the memory size of the block allocated at p. The default
+ // implementation that just returns the original allocation_size is fine.
+ virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const {
+ // default implementation just returns the allocation size
+ return allocation_size;
+ }
+
+ std::string GetId() const override { return GenerateIndividualId(); }
+};
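+
+// For illustration, a minimal custom allocator sketch (`MallocAllocator` is a
+// hypothetical name; a production implementation might report a tighter
+// UsableSize via its allocator's introspection APIs):
+//
+//   #include <cstdlib>
+//   class MallocAllocator : public MemoryAllocator {
+//    public:
+//     const char* Name() const override { return "MallocAllocator"; }
+//     void* Allocate(size_t size) override { return std::malloc(size); }
+//     void Deallocate(void* p) override { std::free(p); }
+//   };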
+
+struct JemallocAllocatorOptions {
+ static const char* kName() { return "JemallocAllocatorOptions"; }
+ // The jemalloc tcache caches allocations by size class. For each size
+ // class, it caches between 20 (for large size classes) and 200 (for small
+ // size classes) allocations. To reduce tcache memory usage when the
+ // allocator is accessed by a large number of threads, we can control
+ // whether to cache an allocation based on its size.
+ bool limit_tcache_size = false;
+
+ // Lower bound of allocation size to use tcache, if limit_tcache_size=true.
+ // When used with block cache, it is recommended to set it to block_size/4.
+ size_t tcache_size_lower_bound = 1024;
+
+ // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
+ // When used with block cache, it is recommended to set it to block_size.
+ size_t tcache_size_upper_bound = 16 * 1024;
+};
+
+// Creates a memory allocator that allocates through jemalloc and uses
+// madvise with MADV_DONTDUMP to exclude allocations from core dumps.
+// Applications can use this allocator with the block cache to exclude block
+// cache usage from core dumps.
+//
+// Implementation details:
+// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
+// of its allocations go through that arena. The allocator hooks the arena's
+// memory allocation and calls madvise() with the MADV_DONTDUMP flag to
+// exclude the memory from core dumps. A side benefit of using a single arena
+// is reduced jemalloc metadata for some workloads.
+//
+// To mitigate mutex contention from using one single arena, the jemalloc
+// tcache (thread-local cache) is enabled to cache unused allocations for
+// future use. The tcache normally incurs about 0.5MB of extra memory usage
+// per thread. The usage can be reduced by limiting the allocation sizes that
+// are cached.
+extern Status NewJemallocNodumpAllocator(
+ JemallocAllocatorOptions& options,
+ std::shared_ptr<MemoryAllocator>* memory_allocator);
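+
+// For illustration, a usage sketch with the block cache (assumes RocksDB was
+// built with jemalloc support; otherwise NewJemallocNodumpAllocator returns a
+// non-OK status; see rocksdb/cache.h for LRUCacheOptions):
+//
+//   JemallocAllocatorOptions jopts;
+//   std::shared_ptr<MemoryAllocator> allocator;
+//   if (NewJemallocNodumpAllocator(jopts, &allocator).ok()) {
+//     LRUCacheOptions cache_opts;
+//     cache_opts.capacity = 1 << 30;  // 1 GiB block cache
+//     cache_opts.memory_allocator = allocator;
+//     std::shared_ptr<Cache> cache = NewLRUCache(cache_opts);
+//     // table_options.block_cache = cache;
+//   }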
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
new file mode 100644
index 000000000..cb5444dca
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -0,0 +1,423 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+// (1) It does not store duplicate items.
+// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+// equality.
+// (3) It can be accessed concurrently by multiple readers and can support
+// writes during reads. However, it needn't support multiple concurrent
+// writes.
+// (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed a MemTableAllocator object when a new
+// MemTableRep is requested.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+// - SkipListRep: This is the default; it is backed by a skip list.
+// - HashSkipListRep: The memtable rep that is best used for keys that are
+// structured like "prefix:suffix" where iteration within a prefix is
+// common and iteration across different prefixes is rare. It is backed by
+// a hash map where each bucket is a skip list.
+// - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare, since doing so requires all
+// the keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <stdexcept>
+#include <unordered_set>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Allocator;
+class LookupKey;
+class SliceTransform;
+class Logger;
+struct DBOptions;
+
+using KeyHandle = void*;
+
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+class MemTableRep {
+ public:
+ // KeyComparator provides a means to compare keys, which are internal keys
+ // concatenated with values.
+ class KeyComparator {
+ public:
+ using DecodedType = ROCKSDB_NAMESPACE::Slice;
+
+ virtual DecodedType decode_key(const char* key) const {
+ // The format of key is frozen and can be treated as a part of the API
+ // contract. Refer to MemTable::Add for details.
+ return GetLengthPrefixedSlice(key);
+ }
+
+ // Compare a and b. Return a negative value if a is less than b, 0 if they
+ // are equal, and a positive value if a is greater than b.
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const = 0;
+
+ virtual int operator()(const char* prefix_len_key,
+ const Slice& key) const = 0;
+
+ virtual ~KeyComparator() {}
+ };
+
+ explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
+
+ // Allocate a buffer of size len for storing a key. The idea is that a
+ // specific memtable representation knows its underlying data structure
+ // better. By allowing it to allocate memory, it can possibly place
+ // correlated data in consecutive memory areas to make processor
+ // prefetching more efficient.
+ virtual KeyHandle Allocate(const size_t len, char** buf);
+
+ // Insert key into the collection. (The caller will pack key and value into a
+ // single buffer and pass that in as the parameter to Insert).
+ // REQUIRES: nothing that compares equal to key is currently in the
+ // collection, and no concurrent modifications to the table are in progress
+ virtual void Insert(KeyHandle handle) = 0;
+
+ // Same as ::Insert
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKey(KeyHandle handle) {
+ Insert(handle);
+ return true;
+ }
+
+ // Same as Insert(), but additionally passes a hint to the insert location
+ // for the key. If hint points to nullptr, a new hint will be populated;
+ // otherwise the hint will be updated to reflect the last insert location.
+ //
+ // Currently only the skip-list based memtable implements this interface.
+ // Other implementations fall back to Insert() by default.
+ virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
+ // Ignore the hint by default.
+ Insert(handle);
+ }
+
+ // Same as ::InsertWithHint
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
+ InsertWithHint(handle, hint);
+ return true;
+ }
+
+ // Same as ::InsertWithHint, but allow concurrent write
+ //
+ // If hint points to nullptr, a new hint will be allocated on heap, otherwise
+ // the hint will be updated to reflect the last insert location. The hint is
+ // owned by the caller and it is the caller's responsibility to delete the
+ // hint later.
+ //
+ // Currently only the skip-list based memtable implements this interface.
+ // Other implementations fall back to InsertConcurrently() by default.
+ virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
+ // Ignore the hint by default.
+ InsertConcurrently(handle);
+ }
+
+ // Same as ::InsertWithHintConcurrently
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
+ InsertWithHintConcurrently(handle, hint);
+ return true;
+ }
+
+ // Like Insert(handle), but may be called concurrently with other calls
+ // to InsertConcurrently for other handles.
+ virtual void InsertConcurrently(KeyHandle handle);
+
+ // Same as ::InsertConcurrently
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyConcurrently(KeyHandle handle) {
+ InsertConcurrently(handle);
+ return true;
+ }
+
+ // Returns true iff an entry that compares equal to key is in the collection.
+ virtual bool Contains(const char* key) const = 0;
+
+ // Notify this table rep that it will no longer be added to. By default,
+ // does nothing. After MarkReadOnly() is called, this table rep will
+ // not be written to (i.e., no more calls to Allocate() or Insert(), nor
+ // any writes done directly to entries accessed through the iterator).
+ virtual void MarkReadOnly() {}
+
+ // Notify this table rep that it has been flushed to stable storage.
+ // By default, does nothing.
+ //
+ // Invariant: MarkReadOnly() is called before MarkFlushed().
+ // Note that this method, if overridden, should not run for an extended
+ // period of time. Otherwise, RocksDB may be blocked.
+ virtual void MarkFlushed() {}
+
+ // Look up the key in the mem table. Starting from the first entry in the
+ // mem table whose user_key matches that of the given k, call
+ // callback_func() with callback_args directly forwarded as the first
+ // parameter and the mem table entry as the second parameter. If the
+ // callback returns false, terminate; otherwise continue to the next entry.
+ //
+ // It is safe for Get() to terminate after it has visited all the potential
+ // entries for k.user_key(), but it need not.
+ //
+ // Default:
+ // The base implementation dynamically constructs an iterator, seeks, and
+ // invokes the callback.
+ virtual void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry));
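+
+ // For illustration, a simplified sketch of the default behavior (this is a
+ // paraphrase, not the library's exact implementation):
+ //
+ //   std::unique_ptr<Iterator> iter(GetDynamicPrefixIterator());
+ //   for (iter->Seek(k.internal_key(), k.memtable_key());
+ //        iter->Valid() && callback_func(callback_args, iter->key());
+ //        iter->Next()) {
+ //   }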
+
+ virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
+ const Slice& /*end_key*/) {
+ return 0;
+ }
+
+ // Fills 'entries' with unique random memtable entries of approximate
+ // size 'target_sample_size' (this size is not strictly enforced).
+ virtual void UniqueRandomSample(const uint64_t num_entries,
+ const uint64_t target_sample_size,
+ std::unordered_set<const char*>* entries) {
+ (void)num_entries;
+ (void)target_sample_size;
+ (void)entries;
+ assert(false);
+ }
+
+ // Report an approximation of how much memory has been used other than memory
+ // that was allocated through the allocator. Safe to call from any thread.
+ virtual size_t ApproximateMemoryUsage() = 0;
+
+ virtual ~MemTableRep() {}
+
+ // Iteration over the contents of a MemTableRep collection.
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified collection.
+ // The returned iterator is not valid.
+ // explicit Iterator(const MemTableRep* collection);
+ virtual ~Iterator() {}
+
+ // Returns true iff the iterator is positioned at a valid node.
+ virtual bool Valid() const = 0;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ virtual const char* key() const = 0;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Advances to the previous position.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Advance to the first entry with a key >= target
+ virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+ // Retreat to the first entry with a key <= target
+ virtual void SeekForPrev(const Slice& internal_key,
+ const char* memtable_key) = 0;
+
+ virtual void RandomSeek() {}
+
+ // Position at the first entry in collection.
+ // Final state of iterator is Valid() iff collection is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last entry in collection.
+ // Final state of iterator is Valid() iff collection is not empty.
+ virtual void SeekToLast() = 0;
+ };
+
+ // Return an iterator over the keys in this representation.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // When destroying the iterator, the caller will not call "delete"
+ // but Iterator::~Iterator() directly. The destructor needs to destroy
+ // all the states but those allocated in arena.
+ virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+ // Return an iterator that has a special Seek semantics. The result of
+ // a Seek might only include keys with the same prefix as the target key.
+ // arena: If not null, the arena is used to allocate the Iterator.
+ // When destroying the iterator, the caller will not call "delete"
+ // but Iterator::~Iterator() directly. The destructor needs to destroy
+ // all the states but those allocated in arena.
+ virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+ return GetIterator(arena);
+ }
+
+ // Return true if the current MemTableRep supports merge operator.
+ // Default: true
+ virtual bool IsMergeOperatorSupported() const { return true; }
+
+ // Return true if the current MemTableRep supports snapshot
+ // Default: true
+ virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+ // When *key is an internal key concatenated with the value, returns the
+ // user key.
+ virtual Slice UserKey(const char* key) const;
+
+ Allocator* allocator_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects.
+class MemTableRepFactory : public Customizable {
+ public:
+ ~MemTableRepFactory() override {}
+
+ static const char* Type() { return "MemTableRepFactory"; }
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::unique_ptr<MemTableRepFactory>* factory);
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<MemTableRepFactory>* factory);
+
+ virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+ Allocator*, const SliceTransform*,
+ Logger* logger) = 0;
+ virtual MemTableRep* CreateMemTableRep(
+ const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
+ const SliceTransform* slice_transform, Logger* logger,
+ uint32_t /* column_family_id */) {
+ return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
+ }
+
+ const char* Name() const override = 0;
+
+ // Return true if the current MemTableRep supports concurrent inserts
+ // Default: false
+ virtual bool IsInsertConcurrentlySupported() const { return false; }
+
+ // Return true if the current MemTableRep supports detecting duplicate
+ // <key,seq> entries at insertion time. If true, MemTableRep::Insert* returns
+ // false if the <key,seq> already exists.
+ // Default: false
+ virtual bool CanHandleDuplicatedKey() const { return false; }
+};
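+
+// For illustration, factories can also be built from config strings via
+// CreateFromString (a sketch; "skip_list" is the nickname registered by
+// SkipListFactory below):
+//
+//   ConfigOptions config_options;
+//   std::shared_ptr<MemTableRepFactory> factory;
+//   Status s = MemTableRepFactory::CreateFromString(config_options,
+//                                                   "skip_list", &factory);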
+
+// This uses a skip list to store keys. It is the default.
+//
+// Parameters:
+// lookahead: If non-zero, each iterator's seek operation will start the
+// search from the previously visited record (doing at most 'lookahead'
+// steps). This is an optimization for access patterns that include many
+// seeks with consecutive keys.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+ explicit SkipListFactory(size_t lookahead = 0);
+
+ // Methods for Configurable/Customizable class overrides
+ static const char* kClassName() { return "SkipListFactory"; }
+ static const char* kNickName() { return "skip_list"; }
+ virtual const char* Name() const override { return kClassName(); }
+ virtual const char* NickName() const override { return kNickName(); }
+ std::string GetId() const override;
+
+ // Methods for MemTableRepFactory class overrides
+ using MemTableRepFactory::CreateMemTableRep;
+ virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+ Allocator*, const SliceTransform*,
+ Logger* logger) override;
+
+ bool IsInsertConcurrentlySupported() const override { return true; }
+
+ bool CanHandleDuplicatedKey() const override { return true; }
+
+ private:
+ size_t lookahead_;
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+// count: Passed to the constructor of the underlying std::vector of each
+// VectorRep. On initialization, the underlying vector will reserve space
+// for at least count entries.
+class VectorRepFactory : public MemTableRepFactory {
+ size_t count_;
+
+ public:
+ explicit VectorRepFactory(size_t count = 0);
+
+ // Methods for Configurable/Customizable class overrides
+ static const char* kClassName() { return "VectorRepFactory"; }
+ static const char* kNickName() { return "vector"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+
+ // Methods for MemTableRepFactory class overrides
+ using MemTableRepFactory::CreateMemTableRep;
+ virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+ Allocator*, const SliceTransform*,
+ Logger* logger) override;
+};
+
+// This factory creates memtables with a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+// link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+ size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+ int32_t skiplist_branching_factor = 4);
+
+// This factory creates memtables backed by a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or, if the number of entries inside the bucket exceeds
+// threshold_use_skiplist, a skip list.
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+// Otherwise from huge page TLB. The user needs to reserve
+// huge pages for it to be allocated, like:
+// sysctl -w vm.nr_hugepages=20
+// See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if number of entries in one bucket
+// exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log distribution of number of
+// entries when flushing.
+// @threshold_use_skiplist: a bucket switches to a skip list if the number of
+// entries exceeds this parameter.
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+ size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+ int bucket_entries_logging_threshold = 4096,
+ bool if_log_bucket_dist_when_flash = true,
+ uint32_t threshold_use_skiplist = 256);
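+
+// For illustration, a configuration sketch (hash-based memtables require a
+// prefix extractor; the 4-byte fixed prefix here is an arbitrary example):
+//
+//   Options options;
+//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+//   options.memtable_factory.reset(NewHashSkipListRepFactory());
+//   // Alternatively, for write-heavy loads with rare iteration:
+//   // options.memtable_factory.reset(new VectorRepFactory());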
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h
new file mode 100644
index 000000000..ae795220b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/merge_operator.h
@@ -0,0 +1,265 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, a data structure edit, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+// a) AssociativeMergeOperator - for most simple semantics (always take
+// two values, and merge them into one value, which is then put back
+// into rocksdb); numeric addition and string concatenation are examples;
+//
+// b) MergeOperator - the generic class for all the more abstract / complex
+// operations; one method (FullMergeV2) to merge a Put/Delete value with a
+// merge operand; and another method (PartialMerge) that merges multiple
+// operands together. This is especially useful if your key values have
+// complex structures but you would still like to support client-specific
+// incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class MergeOperator : public Customizable {
+ public:
+ virtual ~MergeOperator() {}
+ static const char* Type() { return "MergeOperator"; }
+ static Status CreateFromString(const ConfigOptions& opts,
+ const std::string& id,
+ std::shared_ptr<MergeOperator>* result);
+
+ // Gives the client a way to express the read -> modify -> write semantics
+ // key: (IN) The key that's associated with this merge operation.
+ // Client could multiplex the merge operator based on it
+ // if the key space is partitioned and different subspaces
+ // refer to different types of data which have different
+ // merge operation semantics
+ // existing: (IN) null indicates that the key does not exist before this op
+ // operand_list:(IN) the sequence of merge operations to apply, front() first.
+ // new_value:(OUT) Client is responsible for filling the merge result here.
+ // The string that new_value is pointing to will be empty.
+ // logger: (IN) Client could use this to log errors during merge.
+ //
+ // Return true on success.
+ // All values passed in will be client-specific values. So if this method
+ // returns false, it is because client specified bad data or there was
+ // internal corruption. This will be treated as an error by the library.
+ //
+ // Also make use of the *logger for error messages.
+ virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+ const std::deque<std::string>& /*operand_list*/,
+ std::string* /*new_value*/, Logger* /*logger*/) const {
+ // deprecated, please use FullMergeV2()
+ assert(false);
+ return false;
+ }
+
+ struct MergeOperationInput {
+ // If user-defined timestamp is enabled, `_key` includes timestamp.
+ explicit MergeOperationInput(const Slice& _key,
+ const Slice* _existing_value,
+ const std::vector<Slice>& _operand_list,
+ Logger* _logger)
+ : key(_key),
+ existing_value(_existing_value),
+ operand_list(_operand_list),
+ logger(_logger) {}
+
+ // The key associated with the merge operation.
+ const Slice& key;
+ // The existing value of the current key, nullptr means that the
+ // value doesn't exist.
+ const Slice* existing_value;
+ // A list of operands to apply.
+ const std::vector<Slice>& operand_list;
+ // Logger could be used by client to log any errors that happen during
+ // the merge operation.
+ Logger* logger;
+ };
+
+ struct MergeOperationOutput {
+ explicit MergeOperationOutput(std::string& _new_value,
+ Slice& _existing_operand)
+ : new_value(_new_value), existing_operand(_existing_operand) {}
+
+ // Client is responsible for filling the merge result here.
+ std::string& new_value;
+ // If the merge result is one of the existing operands (or existing_value),
+ // client can set this field to the operand (or existing_value) instead of
+ // using new_value.
+ Slice& existing_operand;
+ };
+
+ // This function applies a stack of merge operands in chronological order
+ // on top of an existing value. There are two ways in which this method is
+ // being used:
+ // a) During a Get() operation, it is used to calculate the final value of
+ // a key
+ // b) During compaction, in order to collapse some operands with the base
+ // value.
+ //
+ // Note: The name of the method is somewhat misleading, as in both the Get()
+ // and compaction cases it may be called on a subset of operands:
+ // K: 0 +1 +2 +7 +4 +5 2 +1 +2
+ // ^
+ // |
+ // snapshot
+ // In the example above, Get(K) operation will call FullMerge with a base
+ // value of 2 and operands [+1, +2]. Compaction process might decide to
+ // collapse the beginning of the history up to the snapshot by performing
+ // full Merge with base value of 0 and operands [+1, +2, +7, +4].
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const;
+
+ // This function performs merge(left_op, right_op)
+ // when both the operands are themselves merge operation types
+ // that you would have passed to a DB::Merge() call in the same order
+ // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+ //
+ // PartialMerge should combine them into a single merge operation that is
+ // saved into *new_value, and then it should return true.
+ // *new_value should be constructed such that a call to
+ // DB::Merge(key, *new_value) would yield the same result as a call
+ // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+ //
+ // The string that new_value is pointing to will be empty.
+ //
+ // The default implementation of PartialMergeMulti will use this function
+ // as a helper, for backward compatibility. Any subclass of
+ // MergeOperator should either implement PartialMerge or PartialMergeMulti;
+ // implementing PartialMergeMulti is suggested, as it is in general more
+ // efficient to merge multiple operands at a time than two operands at a
+ // time.
+ //
+ // If it is impossible or infeasible to combine the two operations,
+ // leave new_value unchanged and return false. The library will
+ // internally keep track of the operations, and apply them in the
+ // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+ //
+ // TODO: Presently there is no way to differentiate between error/corruption
+ // and simply "return false". For now, the client should simply return
+ // false in any case it cannot perform partial-merge, regardless of reason.
+ // If there is corruption in the data, handle it in the FullMergeV2() function
+ // and return false there. The default implementation of PartialMerge will
+ // always return false.
+ virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+ const Slice& /*right_operand*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const {
+ return false;
+ }
+
+ // This function performs merge when all the operands are themselves merge
+ // operation types that you would have passed to a DB::Merge() call in the
+ // same order (front() first)
+ // (i.e. DB::Merge(key, operand_list[0]), followed by
+ // DB::Merge(key, operand_list[1]), ...)
+ //
+ // PartialMergeMulti should combine them into a single merge operation that is
+ // saved into *new_value, and then it should return true. *new_value should
+ // be constructed such that a call to DB::Merge(key, *new_value) would yield
+ // the same result as sequential individual calls to DB::Merge(key, operand)
+ // for each operand in operand_list from front() to back().
+ //
+ // The string that new_value is pointing to will be empty.
+ //
+ // The PartialMergeMulti function will be called when there are at least two
+ // operands.
+ //
+ // In the default implementation, PartialMergeMulti will invoke PartialMerge
+ // multiple times, where each time it only merges two operands. Developers
+ // should either implement PartialMergeMulti, or implement PartialMerge which
+ // is served as the helper function of the default PartialMergeMulti.
+ virtual bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const;
+
+ // The name of the MergeOperator. Used to check for MergeOperator
+ // mismatches (i.e., a DB created with one MergeOperator is
+ // accessed using a different MergeOperator)
+ // TODO: the name is currently not stored persistently and thus
+ // no checking is enforced. Client is responsible for providing
+ // consistent MergeOperator between DB opens.
+ virtual const char* Name() const override = 0;
+
+ // Determines whether the PartialMerge can be called with just a single
+ // merge operand.
+ // Override and return true for allowing a single operand. PartialMerge
+ // and PartialMergeMulti should be overridden and implemented
+ // correctly to properly handle a single operand.
+ virtual bool AllowSingleOperand() const { return false; }
+
+ // Allows controlling when to invoke a full merge during Get.
+ // This can be used to limit the number of merge operands that are looked at
+ // during a point lookup, thereby helping to limit the number of levels to
+ // read from.
+ // It does not help with iterators.
+ //
+ // Note: the merge operands are passed to this function in the reversed order
+ // relative to how they were merged (passed to FullMerge or FullMergeV2)
+ // for performance reasons, see also:
+ // https://github.com/facebook/rocksdb/issues/3865
+ virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const {
+ return false;
+ }
+};
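+
+// For illustration, a FullMergeV2 sketch with append semantics
+// (`AppendOperator` is a hypothetical name; RocksDB ships a comparable
+// StringAppendOperator among its utilities):
+//
+//   class AppendOperator : public MergeOperator {
+//    public:
+//     const char* Name() const override { return "AppendOperator"; }
+//     bool FullMergeV2(const MergeOperationInput& in,
+//                      MergeOperationOutput* out) const override {
+//       if (in.existing_value != nullptr) {
+//         out->new_value.assign(in.existing_value->data(),
+//                               in.existing_value->size());
+//       }
+//       for (const Slice& op : in.operand_list) {
+//         out->new_value.append(op.data(), op.size());
+//       }
+//       return true;
+//     }
+//   };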
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+ ~AssociativeMergeOperator() override {}
+
+ // Gives the client a way to express the read -> modify -> write semantics
+ // key: (IN) The key that's associated with this merge operation.
+ // existing_value:(IN) null indicates the key does not exist before this op
+ // value: (IN) the value to update/merge the existing_value with
+ // new_value: (OUT) Client is responsible for filling the merge result
+ // here. The string that new_value is pointing to will be empty.
+ // logger: (IN) Client could use this to log errors during merge.
+ //
+ // Return true on success.
+ // All values passed in will be client-specific values. So if this method
+ // returns false, it is because client specified bad data or there was
+ // internal corruption. The client should assume that this will be treated
+ // as an error by the library.
+ virtual bool Merge(const Slice& key, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* logger) const = 0;
+
+ private:
+ // Default implementations of the MergeOperator functions
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ bool PartialMerge(const Slice& key, const Slice& left_operand,
+ const Slice& right_operand, std::string* new_value,
+ Logger* logger) const override;
+};
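+
+// For illustration, an associative counter sketch (`Uint64AddOperator` echoes
+// the uint64add merge operator RocksDB ships among its utilities; this
+// simplified version assumes 8-byte values in native byte order):
+//
+//   class Uint64AddOperator : public AssociativeMergeOperator {
+//    public:
+//     const char* Name() const override { return "Uint64AddOperator"; }
+//     bool Merge(const Slice& /*key*/, const Slice* existing_value,
+//                const Slice& value, std::string* new_value,
+//                Logger* /*logger*/) const override {
+//       uint64_t base = 0;
+//       if (existing_value != nullptr) {
+//         if (existing_value->size() != sizeof(base)) return false;
+//         memcpy(&base, existing_value->data(), sizeof(base));
+//       }
+//       uint64_t delta = 0;
+//       if (value.size() != sizeof(delta)) return false;
+//       memcpy(&delta, value.data(), sizeof(delta));
+//       base += delta;
+//       new_value->assign(reinterpret_cast<const char*>(&base),
+//                         sizeof(base));
+//       return true;
+//     }
+//   };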
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h
new file mode 100644
index 000000000..0cdffcd5f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/metadata.h
@@ -0,0 +1,245 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Basic identifiers and metadata for a file in a DB. This only includes
+// information considered relevant for taking backups, checkpoints, or other
+// services relating to DB file storage.
+// This is only appropriate for immutable files, such as SST files or all
+// files in a backup. See also LiveFileStorageInfo.
+struct FileStorageInfo {
+ // The name of the file within its directory (e.g. "123456.sst")
+ std::string relative_filename;
+ // The directory containing the file, without a trailing '/'. This could be
+ // a DB path, wal_dir, etc.
+ std::string directory;
+
+ // The id of the file within a single DB. Set to 0 if the file does not have
+ // a number (e.g. CURRENT)
+ uint64_t file_number = 0;
+ // The type of the file as part of a DB.
+ FileType file_type = kTempFile;
+
+ // File size in bytes. See also `trim_to_size`.
+ uint64_t size = 0;
+
+ // This feature is experimental and subject to change.
+ Temperature temperature = Temperature::kUnknown;
+
+ // The checksum of an SST file. The value is determined by the file content
+ // and the checksum algorithm used for this SST file. The checksum function
+ // is identified by file_checksum_func_name. If the checksum function is
+ // not specified, file_checksum is "0" by default.
+ std::string file_checksum;
+
+ // The name of the checksum function used to generate the file checksum
+ // value. If file checksum is not enabled (e.g., sst_file_checksum_func is
+ // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is
+ // "Unknown".
+ std::string file_checksum_func_name;
+};
+
+// Adds to FileStorageInfo the ability to capture the state of files that
+// might change in a running DB.
+struct LiveFileStorageInfo : public FileStorageInfo {
+ // If non-empty, this string represents the "saved" contents of the file
+ // for the current context. (This field is used for checkpointing the
+ // CURRENT file.) In that case, size == replacement_contents.size() and the
+ // file on disk should be ignored. If the string is empty, the file on disk
+ // should still have the "saved" contents. (See trim_to_size.)
+ std::string replacement_contents;
+
+ // If true, the file on disk is allowed to be larger than `size` but only
+ // the first `size` bytes should be used for the current context. If false,
+ // the file is corrupt if size on disk does not equal `size`.
+ bool trim_to_size = false;
+};
+
+// The metadata that describes an SST file. (Does not need to extend
+// LiveFileStorageInfo because SST files are always immutable.)
+struct SstFileMetaData : public FileStorageInfo {
+ SstFileMetaData() { file_type = kTableFile; }
+
+ SstFileMetaData(const std::string& _file_name, uint64_t _file_number,
+ const std::string& _directory, uint64_t _size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno,
+ const std::string& _smallestkey,
+ const std::string& _largestkey, uint64_t _num_reads_sampled,
+ bool _being_compacted, Temperature _temperature,
+ uint64_t _oldest_blob_file_number,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ std::string& _file_checksum,
+ std::string& _file_checksum_func_name)
+ : smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno),
+ smallestkey(_smallestkey),
+ largestkey(_largestkey),
+ num_reads_sampled(_num_reads_sampled),
+ being_compacted(_being_compacted),
+ num_entries(0),
+ num_deletions(0),
+ oldest_blob_file_number(_oldest_blob_file_number),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time) {
+ if (!_file_name.empty()) {
+ if (_file_name[0] == '/') {
+ relative_filename = _file_name.substr(1);
+ name = _file_name; // Deprecated field
+ } else {
+ relative_filename = _file_name;
+ name = std::string("/") + _file_name; // Deprecated field
+ }
+ assert(relative_filename.size() + 1 == name.size());
+ assert(relative_filename[0] != '/');
+ assert(name[0] == '/');
+ }
+ directory = _directory;
+ db_path = _directory; // Deprecated field
+ file_number = _file_number;
+ file_type = kTableFile;
+ size = _size;
+ temperature = _temperature;
+ file_checksum = _file_checksum;
+ file_checksum_func_name = _file_checksum_func_name;
+ }
+
+ SequenceNumber smallest_seqno = 0; // Smallest sequence number in file.
+ SequenceNumber largest_seqno = 0; // Largest sequence number in file.
+ std::string smallestkey; // Smallest user defined key in the file.
+ std::string largestkey; // Largest user defined key in the file.
+ uint64_t num_reads_sampled = 0; // How many times the file is read.
+ bool being_compacted =
+ false; // true if the file is currently being compacted.
+
+ uint64_t num_entries = 0;
+ uint64_t num_deletions = 0;
+
+ uint64_t oldest_blob_file_number = 0; // The id of the oldest blob file
+ // referenced by the file.
+ // An SST file may be generated by compactions whose input files may
+ // in turn be generated by earlier compactions. This is the creation time of
+ // the oldest SST file that is a compaction ancestor of this file.
+ // The timestamp is provided by SystemClock::GetCurrentTime().
+ // 0 if the information is not available.
+ //
+ // Note: for TTL blob files, it contains the start of the expiration range.
+ uint64_t oldest_ancester_time = 0;
+ // Timestamp when the SST file is created, provided by
+ // SystemClock::GetCurrentTime(). 0 if the information is not available.
+ uint64_t file_creation_time = 0;
+
+ // DEPRECATED: The name of the file within its directory with a
+ // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
+ // instead.
+ std::string name;
+
+ // DEPRECATED: replaced by `directory` in base struct
+ std::string db_path;
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+ std::string column_family_name; // Name of the column family
+ int level; // Level at which this file resides.
+ LiveFileMetaData() : column_family_name(), level(0) {}
+};
+
+// The metadata that describes a blob file.
+struct BlobMetaData {
+ BlobMetaData()
+ : blob_file_number(0),
+ blob_file_size(0),
+ total_blob_count(0),
+ total_blob_bytes(0),
+ garbage_blob_count(0),
+ garbage_blob_bytes(0) {}
+
+ BlobMetaData(uint64_t _file_number, const std::string& _file_name,
+ const std::string& _file_path, uint64_t _file_size,
+ uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+ uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes,
+ const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name)
+ : blob_file_number(_file_number),
+ blob_file_name(_file_name),
+ blob_file_path(_file_path),
+ blob_file_size(_file_size),
+ total_blob_count(_total_blob_count),
+ total_blob_bytes(_total_blob_bytes),
+ garbage_blob_count(_garbage_blob_count),
+ garbage_blob_bytes(_garbage_blob_bytes),
+ checksum_method(_file_checksum_func_name),
+ checksum_value(_file_checksum) {}
+ uint64_t blob_file_number;
+ std::string blob_file_name;
+ std::string blob_file_path;
+ uint64_t blob_file_size;
+ uint64_t total_blob_count;
+ uint64_t total_blob_bytes;
+ uint64_t garbage_blob_count;
+ uint64_t garbage_blob_bytes;
+ std::string checksum_method;
+ std::string checksum_value;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+ LevelMetaData(int _level, uint64_t _size,
+ const std::vector<SstFileMetaData>&& _files)
+ : level(_level), size(_size), files(_files) {}
+
+ // The level which this meta data describes.
+ const int level;
+ // The size of this level in bytes, which is equal to the sum of
+ // the file size of its "files".
+ const uint64_t size;
+ // The metadata of all sst files in this level.
+ const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData {
+ ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
+ ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
+ const std::vector<LevelMetaData>&& _levels)
+ : size(_size), name(_name), levels(_levels) {}
+
+ // The size of this column family in bytes, which is equal to the sum of
+ // the file size of its "levels".
+ uint64_t size;
+ // The number of files in this column family.
+ size_t file_count;
+ // The name of the column family.
+ std::string name;
+ // The metadata of all levels in this column family.
+ std::vector<LevelMetaData> levels;
+
+ // The total size of all blob files
+ uint64_t blob_file_size = 0;
+ // The number of blob files in this column family.
+ size_t blob_file_count = 0;
+ // The metadata of the blobs in this column family.
+ std::vector<BlobMetaData> blob_files;
+};
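+
+// For illustration, a read-out sketch via DB::GetColumnFamilyMetaData()
+// (assumes an open DB* db; the no-handle overload reports on the default
+// column family):
+//
+//   ColumnFamilyMetaData meta;
+//   db->GetColumnFamilyMetaData(&meta);
+//   for (const LevelMetaData& level : meta.levels) {
+//     for (const SstFileMetaData& file : level.files) {
+//       // e.g. inspect file.relative_filename, file.size,
+//       // file.being_compacted ...
+//     }
+//   }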
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamiliesWithImport().
+struct ExportImportFilesMetaData {
+ std::string db_comparator_name; // Used to safety check at import.
+ std::vector<LiveFileMetaData> files; // Vector of file metadata.
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h
new file mode 100644
index 000000000..7a4d8b5a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/options.h
@@ -0,0 +1,2113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/types.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#ifdef max
+#undef max
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class ConcurrentTaskLimiter;
+class Env;
+enum InfoLogLevel : unsigned char;
+class SstFileManager;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class MemTableRepFactory;
+class RateLimiter;
+class Slice;
+class Statistics;
+class InternalKeyComparator;
+class WalFilter;
+class FileSystem;
+
+struct Options;
+struct DbPath;
+
+using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+
+struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
+ // The function recovers options to a previous version. Only 4.6 or later
+ // versions are supported.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
+ ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ // Some functions that make it easier to optimize RocksDB.
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ // An optional cache object is passed in to be used as the block cache
+ ColumnFamilyOptions* OptimizeForSmallDb(
+ std::shared_ptr<Cache>* cache = nullptr);
+
+ // Use this if you don't need to keep the data sorted, i.e. you'll never use
+ // an iterator, only Put() and Get() API calls
+ //
+ // Not supported in ROCKSDB_LITE
+ ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
+
+ // Default values for some parameters in ColumnFamilyOptions are not
+ // optimized for heavy workloads and big datasets, which means you might
+ // observe write stalls under some conditions. As a starting point for tuning
+ // RocksDB options, use the following two functions:
+ // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+ // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+ // Universal style compaction is focused on reducing Write Amplification
+ // Factor for big data sets, but increases Space Amplification. You can learn
+ // more about the different styles here:
+ // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+ // Make sure to also call IncreaseParallelism(), which will provide the
+ // biggest performance gains.
+ // Note: we might use more memory than memtable_memory_budget during high
+ // write rate period
+ //
+ // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
+ ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+ uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+ ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+ uint64_t memtable_memory_budget = 512 * 1024 * 1024);
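+
+ // For illustration, a typical tuning sketch (the budget and thread count
+ // are example values, not recommendations):
+ //
+ //   Options options;
+ //   options.IncreaseParallelism(16);  // DBOptions helper
+ //   options.OptimizeLevelStyleCompaction(512 * 1024 * 1024);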
+
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator = BytewiseComparator();
+
+ // REQUIRES: The client must provide a merge operator if the Merge
+ // operation needs to be accessed. Calling Merge on a DB without a merge
+ // operator would result in Status::NotSupported. The client must ensure
+ // that the merge operator supplied here has the same name and *exactly*
+ // the same semantics as the merge operator provided to previous open calls
+ // on the same DB. The only exception is reserved for upgrade, where a DB
+ // previously without a merge operator is introduced to the Merge operation
+ // for the first time. It's necessary to specify a merge operator when
+ // opening the DB in this case.
+ // Default: nullptr
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
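+
+ // A hedged sketch using the AssociativeMergeOperator helper declared in
+ // rocksdb/merge_operator.h; the appending behavior, the class name, and
+ // the `cf_options` variable are illustrative, not prescribed:
+ //
+ //   class AppendOperator : public AssociativeMergeOperator {
+ //    public:
+ //     bool Merge(const Slice& /*key*/, const Slice* existing_value,
+ //                const Slice& value, std::string* new_value,
+ //                Logger* /*logger*/) const override {
+ //       new_value->assign(existing_value ? existing_value->ToString() : "");
+ //       new_value->append(value.data(), value.size());
+ //       return true;  // this merge always succeeds
+ //     }
+ //     const char* Name() const override { return "AppendOperator"; }
+ //   };
+ //
+ //   cf_options.merge_operator = std::make_shared<AppendOperator>();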
+
+ // A single CompactionFilter instance to call into during compaction.
+ // Allows an application to modify/delete a key-value during background
+ // compaction.
+ //
+ // If the client requires a new `CompactionFilter` to be used for different
+ // compaction runs and/or requires a `CompactionFilter` for table file
+ // creations outside of compaction, it can specify compaction_filter_factory
+ // instead of this option. The client should specify only one of the two.
+ // compaction_filter takes precedence over compaction_filter_factory if
+ // client specifies both.
+ //
+ // If multithreaded compaction is being used, the supplied CompactionFilter
+ // instance may be used from different threads concurrently and so should be
+ // thread-safe.
+ //
+ // Default: nullptr
+ const CompactionFilter* compaction_filter = nullptr;
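+
+ // A hedged sketch of a trivial filter that drops entries with empty values
+ // (the class name, the drop policy, and `cf_options` are illustrative; see
+ // rocksdb/compaction_filter.h for the full interface):
+ //
+ //   class DropEmptyValues : public CompactionFilter {
+ //    public:
+ //     bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+ //                 std::string* /*new_value*/,
+ //                 bool* /*value_changed*/) const override {
+ //       return value.empty();  // returning true removes this key-value
+ //     }
+ //     const char* Name() const override { return "DropEmptyValues"; }
+ //   };
+ //
+ //   // The filter must outlive the DB; a static instance is one option.
+ //   static DropEmptyValues kDropEmptyValues;
+ //   cf_options.compaction_filter = &kDropEmptyValues;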
+
+ // This is a factory that provides `CompactionFilter` objects which allow
+ // an application to modify/delete a key-value during table file creation.
+ //
+ // Unlike the `compaction_filter` option, which is used when compaction
+ // creates a table file, this factory allows using a `CompactionFilter` when a
+ // table file is created for various reasons. The factory can decide what
+ // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+ // default the decision is to use a `CompactionFilter` for
+ // `TableFileCreationReason::kCompaction` only.
+ //
+ // Each thread of work involving creating table files will create a new
+ // `CompactionFilter` when it will be used according to the above
+ // `TableFileCreationReason`-based decision. This allows the application to
+ // know about the different ongoing threads of work and makes it unnecessary
+ // for `CompactionFilter` to provide thread-safety.
+ //
+ // Default: nullptr
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
+ //
+ // Larger values increase performance, especially during bulk loads.
+ // Up to max_write_buffer_number write buffers may be held in memory
+ // at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
+ // Also, a larger write buffer will result in a longer recovery time
+ // the next time the database is opened.
+ //
+ // Note that write_buffer_size is enforced per column family.
+ // See db_write_buffer_size for sharing memory across column families.
+ //
+ // Default: 64MB
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t write_buffer_size = 64 << 20;
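+
+ // A hedged sketch of changing this at runtime (`db` is an already-open
+ // DB*; SetOptions takes options as name/value strings):
+ //
+ //   db->SetOptions({{"write_buffer_size", "134217728"}});  // 128MB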
+
+ // Compress blocks using the specified compression algorithm.
+ //
+ // Default: kSnappyCompression, if it's supported. If snappy is not linked
+ // with the library, the default is kNoCompression.
+ //
+ // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ //
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kSnappyCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ //
+ // If you do not set `compression_opts.level`, or set it to
+ // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
+ // default corresponding to `compression` as follows:
+ //
+ // - kZSTD: 3
+ // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+ // - kLZ4HCCompression: 0
+ // - For all others, we do not specify a compression level
+ //
+ // Dynamically changeable through SetOptions() API
+ CompressionType compression;
+
+ // Compression algorithm that will be used for the bottommost level that
+ // contains files. The behavior for num_levels = 1 is not well defined.
+ // Right now, with num_levels = 1, all compaction outputs will use
+ // bottommost_compression and all flush outputs still use options.compression,
+ // but the behavior is subject to change.
+ //
+ // Default: kDisableCompressionOption (Disabled)
+ CompressionType bottommost_compression = kDisableCompressionOption;
+
+ // Different options for compression algorithms used by
+ // bottommost_compression if it is enabled. To enable it, please see the
+ // definition of CompressionOptions. Behavior for num_levels = 1 is the
+ // same as options.bottommost_compression.
+ CompressionOptions bottommost_compression_opts;
+
+ // Different options for compression algorithms.
+ CompressionOptions compression_opts;
+
+ // Number of files to trigger level-0 compaction. A value <0 means that
+ // level-0 compaction will not be triggered by the number of files at all.
+ //
+ // Default: 4
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_file_num_compaction_trigger = 4;
+
+ // If non-nullptr, use the specified function to put keys in contiguous
+ // groups called "prefixes". These prefixes are used to place one
+ // representative entry for the group into the Bloom filter
+ // rather than an entry for each key (see whole_key_filtering).
+ // Under certain conditions, this enables optimizing some range queries
+ // (Iterators) in addition to some point lookups (Get/MultiGet).
+ //
+ // Together `prefix_extractor` and `comparator` must satisfy one essential
+ // property for valid prefix filtering of range queries:
+ // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+ // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+ // Then InDomain(k2) and prefix(k2) == prefix(k1)
+ //
+ // In other words, all keys with the same prefix must be in a contiguous
+ // group by comparator order, and cannot be interrupted by keys with no
+ // prefix ("out of domain"). (This makes it valid to conclude that no
+ // entries within some bounds are present if the upper and lower bounds
+ // have a common prefix and no entries with that same prefix are present.)
+ //
+ // Some other properties are recommended but not strictly required. Under
+ // most sensible comparators, the following will need to hold true to
+ // satisfy the essential property above:
+ // * "Prefix is a prefix": key.starts_with(prefix(key))
+ // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+ // Compare(prefix(k1), prefix(k2)) <= 0
+ //
+ // The next two properties ensure that seeking to a prefix allows
+ // enumerating all entries with that prefix:
+ // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+ // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
+ //
+ // Default: nullptr
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
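+
+ // A hedged sketch: group keys by their first 4 bytes, assuming keys are at
+ // least 4 bytes long (NewFixedPrefixTransform is declared in
+ // rocksdb/slice_transform.h; `cf_options` is illustrative):
+ //
+ //   cf_options.prefix_extractor.reset(NewFixedPrefixTransform(4));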
+
+ // Control maximum total data size for a level.
+ // max_bytes_for_level_base is the max total for level-1.
+ // Maximum number of bytes for level L can be calculated as
+ // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+ // For example, if max_bytes_for_level_base is 200MB, and if
+ // max_bytes_for_level_multiplier is 10, total data size for level-1
+ // will be 200MB, total file size for level-2 will be 2GB,
+ // and total file size for level-3 will be 20GB.
+ //
+ // Default: 256MB.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_bytes_for_level_base = 256 * 1048576;
+
+ // Deprecated.
+ uint64_t snap_refresh_nanos = 0;
+
+ // Disable automatic compactions. Manual compactions can still
+ // be issued on this column family
+ //
+ // Dynamically changeable through SetOptions() API
+ bool disable_auto_compactions = false;
+
+ // This is a factory that provides TableFactory objects.
+ // Default: a block-based table factory that provides a default
+ // implementation of TableBuilder and TableReader with default
+ // BlockBasedTableOptions.
+ std::shared_ptr<TableFactory> table_factory;
+
+ // A list of paths where SST files for this column family
+ // can be put into, with its target size. Similar to db_paths,
+ // newer data is placed into paths specified earlier in the
+ // vector while older data gradually moves to paths specified
+ // later in the vector.
+ // Note that, if a path is supplied to multiple column
+ // families, it would have files and total size from all
+ // the column families combined. The user should provision for the
+ // total size (from all the column families) in such cases.
+ //
+ // If left empty, db_paths will be used.
+ // Default: empty
+ std::vector<DbPath> cf_paths;
+
+ // Compaction concurrent thread limiter for the column family.
+ // If non-nullptr, use given concurrent thread limiter to control
+ // the max outstanding compaction tasks. Limiter can be shared with
+ // multiple column families across db instances.
+ //
+ // Default: nullptr
+ std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
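+
+ // A hedged sketch: share one limiter of at most 4 concurrent compactions
+ // between two column families (NewConcurrentTaskLimiter is declared in
+ // rocksdb/concurrent_task_limiter.h; the name string, the limit, and the
+ // `cf_options_*` variables are illustrative):
+ //
+ //   std::shared_ptr<ConcurrentTaskLimiter> limiter(
+ //       NewConcurrentTaskLimiter("shared_limiter", 4));
+ //   cf_options_a.compaction_thread_limiter = limiter;
+ //   cf_options_b.compaction_thread_limiter = limiter;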
+
+ // If non-nullptr, use the specified factory for a function to determine
+ // the partitioning of sst files. This helps compactions split files on
+ // interesting boundaries (key prefixes) so that propagating sst files
+ // (covering the whole key space) is less write-amplifying.
+ // THE FEATURE IS STILL EXPERIMENTAL
+ //
+ // Default: nullptr
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Create ColumnFamilyOptions with default values for all fields
+ ColumnFamilyOptions();
+ // Create ColumnFamilyOptions from Options
+ explicit ColumnFamilyOptions(const Options& options);
+
+ void Dump(Logger* log) const;
+};
+
+enum class WALRecoveryMode : char {
+ // Original levelDB recovery
+ //
+ // We tolerate the last record in any log to be incomplete due to a crash
+ // while writing it. Zeroed bytes from preallocation are also tolerated in the
+ // trailing data of any log.
+ //
+ // Use case: Applications for which updates, once applied, must not be rolled
+ // back even after a crash-recovery. In this recovery mode, RocksDB guarantees
+ // this as long as `WritableFile::Append()` writes are durable. In case the
+ // user needs the guarantee in more situations (e.g., when
+ // `WritableFile::Append()` writes to page cache, but the user desires this
+ // guarantee in face of power-loss crash-recovery), RocksDB offers various
+ // mechanisms to additionally invoke `WritableFile::Sync()` in order to
+ // strengthen the guarantee.
+ //
+ // This differs from `kPointInTimeRecovery` in that, in case a corruption is
+ // detected during recovery, this mode will refuse to open the DB. Whereas,
+ // `kPointInTimeRecovery` will stop recovery just before the corruption since
+ // that is a valid point-in-time to which to recover.
+ kTolerateCorruptedTailRecords = 0x00,
+ // Recover from clean shutdown
+ // We don't expect to find any corruption in the WAL
+ // Use case: This is ideal for unit tests and rare applications that
+ // require high consistency guarantees
+ kAbsoluteConsistency = 0x01,
+ // Recover to point-in-time consistency (default)
+ // We stop the WAL playback on discovering WAL inconsistency
+ // Use case: Ideal for systems that have a disk controller cache, like a
+ // hard disk or an SSD without a supercapacitor, that stores related data
+ kPointInTimeRecovery = 0x02,
+ // Recovery after a disaster
+ // We ignore any corruption in the WAL and try to salvage as much data as
+ // possible
+ // Use case: Ideal as a last-ditch effort to recover data, or for systems
+ // that operate on low-grade, unrelated data
+ kSkipAnyCorruptedRecords = 0x03,
+};
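+
+// A hedged sketch of selecting a recovery mode on the options passed to
+// DB::Open (the mode chosen here and the `db_options` variable are
+// illustrative):
+//
+//   DBOptions db_options;
+//   db_options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;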
+
+struct DbPath {
+ std::string path;
+ uint64_t target_size; // Target size of total files under the path, in bytes.
+
+ DbPath() : target_size(0) {}
+ DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
+extern const char* kHostnameForDbHostId;
+
+enum class CompactionServiceJobStatus : char {
+ kSuccess,
+ kFailure,
+ kUseLocal,
+};
+
+struct CompactionServiceJobInfo {
+ std::string db_name;
+ std::string db_id;
+ std::string db_session_id;
+ uint64_t job_id; // job_id is only unique within the current DB and
+ // session; restarting the DB will reset the job_id. `db_id`
+ // and `db_session_id` can help you build a unique id across
+ // different DBs and sessions.
+
+ Env::Priority priority;
+
+ CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+ std::string db_session_id_, uint64_t job_id_,
+ Env::Priority priority_)
+ : db_name(std::move(db_name_)),
+ db_id(std::move(db_id_)),
+ db_session_id(std::move(db_session_id_)),
+ job_id(job_id_),
+ priority(priority_) {}
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+ static const char* Type() { return "CompactionService"; }
+
+ // Returns the name of this compaction service.
+ const char* Name() const override = 0;
+
+ // Start the remote compaction with `compaction_service_input`, which can be
+ // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+ // information the user might want to know, which includes `job_id`.
+ virtual CompactionServiceJobStatus StartV2(
+ const CompactionServiceJobInfo& /*info*/,
+ const std::string& /*compaction_service_input*/) {
+ return CompactionServiceJobStatus::kUseLocal;
+ }
+
+ // Wait for remote compaction to finish.
+ virtual CompactionServiceJobStatus WaitForCompleteV2(
+ const CompactionServiceJobInfo& /*info*/,
+ std::string* /*compaction_service_result*/) {
+ return CompactionServiceJobStatus::kUseLocal;
+ }
+
+ ~CompactionService() override = default;
+};
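+
+// A hedged sketch of a CompactionService subclass; the class name and the
+// remote-shipping step are illustrative, not part of the API:
+//
+//   class MyCompactionService : public CompactionService {
+//    public:
+//     const char* Name() const override { return "MyCompactionService"; }
+//     CompactionServiceJobStatus StartV2(
+//         const CompactionServiceJobInfo& info,
+//         const std::string& compaction_service_input) override {
+//       // Ship `compaction_service_input` to a remote worker keyed by
+//       // info.job_id; on any failure, fall back to local compaction.
+//       return CompactionServiceJobStatus::kUseLocal;
+//     }
+//   };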
+
+struct DBOptions {
+ // The function recovers options to their values as of version 4.6.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
+ DBOptions* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ // Some functions that make it easier to optimize RocksDB
+
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ // An optional cache object is passed in; the memory used by the memtable
+ // will be charged to it.
+ DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
+
+#ifndef ROCKSDB_LITE
+ // By default, RocksDB uses only one background thread for flush and
+ // compaction. Calling this function will set it up such that a total of
+ // `total_threads` threads is used. A good value for `total_threads` is the number of
+ // cores. You almost definitely want to call this function if your system is
+ // bottlenecked by RocksDB.
+ DBOptions* IncreaseParallelism(int total_threads = 16);
+#endif // ROCKSDB_LITE
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing = false;
+
+ // If true, missing column families will be automatically created.
+ // Default: false
+ bool create_missing_column_families = false;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists = false;
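+
+ // A hedged sketch of a typical open-or-create flow using the options
+ // above (the path is hypothetical):
+ //
+ //   Options options;
+ //   options.create_if_missing = true;
+ //   DB* db = nullptr;
+ //   Status s = DB::Open(options, "/tmp/example_db", &db);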
+
+ // If true, RocksDB will aggressively check consistency of the data.
+ // Also, if any of the writes to the database fails (Put, Delete, Merge,
+ // Write), the database will switch to read-only mode and fail all other
+ // Write operations.
+ // In most cases you want this to be set to true.
+ // Default: true
+ bool paranoid_checks = true;
+
+ // If true, during memtable flush, RocksDB will validate total entries
+ // read in flush, and compare with counter inserted into it.
+ // The option is here to turn the feature off in case this new validation
+ // feature has a bug.
+ // Default: true
+ bool flush_verify_memtable_count = true;
+
+ // If true, the log numbers and sizes of the synced WALs are tracked
+ // in MANIFEST. During DB recovery, if a synced WAL is missing
+ // from disk, or the WAL's size does not match the recorded size in
+ // MANIFEST, an error will be reported and the recovery will be aborted.
+ //
+ // This is one additional protection against WAL corruption besides the
+ // per-WAL-entry checksum.
+ //
+ // Note that this option does not work with a secondary instance.
+ // Currently, only the syncing of closed WALs is tracked. Calling
+ // `DB::SyncWAL()`, etc. or writing with `WriteOptions::sync=true` to sync
+ // the live WAL is not tracked for performance/efficiency reasons.
+ //
+ // Default: false
+ bool track_and_verify_wals_in_manifest = false;
+
+ // If true, verifies the SST unique id between MANIFEST and actual file
+ // each time an SST file is opened. This check ensures an SST file is not
+ // overwritten or misplaced. A corruption error will be reported if a
+ // mismatch is detected, but only when the MANIFEST tracks the unique id,
+ // which starts from RocksDB version 7.3. Although the tracked internal
+ // unique id is related to the one returned by
+ // GetUniqueIdFromTableProperties, that is subject to change.
+ // NOTE: verification is currently only done on SST files using block-based
+ // table format.
+ //
+ // Setting to false should only be needed in case of unexpected problems.
+ //
+ // Although an early version of this option opened all SST files for
+ // verification on DB::Open, that is no longer guaranteed. However, as
+ // documented in an above option, if max_open_files is -1, DB will open all
+ // files on DB::Open().
+ //
+ // Default: true
+ bool verify_sst_unique_id_in_manifest = true;
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc. In the near
+ // future, support for doing storage operations such as read/write files
+ // through env will be deprecated in favor of file_system (see below)
+ // Default: Env::Default()
+ Env* env = Env::Default();
+
+ // Limits internal file read/write bandwidth:
+ //
+ // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+ // - Compaction requests read and write bandwidth at
+ // `Env::IOPriority::IO_LOW`
+ // - Reads associated with a `ReadOptions` can be charged at
+ // `ReadOptions::rate_limiter_priority` (see that option's API doc for usage
+ // and limitations).
+ // - Writes associated with a `WriteOptions` can be charged at
+ // `WriteOptions::rate_limiter_priority` (see that option's API doc for
+ // usage and limitations).
+ //
+ // Rate limiting is disabled if nullptr. If rate limiter is enabled,
+ // bytes_per_sync is set to 1MB by default.
+ //
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> rate_limiter = nullptr;
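+
+ // A hedged sketch: cap background I/O at roughly 10MB/s
+ // (NewGenericRateLimiter is declared in rocksdb/rate_limiter.h; the rate
+ // and the `db_options` variable are illustrative):
+ //
+ //   db_options.rate_limiter.reset(NewGenericRateLimiter(10 << 20));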
+
+ // Use to track SST files and control their file deletion rate.
+ //
+ // Features:
+ // - Throttle the deletion rate of the SST files.
+ // - Keep track of the total size of all SST files.
+ // - Set a maximum allowed space limit for SST files; when it is reached,
+ // the DB won't do any further flushes or compactions and will set the
+ // background error.
+ // - Can be shared between multiple dbs.
+ // Limitations:
+ // - Only tracks and throttles deletes of SST files in the
+ // first db_path (db_name if db_paths is empty).
+ //
+ // Default: nullptr
+ std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
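+
+ // A hedged sketch (NewSstFileManager is declared in
+ // rocksdb/sst_file_manager.h; the 1MB/s deletion cap and `db_options` are
+ // illustrative):
+ //
+ //   db_options.sst_file_manager.reset(NewSstFileManager(Env::Default()));
+ //   db_options.sst_file_manager->SetDeleteRateBytesPerSecond(1 << 20);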
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-nullptr, or to a file stored
+ // in the same directory as the DB contents if info_log is nullptr.
+ // Default: nullptr
+ std::shared_ptr<Logger> info_log = nullptr;
+
+#ifdef NDEBUG
+ InfoLogLevel info_log_level = INFO_LEVEL;
+#else
+ InfoLogLevel info_log_level = DEBUG_LEVEL;
+#endif // NDEBUG
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set. Value -1 means
+ // files opened are always kept open. You can estimate number of files based
+ // on target_file_size_base and target_file_size_multiplier for level-based
+ // compaction. For universal-style compaction, you can usually set it to -1.
+ //
+ // A high value or -1 for this option can cause high memory usage.
+ // See BlockBasedTableOptions::cache_usage_options to constrain
+ // memory usage in case of block based table format.
+ //
+ // Default: -1
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_open_files = -1;
+
+ // If max_open_files is -1, DB will open all files on DB::Open(). You can
+ // use this option to increase the number of threads used to open the files.
+ // Default: 16
+ int max_file_opening_threads = 16;
+
+ // Once write-ahead logs exceed this size, we will start forcing the flush of
+ // column families whose memtables are backed by the oldest live WAL file
+ // (i.e. the ones that are causing all the space amplification). If set to 0
+ // (default), we will dynamically choose the WAL size limit to be
+ // [sum of all write_buffer_size * max_write_buffer_number] * 4
+ //
+ // For example, with 15 column families, each with
+ // write_buffer_size = 128 MB
+ // max_write_buffer_number = 6
+ // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+ //
+ // The RocksDB wiki has some discussion about how the WAL interacts
+ // with memtables and flushing of column families.
+ // https://github.com/facebook/rocksdb/wiki/Column-Families
+ //
+ // This option takes effect only when there is more than one column
+ // family; otherwise the WAL size is dictated by the write_buffer_size.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t max_total_wal_size = 0;
+
+ // If non-null, then we should collect metrics about database operations
+ std::shared_ptr<Statistics> statistics = nullptr;
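+
+ // A hedged sketch (CreateDBStatistics is declared in rocksdb/statistics.h;
+ // `db_options` is illustrative):
+ //
+ //   db_options.statistics = CreateDBStatistics();
+ //   // ... after some load:
+ //   uint64_t hits = db_options.statistics->getTickerCount(BLOCK_CACHE_HIT);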
+
+ // By default, writes to stable storage use fdatasync (on platforms
+ // where this function is available). If this option is true,
+ // fsync is used instead.
+ //
+ // fsync and fdatasync are equally safe for our purposes and fdatasync is
+ // faster, so it is rarely necessary to set this option. It is provided
+ // as a workaround for kernel/filesystem bugs, such as one that affected
+ // fdatasync with ext4 in kernel versions prior to 3.7.
+ bool use_fsync = false;
+
+ // A list of paths where SST files can be put into, with its target size.
+ // Newer data is placed into paths specified earlier in the vector while
+ // older data gradually moves to paths specified later in the vector.
+ //
+ // For example, if you have a flash device with 10GB allocated for the DB,
+ // as well as a hard drive of 2TB, you should configure it to be:
+ // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+ //
+ // The system will try to guarantee data under each path is close to but
+ // not larger than the target size. But current and future file sizes used
+ // in determining where to place a file are based on best-effort estimation,
+ // which means there is a chance that the actual size under the directory
+ // is slightly more than the target size under some workloads. The user
+ // should allow some buffer room for those cases.
+ //
+ // If none of the paths has sufficient room to place a file, the file will
+ // be placed in the last path anyway, regardless of the target size.
+ //
+ // Placing newer data in earlier paths is also best-effort. The user should
+ // expect files to be placed in higher levels in some extreme cases.
+ //
+ // If left empty, only one path will be used, which is db_name passed when
+ // opening the DB.
+ // Default: empty
+ std::vector<DbPath> db_paths;
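+
+ // A hedged sketch of the flash + hard drive layout described above (paths
+ // and sizes are hypothetical; DbPath is defined earlier in this header and
+ // `db_options` is illustrative):
+ //
+ //   db_options.db_paths.emplace_back("/flash_path", 10ull << 30);  // 10GB
+ //   db_options.db_paths.emplace_back("/hard_drive", 2ull << 40);   // 2TB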
+
+ // This specifies the info LOG dir.
+ // If it is empty, the log files will be in the same dir as data.
+ // If it is non-empty, the log files will be in the specified dir,
+ // and the db data dir's absolute path will be used as the log file
+ // name's prefix.
+ std::string db_log_dir = "";
+
+ // This specifies the absolute dir path for write-ahead logs (WAL).
+ // If it is empty, the log files will be in the same dir as data;
+ // dbname is used as the data dir by default.
+ // If it is non-empty, the log files will be kept in the specified dir.
+ // When destroying the db,
+ // all log files in wal_dir and the dir itself are deleted.
+ std::string wal_dir = "";
+
+ // The periodicity with which obsolete files get deleted. The default
+ // value is 6 hours. The files that go out of scope during the compaction
+ // process will still get automatically deleted on every compaction,
+ // regardless of this setting.
+ //
+ // Default: 6 hours
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
+
+ // Maximum number of concurrent background jobs (compactions and flushes).
+ //
+ // Default: 2
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_background_jobs = 2;
+
+ // DEPRECATED: RocksDB automatically decides this based on the
+ // value of max_background_jobs. For backwards compatibility we will set
+ // `max_background_jobs = max_background_compactions + max_background_flushes`
+ // in the case where user sets at least one of `max_background_compactions` or
+ // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
+ //
+ // Maximum number of concurrent background compaction jobs, submitted to
+ // the default LOW priority thread pool.
+ //
+ // If you're increasing this, also consider increasing number of threads in
+ // LOW priority thread pool. For more information, see
+ // Env::SetBackgroundThreads
+ //
+ // Default: -1
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_background_compactions = -1;
+
+ // This value represents the maximum number of threads that will
+ // concurrently perform a compaction job by breaking it into multiple,
+ // smaller ones that are run simultaneously.
+ // Default: 1 (i.e. no subcompactions)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint32_t max_subcompactions = 1;
+
+ // DEPRECATED: RocksDB automatically decides this based on the
+ // value of max_background_jobs. For backwards compatibility we will set
+ // `max_background_jobs = max_background_compactions + max_background_flushes`
+ // in the case where user sets at least one of `max_background_compactions` or
+ // `max_background_flushes`.
+ //
+ // Maximum number of concurrent background memtable flush jobs, submitted by
+ // default to the HIGH priority thread pool. If the HIGH priority thread pool
+ // is configured to have zero threads, flush jobs will share the LOW priority
+ // thread pool with compaction jobs.
+ //
+ // It is important to use both thread pools when the same Env is shared by
+ // multiple db instances. Without a separate pool, long running compaction
+ // jobs could potentially block memtable flush jobs of other db instances,
+ // leading to unnecessary Put stalls.
+ //
+ // If you're increasing this, also consider increasing number of threads in
+ // HIGH priority thread pool. For more information, see
+ // Env::SetBackgroundThreads
+ // Default: -1
+ int max_background_flushes = -1;
+
+ // Specify the maximal size of the info log file. If the log file
+ // is larger than `max_log_file_size`, a new info log file will
+ // be created.
+ // If max_log_file_size == 0, all logs will be written to one
+ // log file.
+ size_t max_log_file_size = 0;
+
+ // Time for the info log file to roll (in seconds).
+ // If specified with non-zero value, log file will be rolled
+ // if it has been active longer than `log_file_time_to_roll`.
+ // Default: 0 (disabled)
+ // Not supported in ROCKSDB_LITE mode!
+ size_t log_file_time_to_roll = 0;
+
+ // Maximal number of info log files to be kept.
+ // Default: 1000
+ size_t keep_log_file_num = 1000;
+
+ // Recycle log files.
+ // If non-zero, we will reuse previously written log files for new
+ // logs, overwriting the old data. The value indicates how many
+ // such files we will keep around at any point in time for later
+ // use. This is more efficient because the blocks are already
+ // allocated and fdatasync does not need to update the inode after
+ // each write.
+ // Default: 0
+ size_t recycle_log_file_num = 0;
+
+ // The manifest file is rolled over on reaching this limit.
+ // The older manifest file will be deleted.
+ // The default value is 1GB so that the manifest file can grow, but not
+ // reach the limit of storage capacity.
+ uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+
+ // Number of shards used for table cache.
+ int table_cache_numshardbits = 6;
+
+ // The following two fields affect how archived logs will be deleted.
+ // 1. If both set to 0, logs will be deleted asap and will not get into
+ // the archive.
+ // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+ // WAL files will be checked every 10 min and if the total size is greater
+ // than WAL_size_limit_MB, they will be deleted starting with the
+ // earliest until size_limit is met. All empty files will be deleted.
+ // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+ // WAL files will be checked every WAL_ttl_seconds / 2 and those that
+ // are older than WAL_ttl_seconds will be deleted.
+ // 4. If both are not 0, WAL files will be checked every 10 min and both
+ // checks will be performed with ttl being first.
+ uint64_t WAL_ttl_seconds = 0;
+ uint64_t WAL_size_limit_MB = 0;
+
+ // Number of bytes to preallocate (via fallocate) the manifest
+ // files. Default is 4 MB, which is reasonable to reduce random IO
+ // as well as prevent overallocation for mounts that preallocate
+ // large amounts of data (such as xfs's allocsize option).
+ size_t manifest_preallocation_size = 4 * 1024 * 1024;
+
+ // Allow the OS to mmap files for reading sst tables.
+ // Not recommended for 32-bit OS.
+ // When the option is set to true and compression is disabled, the blocks
+ // will not be copied and will be read directly from the mmap-ed memory
+ // area, and the block will not be inserted into the block cache. However,
+ // checksums will still be checked if ReadOptions.verify_checksums is set
+ // to true. This means a checksum check happens every time a block is
+ // read, more often than in the setup where the option is set to false
+ // and the block cache is used. The common use of the option is to run
+ // RocksDB on ramfs, where
+ // checksum verification is usually not needed.
+ // Default: false
+ bool allow_mmap_reads = false;
+
+ // Allow the OS to mmap files for writing.
+ // DB::SyncWAL() only works if this is set to false.
+ // Default: false
+ bool allow_mmap_writes = false;
+
+ // Enable direct I/O mode for reads/writes.
+ // These options may or may not improve performance depending on the use
+ // case.
+ //
+ // Files will be opened in "direct I/O" mode,
+ // which means that data read from or written to the disk will not be
+ // cached or buffered. The hardware buffer of the devices may however
+ // still be used. Memory mapped files are not impacted by these parameters.
+
+ // Use O_DIRECT for user and compaction reads.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool use_direct_reads = false;
+
+ // Use O_DIRECT for writes in background flush and compactions.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool use_direct_io_for_flush_and_compaction = false;
+
+ // If false, fallocate() calls are bypassed, which disables file
+ // preallocation. The file space preallocation is used to increase the file
+ // write/append performance. By default, RocksDB preallocates space for WAL,
+ // SST, and Manifest files; the extra space is truncated when the file is
+ // written.
+ // Warning: if you're using btrfs, we would recommend setting
+ // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+ // allocated space cannot be freed, which could be significant if you have
+ // lots of files. More details about this limitation:
+ // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
+ bool allow_fallocate = true;
+
+ // Prevent child processes from inheriting open files. Default: true
+ bool is_fd_close_on_exec = true;
+
+ // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+ //
+ // Default: 600 (10 min)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ unsigned int stats_dump_period_sec = 600;
+
+ // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
+ // Default: 600
+ unsigned int stats_persist_period_sec = 600;
+
+ // If true, automatically persist stats to a hidden column family (column
+ // family name: ___rocksdb_stats_history___) every
+ // stats_persist_period_sec seconds; otherwise, write to an in-memory
+ // struct. User can query through `GetStatsHistory` API.
+ // If a user attempts to create a column family with the same name on a DB
+ // which has previously set persist_stats_to_disk to true, the column family
+ // creation will fail, but the hidden column family will survive, as well
+ // as the previously persisted statistics.
+ // When persisting stats to disk, the stat name will be limited to 100 bytes.
+ // Default: false
+ bool persist_stats_to_disk = false;
+
+ // if not zero, periodically take stats snapshots and store them in memory;
+ // the memory size for stats snapshots is capped at stats_history_buffer_size
+ // Default: 1MB
+ size_t stats_history_buffer_size = 1024 * 1024;
+
+ // If set to true, this will hint the underlying file system that the file
+ // access pattern is random when an SST file is opened.
+ // Default: true
+ bool advise_random_on_open = true;
+
+ // Amount of data to build up in memtables across all column
+ // families before writing to disk.
+ //
+ // This is distinct from write_buffer_size, which enforces a limit
+ // for a single memtable.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: 0 (disabled)
+ size_t db_write_buffer_size = 0;
+
+ // The memory usage of memtables will be reported to this object. The same
+ // object can be passed into multiple DBs, and it will track the sum of the
+ // sizes of all the DBs. If the total size of all live memtables of all the
+ // DBs exceeds
+ // a limit, a flush will be triggered in the next DB to which the next write
+ // is issued, as long as there is one or more column family not already
+ // flushing.
+ //
+ // If the object is only passed to one DB, the behavior is the same as
+ // db_write_buffer_size. When write_buffer_manager is set, the value set will
+ // override db_write_buffer_size.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: null
+ std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
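+
+ // A hedged sketch: share a 1GB memtable budget across two DBs (constructor
+ // per rocksdb/write_buffer_manager.h; the budget and the `db_options_*`
+ // variables are illustrative):
+ //
+ //   auto wbm = std::make_shared<WriteBufferManager>(1ull << 30);
+ //   db_options_one.write_buffer_manager = wbm;
+ //   db_options_two.write_buffer_manager = wbm;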
+
+ // Specify the file access pattern once a compaction is started.
+ // It will be applied to all input files of a compaction.
+ // Default: NORMAL
+ enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
+ AccessHint access_hint_on_compaction_start = NORMAL;
+
+ // If non-zero, we perform bigger reads when doing compaction. If you're
+ // running RocksDB on spinning disks, you should set this to at least 2MB.
+ // That way RocksDB's compaction is doing sequential instead of random reads.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t compaction_readahead_size = 0;
+
+ // This is a maximum buffer size that is used by WinMmapReadableFile in
+ // unbuffered disk I/O mode. We need to maintain an aligned buffer for
+ // reads. We allow the buffer to grow until the specified value and then
+ // for bigger requests allocate one shot buffers. In unbuffered mode we
+ // always bypass the read-ahead buffer at ReadaheadRandomAccessFile.
+ // When read-ahead is required we then make use of compaction_readahead_size
+ // value and always try to read ahead. With read-ahead we always
+ // pre-allocate buffer to the size instead of growing it up to a limit.
+ //
+ // This option is currently honored only on Windows
+ //
+ // Default: 1 MB
+ //
+ // Special value: 0 - means do not maintain per instance buffer. Allocate
+ // per request buffer and avoid locking.
+ size_t random_access_max_buffer_size = 1024 * 1024;
+
+ // This is the maximum buffer size that is used by WritableFileWriter.
+ // With direct IO, we need to maintain an aligned buffer for writes.
+ // We allow the buffer to grow until its size hits the limit in buffered
+ // IO, and fix the buffer size when using direct IO to ensure alignment of
+ // write requests if the logical sector size is unusual.
+ //
+ // Default: 1024 * 1024 (1 MB)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t writable_file_max_buffer_size = 1024 * 1024;
+
+ // Use adaptive mutex, which spins in the user space before resorting
+ // to kernel. This could reduce context switch when the mutex is not
+ // heavily contended. However, if the mutex is hot, we could end up
+ // wasting spin time.
+ // Default: false
+ bool use_adaptive_mutex = false;
+
+ // Create DBOptions with default values for all fields
+ DBOptions();
+ // Create DBOptions from Options
+ explicit DBOptions(const Options& options);
+
+ void Dump(Logger* log) const;
+
+ // Allows OS to incrementally sync files to disk while they are being
+ // written, asynchronously, in the background. This operation can be used
+ // to smooth out write I/Os over time. Users shouldn't rely on it for
+ // persistence guarantee.
+ // Issue one request for every bytes_per_sync written. 0 turns it off.
+ //
+ // You may consider using rate_limiter to regulate write rate to device.
+ // When the rate limiter is enabled, it automatically sets bytes_per_sync
+ // to 1MB.
+ //
+ // This option applies to table files
+ //
+ // Default: 0, turned off
+ //
+ // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t bytes_per_sync = 0;
+
+ // Same as bytes_per_sync, but applies to WAL files
+ //
+ // Default: 0, turned off
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t wal_bytes_per_sync = 0;
+
+ // When true, guarantees WAL files have at most `wal_bytes_per_sync`
+ // bytes submitted for writeback at any given time, and SST files have at most
+ // `bytes_per_sync` bytes pending writeback at any given time. This can be
+ // used to handle cases where processing speed exceeds I/O speed during file
+ // generation, which can lead to a huge sync when the file is finished, even
+ // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
+ //
+ // - If `sync_file_range` is supported it achieves this by waiting for any
+ // prior `sync_file_range`s to finish before proceeding. In this way,
+ // processing (compression, etc.) can proceed uninhibited in the gap
+ // between `sync_file_range`s, and we block only when I/O falls behind.
+ // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+ // always blocks, thus preventing the interleaving of I/O and processing.
+ //
+ // Note: Enabling this option does not provide any additional persistence
+ // guarantees, as it may use `sync_file_range`, which does not write out
+ // metadata.
+ //
+ // Default: false
+ bool strict_bytes_per_sync = false;
+
+ // A vector of EventListeners whose callback functions will be called
+ // when specific RocksDB events happen.
+ std::vector<std::shared_ptr<EventListener>> listeners;
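+
+ // A hedged sketch of a listener that records completed flushes (the class
+ // name, its body, and `db_options` are illustrative; see
+ // rocksdb/listener.h for the full interface):
+ //
+ //   class FlushLogger : public EventListener {
+ //    public:
+ //     void OnFlushCompleted(DB* /*db*/,
+ //                           const FlushJobInfo& info) override {
+ //       // e.g. record info.cf_name and info.file_path somewhere
+ //     }
+ //   };
+ //   db_options.listeners.push_back(std::make_shared<FlushLogger>());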
+
+ // If true, then the status of the threads involved in this DB will
+ // be tracked and available via GetThreadList() API.
+ //
+ // Default: false
+ bool enable_thread_tracking = false;
+
+ // The limited write rate to DB if soft_pending_compaction_bytes_limit or
+ // level0_slowdown_writes_trigger is triggered, or we are writing to the
+ // last mem table allowed and we allow more than 3 mem tables. It is
+ // calculated using size of user write requests before compression.
+ // RocksDB may decide to slow down more if the compaction still
+ // gets behind further.
+ // If the value is 0, we will infer a value from the `rate_limiter` value
+ // if it is not empty, or use 16MB if `rate_limiter` is empty. Note that
+ // if users change the rate in `rate_limiter` after DB is opened,
+ // `delayed_write_rate` won't be adjusted.
+ //
+ // Unit: byte per second.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t delayed_write_rate = 0;
+
+ // By default, a single write thread queue is maintained. The thread that
+ // gets to the head of the queue becomes the write batch group leader and
+ // is responsible for writing to the WAL and memtable for the batch group.
+ //
+ // If enable_pipelined_write is true, separate write thread queues are
+ // maintained for WAL writes and memtable writes. A write thread first
+ // enters the WAL writer queue and then the memtable writer queue. A
+ // pending thread on the WAL writer queue thus only has to wait for
+ // previous writers to finish their WAL writing but not the memtable
+ // writing. Enabling the feature may improve write throughput and reduce
+ // latency of the prepare phase of two-phase commit.
+ //
+ // Default: false
+ bool enable_pipelined_write = false;
+
+ // Setting unordered_write to true trades the immutability guarantee of
+ // snapshots for higher write throughput. This violates the repeatability
+ // one expects from ::Get from a snapshot, as well as
+ // ::MultiGet and Iterator's consistent-point-in-time view property.
+ // If the application cannot tolerate the relaxed guarantees, it can implement
+ // its own mechanisms to work around that and yet benefit from the higher
+ // throughput. Using TransactionDB with WRITE_PREPARED write policy and
+ // two_write_queues=true is one way to achieve immutable snapshots despite
+ // unordered_write.
+ //
+ // By default, i.e., when it is false, rocksdb does not advance the sequence
+ // number for new snapshots unless all the writes with lower sequence numbers
+ // are already finished. This provides the immutability that we expect from
+ // snapshots. Moreover, since Iterator and MultiGet internally depend on
+ // snapshots, the snapshot immutability results in Iterator and MultiGet
+ // offering a consistent-point-in-time view. If set to true, although
+ // Read-Your-Own-Write property is still provided, the snapshot immutability
+ // property is relaxed: the writes issued after the snapshot is obtained (with
+ // larger sequence numbers) will still not be visible to the reads from that
+ // snapshot; however, there still might be pending writes (with lower sequence
+ // numbers) that will change the state visible to the snapshot after they
+ // land in the memtable.
+ //
+ // Default: false
+ bool unordered_write = false;
+
+ // If true, allow multi-writers to update mem tables in parallel.
+ // Only some memtable_factory-s support concurrent writes; currently it
+ // is implemented only for SkipListFactory. Concurrent memtable writes
+ // are not compatible with inplace_update_support or filter_deletes.
+ // It is strongly recommended to set enable_write_thread_adaptive_yield
+ // if you are going to use this feature.
+ //
+ // Default: true
+ bool allow_concurrent_memtable_write = true;
+
+ // If true, threads synchronizing with the write batch group leader will
+ // wait for up to write_thread_max_yield_usec before blocking on a mutex.
+ // This can substantially improve throughput for concurrent workloads,
+ // regardless of whether allow_concurrent_memtable_write is enabled.
+ //
+ // Default: true
+ bool enable_write_thread_adaptive_yield = true;
+
+ // The maximum limit of number of bytes that are written in a single batch
+ // of WAL or memtable write. It is followed when the leader write size
+ // is larger than 1/8 of this limit.
+ //
+ // Default: 1 MB
+ uint64_t max_write_batch_group_size_bytes = 1 << 20;
+
+ // The maximum number of microseconds that a write operation will use
+ // a yielding spin loop to coordinate with other write threads before
+ // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
+ // set properly) increasing this value is likely to increase RocksDB
+ // throughput at the expense of increased CPU usage.
+ //
+ // Default: 100
+ uint64_t write_thread_max_yield_usec = 100;
+
+ // The latency in microseconds after which a std::this_thread::yield
+ // call (sched_yield on Linux) is considered to be a signal that
+ // other processes or threads would like to use the current core.
+ // Increasing this makes writer threads more likely to take CPU
+ // by spinning, which will show up as an increase in the number of
+ // involuntary context switches.
+ //
+ // Default: 3
+ uint64_t write_thread_slow_yield_usec = 3;
+
+ // If true, then DB::Open() will not update the statistics used to optimize
+ // compaction decision by loading table properties from many files.
+ // Turning off this feature (by setting this option to true) will improve
+ // DB::Open time, especially in a disk environment.
+ //
+ // Default: false
+ bool skip_stats_update_on_db_open = false;
+
+ // If true, then DB::Open() will not fetch and check sizes of all sst files.
+ // This may significantly speed up startup if there are many sst files,
+ // especially when using non-default Env with expensive GetFileSize().
+ // We'll still check that all required sst files exist.
+ // If paranoid_checks is false, this option is ignored, and sst files are
+ // not checked at all.
+ //
+ // Default: false
+ bool skip_checking_sst_file_sizes_on_db_open = false;
+
+ // Recovery mode to control the consistency while replaying WAL
+ // Default: kPointInTimeRecovery
+ WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ // If set to false, then recovery will fail when a prepared
+ // transaction is encountered in the WAL.
+ bool allow_2pc = false;
+
+ // A global cache for table-level rows.
+ // Default: nullptr (disabled)
+ // Not supported in ROCKSDB_LITE mode!
+ std::shared_ptr<Cache> row_cache = nullptr;
+
+#ifndef ROCKSDB_LITE
+ // A filter object supplied to be invoked while processing write-ahead-logs
+ // (WALs) during recovery. The filter provides a way to inspect log
+ // records, ignoring a particular record or skipping replay.
+ // The filter is invoked at startup and is currently invoked from a
+ // single thread.
+ WalFilter* wal_filter = nullptr;
+#endif // ROCKSDB_LITE
+
+ // If true, then DB::Open / CreateColumnFamily / DropColumnFamily /
+ // SetOptions will fail if the options file is not properly persisted.
+ //
+ // DEFAULT: false
+ bool fail_if_options_file_error = false;
+
+ // If true, then print malloc stats together with rocksdb.stats
+ // when printing to LOG.
+ // DEFAULT: false
+ bool dump_malloc_stats = false;
+
+ // By default RocksDB replays WAL logs and flushes them on DB open, which
+ // may create very small SST files. If this option is enabled, RocksDB will
+ // try to avoid (but not guarantee to avoid) flushing during recovery.
+ // Also, existing WAL logs will be kept, so that if a crash happened before
+ // flush, we still have logs to recover from.
+ //
+ // DEFAULT: false
+ bool avoid_flush_during_recovery = false;
+
+ // By default RocksDB will flush all memtables on DB close if there is
+ // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
+ // speed up DB close. Unpersisted data WILL BE LOST.
+ //
+ // DEFAULT: false
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ bool avoid_flush_during_shutdown = false;
+
+ // Set this option to true during creation of database if you want
+ // to be able to ingest behind (call IngestExternalFile() skipping keys
+ // that already exist, rather than overwriting matching keys).
+ // Setting this option to true will affect 3 things:
+ // 1) Disable some internal optimizations around SST file compression.
+ // 2) Reserve the bottom-most level for ingested files only.
+ // 3) Require num_levels to be >= 3 if this option is turned on.
+ //
+ // DEFAULT: false
+ // Immutable.
+ bool allow_ingest_behind = false;
+
+ // If enabled it uses two queues for writes, one for the ones with
+ // disable_memtable and one for the ones that also write to memtable. This
+ // allows the memtable writes not to lag behind other writes. It can be used
+ // to optimize MySQL 2PC in which only the commits, which are serial, write to
+ // memtable.
+ bool two_write_queues = false;
+
+ // If true WAL is not flushed automatically after each write. Instead it
+ // relies on manual invocation of FlushWAL to write the WAL buffer to its
+ // file.
+ bool manual_wal_flush = false;
+
+ // This feature is WORK IN PROGRESS.
+ // If enabled, WAL records will be compressed before they are written.
+ // Only zstd is supported. Compressed WAL records will be read in supported
+ // versions regardless of the wal_compression settings.
+ CompressionType wal_compression = kNoCompression;
+
+ // If true, RocksDB supports flushing multiple column families and committing
+ // their results atomically to MANIFEST. Note that it is not
+ // necessary to set atomic_flush to true if WAL is always enabled since WAL
+ // allows the database to be restored to the last persistent state in WAL.
+ // This option is useful when there are column families with writes NOT
+ // protected by WAL.
+ // For manual flush, application has to specify which column families to
+ // flush atomically in DB::Flush.
+ // For auto-triggered flush, RocksDB atomically flushes ALL column families.
+ //
+ // Currently, any WAL-enabled writes after atomic flush may be replayed
+ // independently if the process crashes later and tries to recover.
+ bool atomic_flush = false;
+
+ // If true, working threads may avoid doing unnecessary and long-latency
+ // operations (such as deleting obsolete files directly or deleting
+ // memtables) and will instead schedule a background job to do it.
+ // Use it if you're latency-sensitive.
+ // If set to true, takes precedence over
+ // ReadOptions::background_purge_on_iterator_cleanup.
+ bool avoid_unnecessary_blocking_io = false;
+
+ // Historically, the DB ID has always been stored in the Identity file in
+ // the DB folder. If this flag is true, the DB ID is written to the
+ // Manifest file in addition to the Identity file. Doing this solves 2
+ // problems:
+ // 1. We don't checksum the Identity file, whereas the Manifest file is
+ // checksummed.
+ // 2. Since the Manifest file is the source of truth for the DB, the DB ID
+ // will sit with the source of truth. Previously the Identity file could
+ // be copied independently of the Manifest, and that can result in a
+ // wrong DB ID.
+ // We recommend setting this flag to true.
+ // Default: false
+ bool write_dbid_to_manifest = false;
+
+ // The number of bytes to prefetch when reading the log. This is mostly
+ // useful for reading a remotely located log, as it can reduce the number
+ // of round-trips. If 0, then prefetching is disabled.
+ //
+ // Default: 0
+ size_t log_readahead_size = 0;
+
+ // If the user does NOT provide a checksum generator factory, file
+ // checksums will NOT be used. A new file checksum generator object will be
+ // created when an SST file is created. Therefore, each created
+ // FileChecksumGenerator will only be used from a single thread and so does
+ // not need to be thread-safe.
+ //
+ // Default: nullptr
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ // By default, RocksDB recovery fails if any table/blob file referenced in
+ // the final version reconstructed from the MANIFEST is missing after
+ // scanning the MANIFEST pointed to by the CURRENT file. It can also fail
+ // if verification of the unique SST id fails. Best-efforts recovery is
+ // another recovery mode that does not necessarily fail when certain
+ // table/blob files are missing/corrupted or have a mismatched unique id
+ // table property. Instead, best-efforts recovery recovers each column
+ // family to a point in the MANIFEST that corresponds to a version. In such
+ // a version, all valid table/blob files referenced have the expected file
+ // size. For table files, their unique id table property matches the
+ // MANIFEST.
+ //
+ // Best-efforts recovery does not need a valid CURRENT file, and tries to
+ // recover the database using one of the available MANIFEST files in the db
+ // directory.
+ // Best-efforts recovery tries the available MANIFEST files from high file
+ // numbers (newer) to low file numbers (older), and stops after finding the
+ // first MANIFEST file from which the db can be recovered to a state without
+ // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
+ // files. It is possible that the database can be restored to an empty state
+ // with no table or blob files.
+ //
+ // Regardless of this option, the IDENTITY file
+ // is updated if needed during recovery to match the DB ID in the MANIFEST (if
+ // previously using write_dbid_to_manifest) or to be in some valid state
+ // (non-empty DB ID). Currently, not compatible with atomic flush.
+ // Furthermore, WAL files will not be used for recovery if
+ // best_efforts_recovery is true. Also requires either 1) LOCK file exists or
+ // 2) underlying env's LockFile() call returns ok even for non-existing LOCK
+ // file.
+ //
+ // Default: false
+ bool best_efforts_recovery = false;
+
+ // It defines how many times db resume is called by a separate thread when
+ // background retryable IO Error happens. When background retryable IO
+ // Error happens, SetBGError is called to deal with the error. If the error
+ // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
+ // then db resume is called in background to recover from the error. If this
+ // value is 0 or negative, db resume will not be called.
+ //
+ // Default: INT_MAX
+ int max_bgerror_resume_count = INT_MAX;
+
+ // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
+ // This option decides how long to wait to retry the next resume if the
+ // previous resume fails and the conditions for redoing the resume are
+ // satisfied.
+ //
+ // Default: 1000000 (microseconds).
+ uint64_t bgerror_resume_retry_interval = 1000000;
+
+ // Allows users to opt in to error messages containing corrupted
+ // keys/values. Corrupt keys and values will be logged in the
+ // messages/logs/status, providing users with useful information regarding
+ // the affected data. By default the value is set to false to prevent user
+ // data from being exposed in logs/messages etc.
+ //
+ // Default: false
+ bool allow_data_in_errors = false;
+
+ // A string identifying the machine hosting the DB. This
+ // will be written as a property in every SST file written by the DB (or
+ // by offline writers such as SstFileWriter and RepairDB). It can be useful
+ // for troubleshooting memory corruption caused by a failing host when
+ // writing a file, by tracing back to the writing host. These corruptions
+ // may not be caught by the checksum since they happen before checksumming.
+ // If left as default, the table writer will substitute it with the actual
+ // hostname when writing the SST file. If set to an empty string, the
+ // property will not be written to the SST file.
+ //
+ // Default: hostname
+ std::string db_host_id = kHostnameForDbHostId;
+
+ // Use this if your DB wants to enable checksum handoff for writes of
+ // specific file types. Make sure that the FileSystem you use supports
+ // crc32c checksum verification.
+ // Currently supported file types: kWALFile, kTableFile, kDescriptorFile.
+ // NOTE: currently RocksDB only generates a crc32c-based checksum for the
+ // handoff. If the storage layer has different checksum support, the user
+ // should leave this set empty. Otherwise, it may cause unexpected
+ // write failures.
+ FileTypeSet checksum_handoff_file_types;
+
+ // EXPERIMENTAL
+ // CompactionService is a feature that allows the user to run compactions
+ // on a different host or process, which offloads the background load from
+ // the primary host.
+ // It's an experimental feature; the interface will be changed without
+ // backward/forward compatibility support for now. Some known issues are
+ // still under development.
+ std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+ // Indicates the lowest cache tier we want to use for a certain DB.
+ // Currently we support volatile_tier and non_volatile_tier, which are
+ // layered. By setting it to kVolatileTier, only the block cache (the
+ // currently implemented volatile_tier) is used, so cache entries will not
+ // spill to the secondary cache (the currently implemented
+ // non_volatile_tier), and block cache lookup misses will not look up the
+ // secondary cache. When kNonVolatileBlockTier is used, we use both the
+ // block cache and the secondary cache.
+ //
+ // Default: kNonVolatileBlockTier
+ CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ // If set to false, when compaction or flush sees a SingleDelete followed by
+ // a Delete for the same user key, the compaction job will not fail.
+ // Otherwise, the compaction job will fail.
+ // This is a temporary option to help existing use cases migrate, and
+ // will be removed in a future release.
+ // Warning: do not set to false unless you are trying to migrate existing
+ // data in which the contract of single delete
+ // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced,
+ // i.e., data that has Delete mixed with SingleDelete for the same user key.
+ // Violating the contract leads to undefined behavior with a high
+ // possibility of data inconsistency, e.g., deleted old data becoming
+ // visible again.
+ bool enforce_single_del_contracts = true;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+ // Create an Options object with default values for all fields.
+ Options() : DBOptions(), ColumnFamilyOptions() {}
+
+ Options(const DBOptions& db_options,
+ const ColumnFamilyOptions& column_family_options)
+ : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+ // Change to some default settings from an older version.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
+ Options* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ void Dump(Logger* log) const;
+
+ void DumpCFOptions(Logger* log) const;
+
+ // Some functions that make it easier to optimize RocksDB
+
+ // Set appropriate parameters for bulk loading.
+ // The reason that this is a function that returns "this" instead of a
+ // constructor is to enable chaining of multiple similar calls in the future.
+ //
+ // All data will be in level 0 without any automatic compaction.
+ // It's recommended to manually call CompactRange(NULL, NULL) before reading
+ // from the database, because otherwise the read can be very slow.
+ Options* PrepareForBulkLoad();
+
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ Options* OptimizeForSmallDb();
+
+ // Disable some checks that should not be necessary in the absence of
+ // software logic errors or CPU+memory hardware errors. This can improve
+ // write speeds but is only recommended for temporary use. Does not
+ // change protection against corrupt storage (e.g. verify_checksums).
+ Options* DisableExtraChecks();
+};
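+
+// Illustrative sketch of chaining the helper functions above for a one-off
+// bulk load (the path is hypothetical):
+//
+//   Options options;
+//   options.PrepareForBulkLoad();
+//   DB* db = nullptr;
+//   Status s = DB::Open(options, "/path/to/db", &db);
+//   // ... load data ...
+//   // Recommended before reading:
+//   db->CompactRange(CompactRangeOptions(), nullptr, nullptr);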
+
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should only process data that ALREADY resides on a specified
+// cache level. For example, if an application specifies kBlockCacheTier then
+// the Get call will only process data that already resides in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+ kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
+ kBlockCacheTier = 0x1, // data in memtable or block cache
+ kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option
+ // will skip data in memtable.
+ // Note that this ReadTier currently only supports
+ // Get and MultiGet and does not support iterators.
+ kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators.
+};
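+
+// Illustrative sketch, assuming an open `DB* db`: probe only the memtable and
+// block cache, treating Status::Incomplete() as "not cached".
+//
+//   ReadOptions ro;
+//   ro.read_tier = kBlockCacheTier;
+//   std::string value;
+//   Status s = db->Get(ro, "key", &value);
+//   if (s.IsIncomplete()) { /* data not in memtable/block cache */ }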
+
+// Options that control read operations
+struct ReadOptions {
+ // If "snapshot" is non-nullptr, read as of the supplied snapshot
+ // (which must belong to the DB that is being read and which must
+ // not have been released). If "snapshot" is nullptr, use an implicit
+ // snapshot of the state at the beginning of this read operation.
+ // Default: nullptr
+ const Snapshot* snapshot;
+
+ // `iterate_lower_bound` defines the smallest key at which the backward
+ // iterator can return an entry. Once the bound is passed, Valid() will be
+ // false. `iterate_lower_bound` is inclusive, i.e., the bound value is a
+ // valid entry.
+ //
+ // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+ // need to have the same prefix. This is because ordering is not guaranteed
+ // outside of prefix domain.
+ //
+ // If user-defined timestamps are enabled, iterate_lower_bound should
+ // point to a key without the timestamp part.
+ // Default: nullptr
+ const Slice* iterate_lower_bound;
+
+ // "iterate_upper_bound" defines the extent up to which the forward iterator
+ // can return entries. Once the bound is reached, Valid() will be false.
+ // "iterate_upper_bound" is exclusive ie the bound value is
+ // not a valid entry. If prefix_extractor is not null:
+ // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+ // to infer whether prefix iterating (e.g. applying prefix bloom filter)
+ // can be used within RocksDB. This is done by comparing
+ // iterate_upper_bound with the seek key.
+ // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+ // effect if it shares the same prefix as the seek key. If
+ // iterate_upper_bound is outside the prefix of the seek key, then keys
+ // returned outside the prefix range will be undefined, just as if
+ // iterate_upper_bound = null.
+ // If iterate_upper_bound is not null, SeekToLast() will position the iterator
+ // at the first key smaller than iterate_upper_bound.
+ //
+ // If user-defined timestamps are enabled, iterate_upper_bound should
+ // point to a key without the timestamp part.
+ // Default: nullptr
+ const Slice* iterate_upper_bound;
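+
+ // Illustrative sketch of a bounded forward scan over ["k1", "k9"),
+ // assuming an open `DB* db` and that the bound slices outlive the iterator:
+ //
+ //   Slice lower("k1"), upper("k9");
+ //   ReadOptions ro;
+ //   ro.iterate_lower_bound = &lower;
+ //   ro.iterate_upper_bound = &upper;
+ //   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+ //   for (it->Seek(lower); it->Valid(); it->Next()) { /* ... */ }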
+
+ // RocksDB does auto-readahead for iterators on noticing more than two reads
+ // for a table file. The readahead starts at 8KB and doubles on every
+ // additional read up to 256KB.
+ // This option can help if most of the range scans are large, and if it is
+ // determined that a larger readahead than that enabled by auto-readahead is
+ // needed.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ // Default: 0
+ size_t readahead_size;
+
+ // A threshold for the number of keys that can be skipped before failing an
+ // iterator seek as incomplete. The default value of 0 should be used to
+ // never fail a request as incomplete, even on skipping too many keys.
+ // Default: 0
+ uint64_t max_skippable_internal_keys;
+
+ // Specify if this read request should process data that ALREADY
+ // resides on a particular cache. If the required data is not
+ // found at the specified cache, then Status::Incomplete is returned.
+ // Default: kReadAllTier
+ ReadTier read_tier;
+
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ // Default: true
+ bool verify_checksums;
+
+ // Should the "data block"/"index block" read for this iteration be placed in
+ // block cache?
+ // Callers may wish to set this field to false for bulk scans.
+ // This helps avoid changing the eviction order of existing items in the
+ // block cache.
+ // Default: true
+ bool fill_cache;
+
+ // Specify to create a tailing iterator -- a special iterator that has a
+ // view of the complete database (i.e. it can also be used to read newly
+ // added data) and is optimized for sequential reads. It will return records
+ // that were inserted into the database after the creation of the iterator.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool tailing;
+
+ // This option is no longer used. It previously turned on functionality that
+ // has since been removed.
+ bool managed;
+
+ // Enable a total order seek regardless of index format (e.g. hash index)
+ // used in the table. Some table format (e.g. plain table) may not support
+ // this option.
+ // If true when calling Get(), we also skip prefix bloom when reading from
+ // block based table, which only affects Get() performance.
+ // Default: false
+ bool total_order_seek;
+
+ // When true, total_order_seek = true is used by default, and RocksDB can
+ // selectively enable prefix seek mode if it won't generate a different
+ // result from total_order_seek, based on the seek key and iterator upper
+ // bound.
+ // Not supported in ROCKSDB_LITE mode: even with value true,
+ // prefix mode is not used.
+ // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+ // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+ // prefix of upper bound differs from prefix of seek key has a flaw.
+ // If present in the DB, "short keys" (shorter than "full length" prefix)
+ // can be omitted from auto_prefix_mode iteration when they would be present
+ // in total_order_seek iteration, regardless of whether the short keys are
+ // "in domain" of the prefix extractor. This is not an issue if no short
+ // keys are added to DB or are not expected to be returned by such
+ // iterators. (We are also assuming the new condition on
+ // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+ // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
+ // Default: false
+ bool auto_prefix_mode;
+
+ // Enforce that the iterator only iterates over the same prefix as the seek.
+ // This option is effective only for prefix seeks, i.e. prefix_extractor is
+ // non-null for the column family and total_order_seek is false. Unlike
+ // iterate_upper_bound, prefix_same_as_start only works within a prefix
+ // but in both directions.
+ // Default: false
+ bool prefix_same_as_start;
+
+ // Keep the blocks loaded by the iterator pinned in memory as long as the
+ // iterator is not deleted. If used when reading from tables created with
+ // BlockBasedTableOptions::use_delta_encoding = false, the
+ // iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
+ // return 1.
+ // Default: false
+ bool pin_data;
+
+ // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+ // schedule a background job in the flush job queue and delete obsolete files
+ // in background.
+ // Default: false
+ bool background_purge_on_iterator_cleanup;
+
+ // If true, range tombstones handling will be skipped in key lookup paths.
+ // For DB instances that don't use DeleteRange() calls, this setting can
+ // be used to optimize the read performance.
+ // Note that, if this assumption (of no previous DeleteRange() calls) is
+ // broken, stale keys could be served in read paths.
+ // Default: false
+ bool ignore_range_deletions;
+
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned. This option only affects Iterators and has
+ // no impact on point lookups.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
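+
+ // Illustrative sketch: skip tables with no entries during iteration.
+ //
+ //   ReadOptions ro;
+ //   ro.table_filter = [](const TableProperties& props) {
+ //     return props.num_entries > 0;  // false => table is not scanned
+ //   };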
+
+ // Timestamp of operation. Read should return the latest data visible to the
+ // specified timestamp. All timestamps of the same database must be of the
+ // same length and format. The user is responsible for providing a customized
+ // compare function via Comparator to order <key, timestamp> tuples.
+ // For iterator, iter_start_ts is the lower bound (older) and timestamp
+ // serves as the upper bound. Versions of the same record that fall in
+ // the timestamp range will be returned. If iter_start_ts is nullptr,
+ // only the most recent version visible to timestamp is returned.
+ // The user-specified timestamp feature is still under active development,
+ // and the API is subject to change.
+ // Default: nullptr
+ const Slice* timestamp;
+ const Slice* iter_start_ts;
+
+ // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+ // in microseconds.
+ // It should be set to microseconds since epoch, i.e, gettimeofday or
+ // equivalent plus allowed duration in microseconds. The best way is to use
+ // env->NowMicros() + some timeout.
+ // This is best effort. The call may exceed the deadline if there is IO
+ // involved and the file system doesn't support deadlines, or due to
+ // checking for the deadline periodically rather than for every key when
+ // processing a batch.
+ std::chrono::microseconds deadline;
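+
+ // Illustrative sketch, assuming an `Env* env`: allow roughly 10ms per call.
+ //
+ //   ReadOptions ro;
+ //   ro.deadline = std::chrono::microseconds(env->NowMicros() + 10000);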
+
+ // A timeout in microseconds to be passed to the underlying FileSystem for
+ // reads. As opposed to deadline, this determines the timeout for each
+ // individual file read request. If a MultiGet/Get/Seek/Next etc call
+ // results in multiple reads, each read can last up to io_timeout us.
+ std::chrono::microseconds io_timeout;
+
+ // It limits the maximum cumulative value size of the keys in batch while
+ // reading through MultiGet. Once the cumulative value size exceeds this
+ // soft limit then all the remaining keys are returned with status Aborted.
+ //
+ // Default: std::numeric_limits<uint64_t>::max()
+ uint64_t value_size_soft_limit;
+
+ // For iterators, RocksDB does auto-readahead on noticing more than two
+ // sequential reads for a table file if the user doesn't provide
+ // readahead_size. The readahead starts at 8KB and doubles on every
+ // additional read up to max_auto_readahead_size, as long as reads are
+ // sequential. However, at each level, if the iterator moves over to the
+ // next file, readahead_size starts again from 8KB.
+ //
+ // By enabling this option, RocksDB will do some enhancements for
+ // prefetching the data.
+ //
+ // Default: false
+ bool adaptive_readahead;
+
+ // For file reads associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // The rate limiting is bypassed no matter this option's value for file reads
+ // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
+ // is a `PlainTableFactory`) and cuckoo tables (these can exist when
+ // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
+ //
+ // The bytes charged to rate limiter may not exactly match the file read bytes
+ // since there are some seemingly insignificant reads, like for file
+ // headers/footers, that we currently do not charge to rate limiter.
+ //
+ // Default: `Env::IO_TOTAL`.
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
+ // Experimental
+ //
+ // If async_io is enabled, RocksDB will prefetch some data asynchronously.
+ // RocksDB applies it when reads are sequential, as part of its internal
+ // automatic prefetching.
+ //
+ // Default: false
+ bool async_io;
+
+ // Experimental
+ //
+ // If async_io is set, then this flag controls whether we read SST files
+ // in multiple levels asynchronously. Enabling this flag can help reduce
+ // MultiGet latency by maximizing the number of SST files read in
+ // parallel if the keys in the MultiGet batch are in different levels. It
+ // comes at the expense of slightly higher CPU overhead.
+ //
+ // Default: true
+ bool optimize_multiget_for_io;
+
+ ReadOptions();
+ ReadOptions(bool cksum, bool cache);
+};
+
+// Options that control write operations
+struct WriteOptions {
+ // If true, the write will be flushed from the operating system
+ // buffer cache (by calling WritableFile::Sync()) before the write
+ // is considered complete. If this flag is true, writes will be
+ // slower.
+ //
+ // If this flag is false, and the machine crashes, some recent
+ // writes may be lost. Note that if it is just the process that
+ // crashes (i.e., the machine does not reboot), no writes will be
+ // lost even if sync==false.
+ //
+ // In other words, a DB write with sync==false has similar
+ // crash semantics as the "write()" system call. A DB write
+ // with sync==true has similar crash semantics to a "write()"
+ // system call followed by "fdatasync()".
+ //
+ // Default: false
+ bool sync;
+
+ // If true, writes will not first go to the write ahead log,
+ // and the write may get lost after a crash. The backup engine
+ // relies on write-ahead logs to back up the memtable, so if
+ // you disable write-ahead logs, you must create backups with
+ // flush_before_backup=true to avoid losing unflushed memtable data.
+ // Default: false
+ bool disableWAL;
+
+ // If true, and the user is trying to write to column families that don't
+ // exist (they were dropped), ignore the write (don't return an error). If
+ // there are multiple writes in a WriteBatch, the other writes will succeed.
+ // Default: false
+
+ // If true, and the write request would need to wait or sleep, it fails
+ // immediately with Status::Incomplete().
+ // Default: false
+
+ // If true, this write request is of lower priority if compaction is
+ // behind. In that case, if no_slowdown = true, the request will be canceled
+ // immediately with Status::Incomplete() returned. Otherwise, it will be
+ // slowed down. The slowdown value is determined by RocksDB to guarantee
+ // it introduces minimum impact on high priority writes.
+ //
+ // Default: false
+ bool low_pri;
+
+ // If true, this writebatch will maintain the last insert positions of each
+ // memtable as hints in concurrent write. It can improve write performance
+ // in concurrent writes if keys in one writebatch are sequential. In
+ // non-concurrent writes (when concurrent_memtable_writes is false) this
+ // option will be ignored.
+ //
+ // Default: false
+ bool memtable_insert_hint_per_batch;
+
+ // For writes associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // Currently the support covers automatic WAL flushes, which happen during
+ // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+ // when `WriteOptions::disableWAL == false`
+ // and `DBOptions::manual_wal_flush == false`.
+ //
+ // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+ // due to implementation constraints.
+ //
+ // Default: `Env::IO_TOTAL`
+ Env::IOPriority rate_limiter_priority;
+
+ // `protection_bytes_per_key` is the number of bytes used to store
+ // protection information for each key entry. Currently supported values are
+ // zero (disabled) and eight.
+ //
+ // Default: zero (disabled).
+ size_t protection_bytes_per_key;
+
+ WriteOptions()
+ : sync(false),
+ disableWAL(false),
+ ignore_missing_column_families(false),
+ no_slowdown(false),
+ low_pri(false),
+ memtable_insert_hint_per_batch(false),
+ rate_limiter_priority(Env::IO_TOTAL),
+ protection_bytes_per_key(0) {}
+};
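+
+// Illustrative sketch, assuming an open `DB* db`: a durable write versus a
+// fast, WAL-less write.
+//
+//   WriteOptions durable;
+//   durable.sync = true;     // survives machine crashes
+//   db->Put(durable, "k1", "v1");
+//
+//   WriteOptions fast;
+//   fast.disableWAL = true;  // may lose recent writes on a crash
+//   db->Put(fast, "k2", "v2");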
+
+// Options that control flush operations
+struct FlushOptions {
+ // If true, the call will wait until the flush is done.
+ // Default: true
+ bool wait;
+ // If true, the flush will proceed immediately even if it means writes will
+ // stall for the duration of the flush; if false the operation will wait
+ // until it's possible to do the flush without causing a stall, or until the
+ // required flush is performed by someone else (foreground call or background
+ // thread).
+ // Default: false
+ bool allow_write_stall;
+ FlushOptions() : wait(true), allow_write_stall(false) {}
+};
+
+// Create a Logger from provided DBOptions
+extern Status CreateLoggerFromOptions(const std::string& dbname,
+ const DBOptions& options,
+ std::shared_ptr<Logger>* logger);
+
+// CompactionOptions are used in CompactFiles() call.
+struct CompactionOptions {
+ // Compaction output compression type
+ // Default: snappy
+ // If set to `kDisableCompressionOption`, RocksDB will choose compression type
+ // according to the `ColumnFamilyOptions`, taking into account the output
+ // level if `compression_per_level` is specified.
+ CompressionType compression;
+ // Compaction will create files of size `output_file_size_limit`.
+ // Default: MAX, which means that compaction will create a single file
+ uint64_t output_file_size_limit;
+ // If > 0, it will replace the option in the DBOptions for this compaction.
+ uint32_t max_subcompactions;
+
+ CompactionOptions()
+ : compression(kSnappyCompression),
+ output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+ max_subcompactions(0) {}
+};
+
+// For level based compaction, we can configure if we want to skip/force
+// bottommost level compaction.
+enum class BottommostLevelCompaction {
+ // Skip bottommost level compaction
+ kSkip,
+ // Only compact bottommost level if there is a compaction filter
+ // This is the default option
+ kIfHaveCompactionFilter,
+ // Always compact bottommost level
+ kForce,
+ // Always compact bottommost level but in bottommost level avoid
+ // double-compacting files created in the same compaction
+ kForceOptimized,
+};
+
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+ // Force blob file garbage collection.
+ kForce,
+ // Skip blob file garbage collection.
+ kDisable,
+ // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+ kUseDefault,
+};
+
+// CompactRangeOptions is used by CompactRange() call.
+struct CompactRangeOptions {
+ // If true, no other compaction will run at the same time as this
+ // manual compaction.
+ //
+ // Default: false
+ bool exclusive_manual_compaction = false;
+
+ // If true, compacted files will be moved to the minimum level capable
+ // of holding the data or given level (specified non-negative target_level).
+ bool change_level = false;
+ // If change_level is true and target_level has a non-negative value,
+ // compacted files will be moved to target_level.
+ int target_level = -1;
+ // Compaction outputs will be placed in options.db_paths[target_path_id].
+ // Behavior is undefined if target_path_id is out of range.
+ uint32_t target_path_id = 0;
+ // By default level based compaction will only compact the bottommost level
+ // if there is a compaction filter
+ BottommostLevelCompaction bottommost_level_compaction =
+ BottommostLevelCompaction::kIfHaveCompactionFilter;
+ // If true, will execute immediately even if doing so would cause the DB to
+ // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+ bool allow_write_stall = false;
+ // If > 0, it will replace the option in the DBOptions for this compaction.
+ uint32_t max_subcompactions = 0;
+ // Set the user-defined timestamp low bound; data with a timestamp older
+ // than the low bound may be GCed by compaction. Default: nullptr
+
+ // Allows cancellation of an in-progress manual compaction.
+ //
+ // Cancellation can be delayed waiting on automatic compactions when used
+ // together with `exclusive_manual_compaction == true`.
+ std::atomic<bool>* canceled = nullptr;
+ // NOTE: Calling DisableManualCompaction() overwrites the user-provided
+ // canceled variable in CompactRangeOptions.
+ // Typically, when CompactRange is being called in one thread (t1) with
+ // canceled = false, and DisableManualCompaction is being called in the
+ // other thread (t2), manual compaction is disabled normally, even though
+ // the compaction iterator may still scan a few items before *canceled is
+ // set to true.
+
+ // If set to kForce, RocksDB will override enable_blob_file_garbage_collection
+ // to true; if set to kDisable, RocksDB will override it to false, and
+ // kUseDefault leaves the setting in effect. This enables customers to both
+ // force-enable and force-disable GC when calling CompactRange.
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault;
+
+ // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+ // from ColumnFamilyOptions in effect. Otherwise, it will override the
+ // user-provided setting. This enables customers to selectively override the
+ // age cutoff.
+ double blob_garbage_collection_age_cutoff = -1;
+};
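+
+// Illustrative sketch, assuming an open `DB* db`: compact the whole key space
+// and move the results to the minimum level that can hold the data.
+//
+//   CompactRangeOptions cro;
+//   cro.change_level = true;  // target_level stays -1: minimum level
+//   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+//   Status s = db->CompactRange(cro, nullptr, nullptr);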
+
+// IngestExternalFileOptions is used by IngestExternalFile()
+struct IngestExternalFileOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+ // If set to true, ingestion falls back to copy when move fails.
+ bool failed_move_fall_back_to_copy = true;
+ // If set to false, keys of an ingested file could appear in existing
+ // snapshots that were created before the file was ingested.
+ bool snapshot_consistency = true;
+ // If set to false, IngestExternalFile() will fail if the file key range
+ // overlaps with existing keys or tombstones in the DB.
+ bool allow_global_seqno = true;
+ // If set to false and the file key range overlaps with the memtable key range
+ // (memtable flush required), IngestExternalFile will fail.
+ bool allow_blocking_flush = true;
+ // Set to true if you would like duplicate keys in the file being ingested
+ // to be skipped rather than overwriting existing data under that key.
+ // Use case: back-fill of some historical data in the database without
+ // over-writing existing newer version of data.
+ // This option could only be used if the DB has been running
+ // with allow_ingest_behind=true since the dawn of time.
+ // All files will be ingested at the bottommost level with seqno=0.
+ bool ingest_behind = false;
+ // Set to true if you would like to write the global_seqno to a given offset
+ // in the external SST file for backward compatibility. Older versions of
+ // RocksDB write a global_seqno to a given offset within ingested SST files,
+ // and newer versions of RocksDB do not. If you ingest an external SST using
+ // a new version of RocksDB and would like to be able to downgrade to an
+ // older version of RocksDB, you should set 'write_global_seqno' to true. If
+ // your service is just starting to use the new RocksDB, we recommend that
+ // you set this option to false, which brings two benefits:
+ // 1. No extra random write for the global_seqno during ingestion.
+ // 2. Since the external SST file is not modified, its checksum remains
+ //    verifiable.
+ // We have a plan to set this option to false by default in the future.
+ bool write_global_seqno = true;
+ // Set to true if you would like to verify the checksums of each block of the
+ // external SST file before ingestion.
+ // Warning: setting this to true causes slowdown in file ingestion because
+ // the external SST file has to be read.
+ bool verify_checksums_before_ingest = false;
+ // When verify_checksums_before_ingest = true, RocksDB uses default
+ // readahead setting to scan the file while verifying checksums before
+ // ingestion.
+ // Users can override the default value using this option.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ size_t verify_checksums_readahead_size = 0;
+ // Set to TRUE if the user wants to verify the SST file checksums of
+ // ingested files. The DB checksum function will generate the checksum of
+ // each ingested file (if file_checksum_gen_factory is set) and compare the
+ // checksum function name and checksum with the ingested checksum
+ // information.
+ //
+ // If this option is set to TRUE: 1) if the DB does not enable checksums
+ // (file_checksum_gen_factory == nullptr), the ingested checksum information
+ // will be ignored; 2) if the DB enables the checksum function, we calculate
+ // the SST file checksum after the file is moved or copied and compare the
+ // checksum and checksum name. If the checksum or checksum function name
+ // does not match, ingestion fails. If the verification is successful, the
+ // checksum and checksum function name are stored in the MANIFEST.
+ // If this option is set to FALSE: 1) if the DB does not enable checksums,
+ // the ingested checksum information will be ignored; 2) if the DB enables
+ // checksums, we only verify the ingested checksum function name and trust
+ // the ingested checksum. If the checksum function name matches, we store
+ // the checksum in the MANIFEST; the DB does not calculate the checksum
+ // during ingestion. However, if no checksum information is provided with
+ // the ingested files, the DB will generate the checksum and store it in
+ // the MANIFEST.
+ bool verify_file_checksum = true;
+ // Set to TRUE if the user wants the file to be ingested at the bottommost
+ // level. An error of Status::TryAgain() will be returned if the file cannot
+ // fit in the bottommost level when calling
+ // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+ // the bottommost level in the overlapping range before re-attempting.
+ //
+ // ingest_behind takes precedence over fail_if_not_bottommost_level.
+ bool fail_if_not_bottommost_level = false;
+};
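+
+// Illustrative sketch, assuming an open `DB* db` and an SST file produced by
+// SstFileWriter (the path is hypothetical):
+//
+//   IngestExternalFileOptions ifo;
+//   ifo.move_files = true;
+//   ifo.verify_checksums_before_ingest = true;
+//   Status s = db->IngestExternalFile({"/path/file1.sst"}, ifo);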
+
+enum TraceFilterType : uint64_t {
+ // Trace all the operations
+ kTraceFilterNone = 0x0,
+ // Do not trace the get operations
+ kTraceFilterGet = 0x1 << 0,
+ // Do not trace the write operations
+ kTraceFilterWrite = 0x1 << 1,
+ // Do not trace the `Iterator::Seek()` operations
+ kTraceFilterIteratorSeek = 0x1 << 2,
+ // Do not trace the `Iterator::SeekForPrev()` operations
+ kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+ // Do not trace the `MultiGet()` operations
+ kTraceFilterMultiGet = 0x1 << 4,
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+ // To keep the trace file from growing larger than the available storage
+ // space, the user can set the max trace file size in bytes. Default is 64GB.
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+ // Specify trace sampling option, i.e. capture one per how many requests.
+ // Default to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+ // Note: The filtering happens before sampling.
+ uint64_t filter = kTraceFilterNone;
+ // When true, the order of write records in the trace will match the order of
+ // the corresponding write records in the WAL and applied to the DB. There may
+ // be a performance penalty associated with preserving this ordering.
+ //
+ // Default: false. This means write records in the trace may be in an order
+ // different from the WAL's order.
+ bool preserve_write_order = false;
+};
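+
+// Illustrative sketch, assuming an open `DB* db` and a TraceWriter `writer`
+// (e.g. from NewFileTraceWriter()): trace everything except Get()s, sampling
+// one in ten requests.
+//
+//   TraceOptions to;
+//   to.filter = kTraceFilterGet;
+//   to.sampling_frequency = 10;
+//   Status s = db->StartTrace(to, std::move(writer));
+//   // ... run the workload ...
+//   s = db->EndTrace();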
+
+// ImportColumnFamilyOptions is used by ImportColumnFamily()
+struct ImportColumnFamilyOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+};
+
+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+ // Defines whether the returned size should include the recently written
+ // data in the memtables. If set to false, include_files must be true.
+ bool include_memtables = false;
+ // Defines whether the returned size should include data serialized to disk.
+ // If set to false, include_memtables must be true.
+ bool include_files = true;
+ // When approximating the total size of the files used to store a key range
+ // via DB::GetApproximateSizes, allow approximation with an error margin of
+ // up to total_files_size * files_size_error_margin. This allows taking some
+ // shortcuts in file size approximation, resulting in better performance,
+ // while guaranteeing the resulting error is within a reasonable margin.
+ // E.g., if the value is 0.1, then the error margin of the returned file
+ // size approximation will be within 10%.
+ // If the value is non-positive, a more precise yet more CPU-intensive
+ // estimation is performed.
+ double files_size_error_margin = -1.0;
+};
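+
+// Illustrative sketch, assuming an open `DB* db`: approximate the size of one
+// key range, allowing a 10% error margin.
+//
+//   SizeApproximationOptions sao;
+//   sao.include_memtables = true;
+//   sao.files_size_error_margin = 0.1;
+//   Range r("a", "z");
+//   uint64_t size = 0;
+//   Status s = db->GetApproximateSizes(sao, db->DefaultColumnFamily(),
+//                                      &r, 1, &size);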
+
+struct CompactionServiceOptionsOverride {
+ // Currently, pointer configurations are not passed to the compaction
+ // service, so the user needs to set them here. This will be removed once
+ // pointer configuration passing is supported.
+ Env* env = Env::Default();
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ const Comparator* comparator = BytewiseComparator();
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+ const CompactionFilter* compaction_filter = nullptr;
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+ std::shared_ptr<TableFactory> table_factory;
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Only a subset of events is triggered on the remote compaction worker,
+ // e.g.: `OnTableFileCreated`, `OnTableFileCreationStarted`,
+ // `ShouldBeNotifiedOnFileIO`, `OnSubcompactionBegin`, and
+ // `OnSubcompactionCompleted`. Worth mentioning: `OnCompactionBegin` and
+ // `OnCompactionCompleted` won't be triggered here; they will be triggered
+ // on the primary DB side.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // statistics is used to collect DB operation metrics. The metrics won't be
+ // returned to the CompactionService primary host; to collect them, the user
+ // needs to set this here.
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // Only compaction-generated SST files use these user-defined table
+ // properties collectors.
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+ // Allows cancellation of an in-progress compaction.
+ std::atomic<bool>* canceled = nullptr;
+};
+
+#ifndef ROCKSDB_LITE
+struct LiveFilesStorageInfoOptions {
+ // Whether to populate FileStorageInfo::file_checksum* or leave blank
+ bool include_checksum_info = false;
+ // Flushes memtables if total size in bytes of live WAL files is >= this
+ // number (and DB is not read-only).
+ // Default: always force a flush without checking sizes.
+ uint64_t wal_size_for_flush = 0;
+};
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
new file mode 100644
index 000000000..cd1dd99f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A thread-local context for gathering performance counters efficiently
+// and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+// Break down performance counters by level and store per-level perf context in
+// PerfContextByLevel
+struct PerfContextByLevel {
+ // # of times bloom filter has avoided file reads, i.e., negatives.
+ uint64_t bloom_filter_useful = 0;
+ // # of times bloom FullFilter has not avoided the reads.
+ uint64_t bloom_filter_full_positive = 0;
+ // # of times bloom FullFilter has not avoided the reads and data actually
+ // exist.
+ uint64_t bloom_filter_full_true_positive = 0;
+
+ // total number of user keys returned (only includes keys that are found;
+ // does not include keys that are deleted or merged without a final Put)
+ uint64_t user_key_return_count = 0;
+
+ // total nanos spent on reading data from SST files
+ uint64_t get_from_table_nanos = 0;
+
+ uint64_t block_cache_hit_count = 0; // total number of block cache hits
+ uint64_t block_cache_miss_count = 0; // total number of block cache misses
+
+ void Reset(); // reset all performance counters to zero
+};
+
+struct PerfContext {
+ ~PerfContext();
+
+ PerfContext() {}
+
+ PerfContext(const PerfContext&);
+ PerfContext& operator=(const PerfContext&);
+ PerfContext(PerfContext&&) noexcept;
+
+ void Reset(); // reset all performance counters to zero
+
+ std::string ToString(bool exclude_zero_counters = false) const;
+
+ // enable per level perf context and allocate storage for PerfContextByLevel
+ void EnablePerLevelPerfContext();
+
+ // temporarily disable per level perf context by setting the flag to false
+ void DisablePerLevelPerfContext();
+
+ // free the space for PerfContextByLevel, also disable per level perf context
+ void ClearPerLevelPerfContext();
+
+ uint64_t user_key_comparison_count; // total number of user key comparisons
+ uint64_t block_cache_hit_count; // total number of block cache hits
+ uint64_t block_read_count; // total number of block reads (with IO)
+ uint64_t block_read_byte; // total number of bytes from block reads
+ uint64_t block_read_time; // total nanos spent on block reads
+ uint64_t block_cache_index_hit_count; // total number of index block hits
+ // total number of standalone handles lookup from secondary cache
+ uint64_t block_cache_standalone_handle_count;
+ // total number of real handles lookup from secondary cache that are inserted
+ // into primary cache
+ uint64_t block_cache_real_handle_count;
+ uint64_t index_block_read_count; // total number of index block reads
+ uint64_t block_cache_filter_hit_count; // total number of filter block hits
+ uint64_t filter_block_read_count; // total number of filter block reads
+ uint64_t compression_dict_block_read_count; // total number of compression
+ // dictionary block reads
+
+ uint64_t secondary_cache_hit_count; // total number of secondary cache hits
+ // total number of real handles inserted into secondary cache
+ uint64_t compressed_sec_cache_insert_real_count;
+ // total number of dummy handles inserted into secondary cache
+ uint64_t compressed_sec_cache_insert_dummy_count;
+ // bytes for vals before compression in secondary cache
+ uint64_t compressed_sec_cache_uncompressed_bytes;
+ // bytes for vals after compression in secondary cache
+ uint64_t compressed_sec_cache_compressed_bytes;
+
+ uint64_t block_checksum_time; // total nanos spent on block checksum
+ uint64_t block_decompress_time; // total nanos spent on block decompression
+
+ uint64_t get_read_bytes; // bytes for vals returned by Get
+ uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet
+ uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator
+
+ uint64_t blob_cache_hit_count; // total number of blob cache hits
+ uint64_t blob_read_count; // total number of blob reads (with IO)
+ uint64_t blob_read_byte; // total number of bytes from blob reads
+ uint64_t blob_read_time; // total nanos spent on blob reads
+ uint64_t blob_checksum_time; // total nanos spent on blob checksum
+ uint64_t blob_decompress_time; // total nanos spent on blob decompression
+
+ // total number of internal keys skipped over during iteration.
+ // There are several reasons for this:
+ // 1. when calling Next(), the iterator is at the position of the previous
+ //    key, so we need to skip it. This means the counter will always be
+ //    incremented in Next().
+ // 2. when calling Next(), we may need to skip internal entries for the
+ //    previous keys that are overwritten.
+ // 3. when calling Next(), Seek() or SeekToFirst(), there may be one or more
+ //    deleted keys between the previous position (the previous key before
+ //    calling Next(), the seek key in Seek(), or the beginning for
+ //    SeekToFirst()) and the next valid key that the operation should place
+ //    the iterator at. We need to skip both the tombstones and the updates
+ //    hidden by the tombstones. The tombstones are not included in this
+ //    counter, while previous updates hidden by the tombstones are included
+ //    here.
+ // 4. symmetric cases for Prev() and SeekToLast()
+ // internal_recent_skipped_count is not included in this counter.
+ //
+ uint64_t internal_key_skipped_count;
+ // Total number of deletes and single deletes skipped over during iteration.
+ // When calling Next(), Seek() or SeekToFirst(), there may be one or more
+ // deleted keys between the previous position (the previous position before
+ // calling Next(), the seek key in Seek(), or the beginning for
+ // SeekToFirst()) and the next valid key. Every deleted key is counted once.
+ // We don't recount here if there are still older updates invalidated by the
+ // tombstones.
+ //
+ uint64_t internal_delete_skipped_count;
+ // How many times iterators skipped over internal keys that are more recent
+ // than the snapshot that iterator is using.
+ //
+ uint64_t internal_recent_skipped_count;
+ // How many values were fed into merge operator by iterators.
+ //
+ uint64_t internal_merge_count;
+ // Number of times we reseeked inside a merging iterator, specifically to skip
+ // after or before a range of keys covered by a range deletion in a newer LSM
+ // component.
+ uint64_t internal_range_del_reseek_count;
+
+ uint64_t get_snapshot_time; // total nanos spent on getting snapshot
+ uint64_t get_from_memtable_time; // total nanos spent on querying memtables
+ uint64_t get_from_memtable_count; // number of mem tables queried
+ // total nanos spent after Get() finds a key
+ uint64_t get_post_process_time;
+ uint64_t get_from_output_files_time; // total nanos reading from output files
+ // total nanos spent on seeking memtable
+ uint64_t seek_on_memtable_time;
+ // number of seeks issued on memtable
+ // (including SeekForPrev but not SeekToFirst and SeekToLast)
+ uint64_t seek_on_memtable_count;
+ // number of Next()s issued on memtable
+ uint64_t next_on_memtable_count;
+ // number of Prev()s issued on memtable
+ uint64_t prev_on_memtable_count;
+ // total nanos spent on seeking child iters
+ uint64_t seek_child_seek_time;
+ // number of seek issued in child iterators
+ uint64_t seek_child_seek_count;
+ uint64_t seek_min_heap_time; // total nanos spent on the merge min heap
+ uint64_t seek_max_heap_time; // total nanos spent on the merge max heap
+ // total nanos spent on seeking the internal entries
+ uint64_t seek_internal_seek_time;
+ // total nanos spent on iterating internal entries to find the next user entry
+ uint64_t find_next_user_entry_time;
+
+ // This group of stats provide a breakdown of time spent by Write().
+ // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
+ // are enabled.
+ //
+ // total nanos spent on writing to WAL
+ uint64_t write_wal_time;
+ // total nanos spent on writing to mem tables
+ uint64_t write_memtable_time;
+ // total nanos spent on delaying or throttling write
+ uint64_t write_delay_time;
+ // total nanos spent on switching memtable/wal and scheduling
+ // flushes/compactions.
+ uint64_t write_scheduling_flushes_compactions_time;
+ // total nanos spent on writing a record, excluding the above four things
+ uint64_t write_pre_and_post_process_time;
+
+ // time spent waiting for other threads of the batch group
+ uint64_t write_thread_wait_nanos;
+
+ // time spent on acquiring DB mutex.
+ uint64_t db_mutex_lock_nanos;
+ // Time spent on waiting with a condition variable created with DB mutex.
+ uint64_t db_condition_wait_nanos;
+ // Time spent on merge operator.
+ uint64_t merge_operator_time_nanos;
+
+ // Time spent on reading index block from block cache or SST file
+ uint64_t read_index_block_nanos;
+ // Time spent on reading filter block from block cache or SST file
+ uint64_t read_filter_block_nanos;
+ // Time spent on creating data block iterator
+ uint64_t new_table_block_iter_nanos;
+ // Time spent on creating an iterator of an SST file.
+ uint64_t new_table_iterator_nanos;
+ // Time spent on seeking a key in data/index blocks
+ uint64_t block_seek_nanos;
+ // Time spent on finding or creating a table reader
+ uint64_t find_table_nanos;
+ // total number of mem table bloom hits
+ uint64_t bloom_memtable_hit_count;
+ // total number of mem table bloom misses
+ uint64_t bloom_memtable_miss_count;
+ // total number of SST table bloom hits
+ uint64_t bloom_sst_hit_count;
+ // total number of SST table bloom misses
+ uint64_t bloom_sst_miss_count;
+
+ // Time spent waiting on key locks in transaction lock manager.
+ uint64_t key_lock_wait_time;
+ // number of times acquiring a lock was blocked by another transaction.
+ uint64_t key_lock_wait_count;
+
+ // Total time spent in Env filesystem operations. These are only populated
+ // when TimedEnv is used.
+ uint64_t env_new_sequential_file_nanos;
+ uint64_t env_new_random_access_file_nanos;
+ uint64_t env_new_writable_file_nanos;
+ uint64_t env_reuse_writable_file_nanos;
+ uint64_t env_new_random_rw_file_nanos;
+ uint64_t env_new_directory_nanos;
+ uint64_t env_file_exists_nanos;
+ uint64_t env_get_children_nanos;
+ uint64_t env_get_children_file_attributes_nanos;
+ uint64_t env_delete_file_nanos;
+ uint64_t env_create_dir_nanos;
+ uint64_t env_create_dir_if_missing_nanos;
+ uint64_t env_delete_dir_nanos;
+ uint64_t env_get_file_size_nanos;
+ uint64_t env_get_file_modification_time_nanos;
+ uint64_t env_rename_file_nanos;
+ uint64_t env_link_file_nanos;
+ uint64_t env_lock_file_nanos;
+ uint64_t env_unlock_file_nanos;
+ uint64_t env_new_logger_nanos;
+
+ uint64_t get_cpu_nanos;
+ uint64_t iter_next_cpu_nanos;
+ uint64_t iter_prev_cpu_nanos;
+ uint64_t iter_seek_cpu_nanos;
+
+ // Time spent in encrypting data. Populated when EncryptedEnv is used.
+ uint64_t encrypt_data_nanos;
+ // Time spent in decrypting data. Populated when EncryptedEnv is used.
+ uint64_t decrypt_data_nanos;
+
+ uint64_t number_async_seek;
+
+ std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
+ bool per_level_perf_context_enabled = false;
+};
+
+// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global,
+// non-thread-local PerfContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be a no-op.
+// Otherwise,
+// a) if thread-local is supported on the platform, then a pointer to
+// a thread-local PerfContext object will be returned.
+// b) if thread-local is NOT supported, then compilation will fail.
+//
+// This function never returns nullptr.
+PerfContext* get_perf_context();
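+
+// Illustrative sketch, assuming an open `DB* db`: measure a single Get().
+//
+//   SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+//   get_perf_context()->Reset();
+//   std::string value;
+//   db->Get(ReadOptions(), "key", &value);
+//   std::string report = get_perf_context()->ToString();
+//   SetPerfLevel(PerfLevel::kDisable);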
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h
new file mode 100644
index 000000000..e7dded0e3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_level.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// How much perf stats to collect. Affects perf_context and iostats_context.
+enum PerfLevel : unsigned char {
+ kUninitialized = 0, // unknown setting
+ kDisable = 1, // disable perf stats
+ kEnableCount = 2, // enable only count stats
+ kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time
+ // stats except for mutexes
+ // Other than time, also measure CPU time counters. Still don't measure
+ // time (neither wall time nor CPU time) for mutexes.
+ kEnableTimeAndCPUTimeExceptForMutex = 4,
+ kEnableTime = 5, // enable count and time stats
+ kOutOfBounds = 6 // N.B. Must always be the last value!
+};
+
+// set the perf stats level for current thread
+void SetPerfLevel(PerfLevel level);
+
+// get current perf stats level for current thread
+PerfLevel GetPerfLevel();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h
new file mode 100644
index 000000000..f14f01999
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/persistent_cache.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCache
+//
+// Persistent cache interface for caching IO pages on a persistent medium. The
+// cache interface is specifically designed for a persistent read cache.
+class PersistentCache {
+ public:
+ using StatsType = std::vector<std::map<std::string, double>>;
+
+ virtual ~PersistentCache() {}
+
+ // Insert to page cache
+ //
+ // key    Identifier to identify a page uniquely across restarts
+ // data   Page data to copy (caller retains ownership)
+ // size   Size of the page
+ virtual Status Insert(const Slice& key, const char* data,
+ const size_t size) = 0;
+
+ // Lookup page cache by page identifier
+ //
+ // key    Page identifier
+ // data   Buffer where the data should be copied
+ // size Size of the page
+ virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) = 0;
+
+ // True if the cache is configured to store serialized blocks, which are
+ // potentially compressed and include a trailer (when SST format calls for
+ // one). False if the cache stores uncompressed blocks (no trailer).
+ virtual bool IsCompressed() = 0;
+
+ // Return stats as map of {string, double} per-tier
+ //
+ // Persistent cache can be initialized as a tier of caches. The stats are per
+ // tire top-down
+ virtual StatsType Stats() = 0;
+
+ virtual std::string GetPrintableOptions() const = 0;
+
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharding the same persistent cache to partition the key space. Typically
+ // the client will allocate a new id at startup and prepend the id to its
+ // cache keys.
+ virtual uint64_t NewId() = 0;
+};
+
+// Factory method to create a new persistent cache
+Status NewPersistentCache(Env* const env, const std::string& path,
+ const uint64_t size,
+ const std::shared_ptr<Logger>& log,
+ const bool optimized_for_nvm,
+ std::shared_ptr<PersistentCache>* cache);
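+
+// Illustrative sketch, assuming a `std::shared_ptr<Logger> logger` and that
+// BlockBasedTableOptions exposes a `persistent_cache` field (the path is
+// hypothetical):
+//
+//   std::shared_ptr<PersistentCache> pcache;
+//   Status s = NewPersistentCache(Env::Default(), "/mnt/nvme/rocksdb_cache",
+//                                 uint64_t{64} << 30 /* 64GB */, logger,
+//                                 true /* optimized_for_nvm */, &pcache);
+//   BlockBasedTableOptions bbto;
+//   bbto.persistent_cache = pcache;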
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
new file mode 100644
index 000000000..9cad6edf4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class RateLimiter {
+ public:
+ enum class OpType {
+ kRead,
+ kWrite,
+ };
+
+ enum class Mode {
+ kReadsOnly,
+ kWritesOnly,
+ kAllIo,
+ };
+
+ // For API compatibility, default to rate-limiting writes only.
+ explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {}
+
+ virtual ~RateLimiter() {}
+
+ // This API allows the user to dynamically change the rate limiter's bytes
+ // per second.
+ // REQUIRED: bytes_per_second > 0
+ virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+ // Deprecated. New RateLimiter derived classes should override
+ // Request(const int64_t, const Env::IOPriority, Statistics*) or
+ // Request(const int64_t, const Env::IOPriority, Statistics*, OpType)
+ // instead.
+ //
+ // Request for token for bytes. If this request can not be satisfied, the call
+ // is blocked. Caller is responsible to make sure
+ // bytes <= GetSingleBurstBytes()
+ // and bytes >= 0.
+ virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
+ assert(false);
+ }
+
+ // Request for token for bytes and potentially update statistics. If this
+ // request can not be satisfied, the call is blocked. Caller is responsible to
+ // make sure bytes <= GetSingleBurstBytes()
+ // and bytes >= 0.
+ virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* /* stats */) {
+ // For API compatibility, default implementation calls the older API in
+ // which statistics are unsupported.
+ Request(bytes, pri);
+ }
+
+ // Requests token to read or write bytes and potentially updates statistics.
+ //
+ // If this request can not be satisfied, the call is blocked. Caller is
+ // responsible to make sure bytes <= GetSingleBurstBytes()
+ // and bytes >= 0.
+ virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* stats, OpType op_type) {
+ if (IsRateLimited(op_type)) {
+ Request(bytes, pri, stats);
+ }
+ }
+
+ // Requests token to read or write bytes and potentially updates statistics.
+ // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of
+ // direct I/O) to allocate an appropriate number of bytes, which may be less
+ // than the number of bytes requested.
+ virtual size_t RequestToken(size_t bytes, size_t alignment,
+ Env::IOPriority io_priority, Statistics* stats,
+ RateLimiter::OpType op_type);
+
+ // Max bytes can be granted in a single burst
+ virtual int64_t GetSingleBurstBytes() const = 0;
+
+ // Total bytes that go through rate limiter
+ virtual int64_t GetTotalBytesThrough(
+ const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+ // Total # of requests that go through rate limiter
+ virtual int64_t GetTotalRequests(
+ const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+ // Total # of requests that are pending for bytes in rate limiter
+ // For convenience, this function is supported by the RateLimiter returned
+ // by NewGenericRateLimiter but is not required by RocksDB.
+ //
+ // REQUIRED: total_pending_request != nullptr
+ virtual Status GetTotalPendingRequests(
+ int64_t* total_pending_requests,
+ const Env::IOPriority pri = Env::IO_TOTAL) const {
+ assert(total_pending_requests != nullptr);
+ (void)total_pending_requests;
+ (void)pri;
+ return Status::NotSupported();
+ }
+
+ virtual int64_t GetBytesPerSecond() const = 0;
+
+ virtual bool IsRateLimited(OpType op_type) {
+ if ((mode_ == RateLimiter::Mode::kWritesOnly &&
+ op_type == RateLimiter::OpType::kRead) ||
+ (mode_ == RateLimiter::Mode::kReadsOnly &&
+ op_type == RateLimiter::OpType::kWrite)) {
+ return false;
+ }
+ return true;
+ }
+
+ protected:
+ Mode GetMode() { return mode_; }
+
+ private:
+ const Mode mode_;
+};
+
+// Create a RateLimiter object, which can be shared among RocksDB instances to
+// control write rate of flush and compaction.
+// @rate_bytes_per_sec: this is the only parameter you want to set most of the
+// time. It controls the total write rate of compaction and flush in bytes per
+// second. Currently, RocksDB does not enforce rate limit for anything other
+// than flush and compaction, e.g. write to WAL.
+// @refill_period_us: this controls how often tokens are refilled. For example,
+// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
+// burstier writes while smaller value introduces more CPU overhead.
+// The default should work for most cases.
+// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
+// A low-pri request is usually blocked in favor of a hi-pri request.
+// Currently, RocksDB assigns low-pri to requests from compaction and high-pri
+// to requests from flush. Low-pri requests can get blocked if flush requests
+// come in continuously. This fairness parameter grants low-pri requests
+// permission with a 1/fairness chance even when high-pri requests exist, to
+// avoid starvation. Leaving it at the default of 10 should work well for
+// most cases.
+// @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of rate limit within the range
+// `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+// the recent demand for background I/O.
+extern RateLimiter* NewGenericRateLimiter(
+ int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
+ int32_t fairness = 10,
+ RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+ bool auto_tuned = false);
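+
+// Example (illustrative sketch, assuming DBOptions::rate_limiter from
+// "rocksdb/options.h"): one limiter shared by flush and compaction across
+// DB instances.
+//
+//   std::shared_ptr<RateLimiter> limiter(
+//       NewGenericRateLimiter(10 << 20 /* 10MB/s */));
+//   Options options;
+//   options.rate_limiter = limiter;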
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
new file mode 100644
index 000000000..a339ec2aa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// For testing purposes
+#if ROCKSDB_NAMESPACE == 42
+#undef ROCKSDB_NAMESPACE
+#endif
+
+// Normal logic
+#ifndef ROCKSDB_NAMESPACE
+#define ROCKSDB_NAMESPACE rocksdb
+#endif
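+
+// Example (illustrative): the namespace can be overridden at build time,
+// e.g. by compiling with -DROCKSDB_NAMESPACE=my_namespace.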
diff --git a/src/rocksdb/include/rocksdb/secondary_cache.h b/src/rocksdb/include/rocksdb/secondary_cache.h
new file mode 100644
index 000000000..a6a8c8b1d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/secondary_cache.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2021, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A handle for lookup result. The handle may not be immediately ready or
+// have a valid value. The caller must call IsReady() to determine if it's
+// ready, and call Wait() in order to block until it becomes ready.
+// The caller must call Value() after it becomes ready to determine if the
+// handle successfully read the item.
+class SecondaryCacheResultHandle {
+ public:
+ virtual ~SecondaryCacheResultHandle() = default;
+
+ // Returns whether the handle is ready or not
+ virtual bool IsReady() = 0;
+
+ // Block until handle becomes ready
+ virtual void Wait() = 0;
+
+ // Return the value. If nullptr, it means the lookup was unsuccessful
+ virtual void* Value() = 0;
+
+ // Return the size of value
+ virtual size_t Size() = 0;
+};
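+
+// Example (illustrative sketch): consuming a lookup result, where `handle`
+// is a hypothetical std::unique_ptr<SecondaryCacheResultHandle> returned by
+// SecondaryCache::Lookup().
+//
+//   if (!handle->IsReady()) {
+//     handle->Wait();  // block until the asynchronous lookup completes
+//   }
+//   void* obj = handle->Value();  // nullptr means the lookup failed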
+
+// SecondaryCache
+//
+// Cache interface for caching blocks on a secondary tier (which can include
+// non-volatile media, or alternate forms of caching such as compressed data)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SecondaryCache : public Customizable {
+ public:
+ ~SecondaryCache() override = default;
+
+ static const char* Type() { return "SecondaryCache"; }
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<SecondaryCache>* result);
+
+ // Insert the given value into this cache. Ownership of `value` is
+  // transferred to the callee, who is responsible for deleting the value
+ // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(),
+ // the callee is responsible for such cleanup even in case of non-OK
+ // Status.
+ // Typically, the value is not saved directly but the implementation
+ // uses the SaveToCallback provided by helper to extract value's
+ // persistable data (typically uncompressed block), which will be written
+ // to this tier. The implementation may or may not write it to cache
+ // depending on the admission control policy, even if the return status
+ // is success (OK).
+ //
+ // If the implementation is asynchronous or otherwise uses `value` after
+ // the call returns, then InsertSaved() must be overridden not to rely on
+ // Insert(). For example, there could be a "holding area" in memory where
+ // Lookup() might return the same parsed value back. But more typically, if
+ // the implementation only uses `value` for getting persistable data during
+ // the call, then the default implementation of `InsertSaved()` suffices.
+ virtual Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper) = 0;
+
+ // Insert a value from its saved/persistable data (typically uncompressed
+ // block), as if generated by SaveToCallback/SizeCallback. This can be used
+ // in "warming up" the cache from some auxiliary source, and like Insert()
+ // may or may not write it to cache depending on the admission control
+ // policy, even if the return status is success.
+ //
+ // The default implementation assumes synchronous, non-escaping Insert(),
+ // wherein `value` is not used after return of Insert(). See Insert().
+ virtual Status InsertSaved(const Slice& key, const Slice& saved);
+
+ // Lookup the data for the given key in this cache. The create_cb
+ // will be used to create the object. The handle returned may not be
+ // ready yet, unless wait=true, in which case Lookup() will block until
+ // the handle is ready.
+ //
+ // advise_erase is a hint from the primary cache indicating that the handle
+ // will be cached there, so the secondary cache is advised to drop it from
+ // the cache as an optimization. To use this feature, SupportForceErase()
+ // needs to return true.
+ // This hint can also be safely ignored.
+ //
+  // is_in_sec_cache indicates whether the handle may have been erased from
+  // the secondary cache after the Lookup.
+ virtual std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+ const Slice& key, const Cache::CreateCallback& create_cb, bool wait,
+ bool advise_erase, bool& is_in_sec_cache) = 0;
+
+ // Indicate whether a handle can be erased in this secondary cache.
+ [[nodiscard]] virtual bool SupportForceErase() const = 0;
+
+ // At the discretion of the implementation, erase the data associated
+ // with key.
+ virtual void Erase(const Slice& key) = 0;
+
+ // Wait for a collection of handles to become ready.
+ virtual void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) = 0;
+
+ // Set the maximum configured capacity of the cache.
+  // When the new capacity is less than the old capacity and the existing
+  // usage is greater than the new capacity, the implementation will make a
+  // best effort to purge the released entries from the cache in order to
+  // lower the usage.
+ //
+  // The derived class can make this function a no-op and return
+  // NotSupported().
+ virtual Status SetCapacity(size_t /* capacity */) {
+ return Status::NotSupported();
+ }
+
+  // The derived class can make this function a no-op and return
+  // NotSupported().
+ virtual Status GetCapacity(size_t& /* capacity */) {
+ return Status::NotSupported();
+ }
+};
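+
+// Example (illustrative sketch, assuming LRUCacheOptions::secondary_cache
+// from "rocksdb/cache.h" and some concrete implementation `my_sec_cache`):
+// attaching a secondary cache behind the primary block cache.
+//
+//   LRUCacheOptions opts;
+//   opts.capacity = 64 << 20;  // 64MB primary cache
+//   opts.secondary_cache = my_sec_cache;
+//   std::shared_ptr<Cache> cache = NewLRUCache(opts);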
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h
new file mode 100644
index 000000000..0d7eb5949
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice.h
@@ -0,0 +1,264 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size. The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <string_view> // RocksDB now requires C++17 support
+
+#include "rocksdb/cleanable.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice {
+ public:
+ // Create an empty slice.
+ Slice() : data_(""), size_(0) {}
+
+ // Create a slice that refers to d[0,n-1].
+ Slice(const char* d, size_t n) : data_(d), size_(n) {}
+
+ // Create a slice that refers to the contents of "s"
+ /* implicit */
+ Slice(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+ // Create a slice that refers to the same contents as "sv"
+ /* implicit */
+ Slice(const std::string_view& sv) : data_(sv.data()), size_(sv.size()) {}
+
+ // Create a slice that refers to s[0,strlen(s)-1]
+ /* implicit */
+ Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); }
+
+ // Create a single slice from SliceParts using buf as storage.
+ // buf must exist as long as the returned Slice exists.
+ Slice(const struct SliceParts& parts, std::string* buf);
+
+ // Return a pointer to the beginning of the referenced data
+ const char* data() const { return data_; }
+
+ // Return the length (in bytes) of the referenced data
+ size_t size() const { return size_; }
+
+ // Return true iff the length of the referenced data is zero
+ bool empty() const { return size_ == 0; }
+
+ // Return the ith byte in the referenced data.
+ // REQUIRES: n < size()
+ char operator[](size_t n) const {
+ assert(n < size());
+ return data_[n];
+ }
+
+ // Change this slice to refer to an empty array
+ void clear() {
+ data_ = "";
+ size_ = 0;
+ }
+
+ // Drop the first "n" bytes from this slice.
+ void remove_prefix(size_t n) {
+ assert(n <= size());
+ data_ += n;
+ size_ -= n;
+ }
+
+ void remove_suffix(size_t n) {
+ assert(n <= size());
+ size_ -= n;
+ }
+
+ // Return a string that contains the copy of the referenced data.
+  // When hex is true, returns a string of twice the length, hex encoded
+  // (0-9A-F)
+ std::string ToString(bool hex = false) const;
+
+ // Return a string_view that references the same data as this slice.
+ std::string_view ToStringView() const {
+ return std::string_view(data_, size_);
+ }
+
+  // Decodes the current slice, interpreted as a hexadecimal string, into
+  // *result. Returns true if successful; if this isn't a valid hex string
+  // (e.g. not coming from Slice::ToString(true)), DecodeHex returns false.
+  // This slice is expected to have an even number of 0-9A-F characters;
+  // lowercase (a-f) is also accepted.
+ bool DecodeHex(std::string* result) const;
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "*this" < "b",
+ // == 0 iff "*this" == "b",
+ // > 0 iff "*this" > "b"
+ int compare(const Slice& b) const;
+
+ // Return true iff "x" is a prefix of "*this"
+ bool starts_with(const Slice& x) const {
+ return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
+ }
+
+ bool ends_with(const Slice& x) const {
+ return ((size_ >= x.size_) &&
+ (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0));
+ }
+
+  // Compares two slices and returns the offset of the first byte where
+  // they differ
+ size_t difference_offset(const Slice& b) const;
+
+ // private: make these public for rocksdbjni access
+ const char* data_;
+ size_t size_;
+
+ // Intentionally copyable
+};
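+
+// Example (illustrative sketch): the common ways to construct a Slice. A
+// Slice only points at external storage, so `s` must outlive `from_str`.
+//
+//   std::string s = "key1";
+//   Slice from_str(s);            // references s's buffer
+//   Slice from_cstr("key2");      // length determined by strlen()
+//   Slice from_ptr(s.data(), 3);  // first 3 bytes: "key"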
+
+/**
+ * A Slice that can be pinned with some cleanup tasks, which will be run upon
+ * ::Reset() or object destruction, whichever is invoked first. This can be used
+ * to avoid memcpy by having the PinnableSlice object referring to the data
+ * that is locked in the memory and release them after the data is consumed.
+ */
+class PinnableSlice : public Slice, public Cleanable {
+ public:
+ PinnableSlice() { buf_ = &self_space_; }
+ explicit PinnableSlice(std::string* buf) { buf_ = buf; }
+
+ PinnableSlice(PinnableSlice&& other);
+ PinnableSlice& operator=(PinnableSlice&& other);
+
+ // No copy constructor and copy assignment allowed.
+ PinnableSlice(PinnableSlice&) = delete;
+ PinnableSlice& operator=(PinnableSlice&) = delete;
+
+ inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
+ void* arg2) {
+ assert(!pinned_);
+ pinned_ = true;
+ data_ = s.data();
+ size_ = s.size();
+ RegisterCleanup(f, arg1, arg2);
+ assert(pinned_);
+ }
+
+ inline void PinSlice(const Slice& s, Cleanable* cleanable) {
+ assert(!pinned_);
+ pinned_ = true;
+ data_ = s.data();
+ size_ = s.size();
+ if (cleanable != nullptr) {
+ cleanable->DelegateCleanupsTo(this);
+ }
+ assert(pinned_);
+ }
+
+ inline void PinSelf(const Slice& slice) {
+ assert(!pinned_);
+ buf_->assign(slice.data(), slice.size());
+ data_ = buf_->data();
+ size_ = buf_->size();
+ assert(!pinned_);
+ }
+
+ inline void PinSelf() {
+ assert(!pinned_);
+ data_ = buf_->data();
+ size_ = buf_->size();
+ assert(!pinned_);
+ }
+
+ void remove_suffix(size_t n) {
+ assert(n <= size());
+ if (pinned_) {
+ size_ -= n;
+ } else {
+ buf_->erase(size() - n, n);
+ PinSelf();
+ }
+ }
+
+ void remove_prefix(size_t n) {
+ assert(n <= size());
+ if (pinned_) {
+ data_ += n;
+ size_ -= n;
+ } else {
+ buf_->erase(0, n);
+ PinSelf();
+ }
+ }
+
+ void Reset() {
+ Cleanable::Reset();
+ pinned_ = false;
+ size_ = 0;
+ }
+
+ inline std::string* GetSelf() { return buf_; }
+
+ inline bool IsPinned() const { return pinned_; }
+
+ private:
+ friend class PinnableSlice4Test;
+ std::string self_space_;
+ std::string* buf_;
+ bool pinned_ = false;
+};
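+
+// Example (illustrative sketch, assuming a DB* `db` and the DB::Get()
+// overload from "rocksdb/db.h"): reading a value without an extra memcpy.
+//
+//   PinnableSlice value;
+//   Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(),
+//                      "key", &value);
+//   if (s.ok()) {
+//     // use value.data() / value.size(), then release the pin
+//     value.Reset();
+//   }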
+
+// A set of Slices that are virtually concatenated together. 'parts' points
+// to an array of Slices. The number of elements in the array is 'num_parts'.
+struct SliceParts {
+ SliceParts(const Slice* _parts, int _num_parts)
+ : parts(_parts), num_parts(_num_parts) {}
+ SliceParts() : parts(nullptr), num_parts(0) {}
+
+ const Slice* parts;
+ int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+ return ((x.size() == y.size()) &&
+ (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); }
+
+inline int Slice::compare(const Slice& b) const {
+ assert(data_ != nullptr && b.data_ != nullptr);
+ const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
+ int r = memcmp(data_, b.data_, min_len);
+ if (r == 0) {
+ if (size_ < b.size_)
+ r = -1;
+ else if (size_ > b.size_)
+ r = +1;
+ }
+ return r;
+}
+
+inline size_t Slice::difference_offset(const Slice& b) const {
+ size_t off = 0;
+ const size_t len = (size_ < b.size_) ? size_ : b.size_;
+ for (; off < len; off++) {
+ if (data_[off] != b.data_[off]) break;
+ }
+ return off;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h
new file mode 100644
index 000000000..8909b9c53
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice_transform.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice. It is not required that every slice
+// belong to the domain and/or range of a function. Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct ConfigOptions;
+
+// A SliceTransform is a generic pluggable way of transforming one string
+// to another. Its primary use-case is in configuring RocksDB prefix Bloom
+// filters, by setting prefix_extractor in ColumnFamilyOptions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SliceTransform : public Customizable {
+ public:
+  virtual ~SliceTransform() {}
+
+ // Return the name of this transformation.
+ virtual const char* Name() const override = 0;
+ static const char* Type() { return "SliceTransform"; }
+
+ // Creates and configures a new SliceTransform from the input options and id.
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<const SliceTransform>* result);
+
+ // Returns a string representation of this SliceTransform, representing the ID
+ // and any additional properties.
+ std::string AsString() const;
+
+ // Extract a prefix from a specified key, partial key, iterator upper bound,
+ // etc. This is normally used for building and checking prefix Bloom filters
+ // but should accept any string for which InDomain() returns true.
+ // See ColumnFamilyOptions::prefix_extractor for specific properties that
+ // must be satisfied by prefix extractors.
+ virtual Slice Transform(const Slice& key) const = 0;
+
+ // Determine whether the specified key is compatible with the logic
+ // specified in the Transform method. Keys for which InDomain returns
+ // false will not be added to or queried against prefix Bloom filters.
+ //
+ // For example, if the Transform method returns a fixed length
+ // prefix of size 4, then an invocation to InDomain("abc") returns
+ // false because the specified key length(3) is shorter than the
+ // prefix size of 4.
+ //
+ // Wiki documentation here:
+ // https://github.com/facebook/rocksdb/wiki/Prefix-Seek
+ //
+ virtual bool InDomain(const Slice& key) const = 0;
+
+ // DEPRECATED: This is currently not used and remains here for backward
+ // compatibility.
+ virtual bool InRange(const Slice& /*dst*/) const { return false; }
+
+ // Returns information on maximum prefix length, if there is one.
+ // If Transform(x).size() == n for some keys and otherwise < n,
+ // should return true and set *len = n. Returning false is safe but
+ // currently disables some auto_prefix_mode filtering.
+ // Specifically, if the iterate_upper_bound is the immediate successor (see
+ // Comparator::IsSameLengthImmediateSuccessor) of the seek key's prefix,
+ // we require this function return true and iterate_upper_bound.size() == n
+ // to recognize and optimize the prefix seek.
+ // Otherwise (including FullLengthEnabled returns false, or prefix length is
+ // less than maximum), Seek with auto_prefix_mode is only optimized if the
+ // iterate_upper_bound and seek key have the same prefix.
+ // BUG: Despite all these conditions and even with the extra condition on
+  // IsSameLengthImmediateSuccessor (see its "BUG" section), it is not
+ // sufficient to ensure auto_prefix_mode returns all entries that
+ // total_order_seek would return. See auto_prefix_mode "BUG" section.
+ virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
+
+ // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+ //
+  // This function is not used by RocksDB itself, but by users. If users pass
+  // Options by string to RocksDB, they might not know what prefix extractor
+  // they are using. This function helps users determine, when they want to
+  // iterate all keys prefixed by `prefix`, whether it is safe to use the
+  // prefix bloom filter and seek to key `prefix`.
+  // If this function returns true, a user can Seek() to a prefix using the
+  // bloom filter. Otherwise, the user needs to skip the bloom filter by
+  // setting ReadOptions.total_order_seek = true.
+ //
+ // Here is an example: Suppose we implement a slice transform that returns
+ // the first part of the string up to and including first ",":
+ // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
+ // bloom filter using it, all slices matching "abc,.*" will be extracted
+ // to "abc,", so any SST file or memtable containing any of those key
+ // will not be filtered out.
+ // 2. SameResultWhenAppended("abc") should return false. A user will not be
+ // guaranteed to see all the keys matching "abc.*" if a user prefix
+ // seeks to "abc" against a DB with the same setting. If one SST file
+ // only contains "abcd,e", the file can be filtered out and the key will
+ // be invisible, because the prefix according to the configured extractor
+ // is "abcd,".
+ //
+ // i.e., an implementation always returning false is safe.
+ virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
+ return false;
+ }
+};
+
+// The prefix is the first `prefix_len` bytes of the key, and keys shorter
+// than `prefix_len` are not InDomain.
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+// The prefix is the first min(length(key),`cap_len`) bytes of the key, and
+// all keys are InDomain.
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
+// Prefix is equal to key. All keys are InDomain.
+extern const SliceTransform* NewNoopTransform();
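+
+// Example (illustrative sketch, assuming
+// ColumnFamilyOptions::prefix_extractor from "rocksdb/options.h"): enabling
+// a 4-byte prefix for prefix bloom filters. Keys shorter than 4 bytes are
+// not InDomain() and bypass the filter.
+//
+//   Options options;
+//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));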
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h
new file mode 100644
index 000000000..1ea56e71e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/snapshot.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+//
+// To create a Snapshot, call DB::GetSnapshot().
+// To destroy a Snapshot, call DB::ReleaseSnapshot(snapshot).
+class Snapshot {
+ public:
+ virtual SequenceNumber GetSequenceNumber() const = 0;
+
+ // Returns unix time i.e. the number of seconds since the Epoch, 1970-01-01
+ // 00:00:00 (UTC).
+ virtual int64_t GetUnixTime() const = 0;
+
+ virtual uint64_t GetTimestamp() const = 0;
+
+ protected:
+ virtual ~Snapshot();
+};
+
+// Simple RAII wrapper class for Snapshot.
+// Constructing this object will create a snapshot. Destructing will
+// release the snapshot.
+class ManagedSnapshot {
+ public:
+ explicit ManagedSnapshot(DB* db);
+
+ // Instead of creating a snapshot, take ownership of the input snapshot.
+ ManagedSnapshot(DB* db, const Snapshot* _snapshot);
+
+ ~ManagedSnapshot();
+
+ const Snapshot* snapshot();
+
+ private:
+ DB* db_;
+ const Snapshot* snapshot_;
+};
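+
+// Example (illustrative sketch, assuming a DB* `db` and ReadOptions from
+// "rocksdb/options.h"): reading from a consistent point-in-time view.
+//
+//   ManagedSnapshot managed(db);
+//   ReadOptions read_options;
+//   read_options.snapshot = managed.snapshot();
+//   std::string value;
+//   Status s = db->Get(read_options, "key", &value);
+//   // the snapshot is released when `managed` goes out of scope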
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h
new file mode 100644
index 000000000..9261ba47d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SSTDumpTool {
+ public:
+ int Run(int argc, char const* const* argv, Options options = Options());
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h
new file mode 100644
index 000000000..613292151
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_manager.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST and blob files in the DB and control
+// their deletion rate. All SstFileManager public functions are thread-safe.
+// SstFileManager is NOT an extensible interface but a public interface for
+// result of NewSstFileManager. Any derived classes must be RocksDB internal.
+class SstFileManager {
+ public:
+ virtual ~SstFileManager() {}
+
+  // Update the maximum allowed space that should be used by RocksDB. If
+  // the total size of the SST and blob files exceeds max_allowed_space,
+  // writes to RocksDB will fail.
+ //
+  // Setting max_allowed_space to 0 will disable this feature; maximum allowed
+  // space will be infinite (the default).
+ //
+ // thread-safe.
+ virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
+
+ // Set the amount of buffer room each compaction should be able to leave.
+ // In other words, at its maximum disk space consumption, the compaction
+ // should still leave compaction_buffer_size available on the disk so that
+ // other background functions may continue, such as logging and flushing.
+ virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
+ // Return true if the total size of SST and blob files exceeded the maximum
+ // allowed space usage.
+ //
+ // thread-safe.
+ virtual bool IsMaxAllowedSpaceReached() = 0;
+
+  // Returns true if the total size of SST and blob files as well as the
+  // estimated size of ongoing compactions exceeds the maximum allowed space
+  // usage.
+ virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
+ // Return the total size of all tracked files.
+ // thread-safe
+ virtual uint64_t GetTotalSize() = 0;
+
+ // Return a map containing all tracked files and their corresponding sizes.
+ // thread-safe
+ virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
+
+ // Return delete rate limit in bytes per second.
+ // thread-safe
+ virtual int64_t GetDeleteRateBytesPerSecond() = 0;
+
+ // Update the delete rate limit in bytes per second.
+  // Zero means disable delete rate limiting and delete files immediately.
+ // thread-safe
+ virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+  // Return the trash/DB size ratio above which new files will be deleted
+  // immediately
+ // thread-safe
+ virtual double GetMaxTrashDBRatio() = 0;
+
+  // Update the trash/DB size ratio above which new files will be deleted
+  // immediately
+ // thread-safe
+ virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+ // Return the total size of trash files
+ // thread-safe
+ virtual uint64_t GetTotalTrashSize() = 0;
+
+ // Set the statistics ptr to dump the stat information
+ virtual void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) = 0;
+};
+
+// Create a new SstFileManager that can be shared among multiple RocksDB
+// instances to track SST and blob files and control their deletion rate.
+// Even though SstFileManager doesn't track WAL files, it still controls
+// their deletion rate.
+//
+// @param env: Pointer to Env object, please see "rocksdb/env.h".
+// @param fs: Pointer to FileSystem object, please see "rocksdb/file_system.h".
+// @param info_log: If not nullptr, info_log will be used to log errors.
+//
+// == Deletion rate limiting specific arguments ==
+// @param trash_dir: Deprecated, this argument has no effect.
+// @param rate_bytes_per_sec: How many bytes should be deleted per second. If
+// this value is set to 1024 (1KB/sec) and we deleted a file of size 4KB
+// in 1 second, we will wait for another 3 seconds before we delete other
+// files. Set to 0 to disable deletion rate limiting.
+// This option also affects the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument has no effect, but
+// if the user provides trash_dir we will schedule deletes for files in the
+// dir.
+// @param status: If not nullptr, status will contain any errors that happened
+// while creating the missing trash_dir or deleting existing files in trash.
+// @param max_trash_db_ratio: If the trash size constitutes more than this
+// fraction of the total DB size, we will start deleting new files passed to
+// DeleteScheduler immediately.
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+// chunk, ftruncate the file by this size each time, rather than dropping the
+// whole file. 0 means to always delete the whole file. If the file has more
+// than one linked name, the file will be deleted as a whole. Either way,
+// `rate_bytes_per_sec` will be respected. NOTE that with this option,
+// files already renamed as trash may be partial, so users should not
+// directly recover them without checking.
+extern SstFileManager* NewSstFileManager(
+ Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> info_log = nullptr,
+ const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0,
+ bool delete_existing_trash = true, Status* status = nullptr,
+ double max_trash_db_ratio = 0.25,
+ uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+// Same as above, but takes a pointer to a legacy Env object, instead of
+// Env and FileSystem objects
+extern SstFileManager* NewSstFileManager(
+ Env* env, std::shared_ptr<Logger> info_log = nullptr,
+ std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
+ bool delete_existing_trash = true, Status* status = nullptr,
+ double max_trash_db_ratio = 0.25,
+ uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
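+
+// Example (illustrative sketch, assuming DBOptions::sst_file_manager from
+// "rocksdb/options.h"): limiting file deletion to 1MB/s for every DB that
+// shares this manager.
+//
+//   std::shared_ptr<SstFileManager> sst_file_manager(
+//       NewSstFileManager(Env::Default(), nullptr /* info_log */,
+//                         "" /* trash_dir */, 1 << 20 /* 1MB/s */));
+//   Options options;
+//   options.sst_file_manager = sst_file_manager;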
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h
new file mode 100644
index 000000000..4b8642480
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_reader.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SstFileReader is used to read sst files that are generated by DB or
+// SstFileWriter.
+class SstFileReader {
+ public:
+ SstFileReader(const Options& options);
+
+ ~SstFileReader();
+
+ // Prepares to read from the file located at "file_path".
+ Status Open(const std::string& file_path);
+
+ // Returns a new iterator over the table contents.
+  // Most read options provide the same control as when we read from a DB.
+ // If "snapshot" is nullptr, the iterator returns only the latest keys.
+ Iterator* NewIterator(const ReadOptions& options);
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const;
+
+ // Verifies whether there is corruption in this table.
+ Status VerifyChecksum(const ReadOptions& /*read_options*/);
+
+ Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+ private:
+ struct Rep;
+ std::unique_ptr<Rep> rep_;
+};
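+
+// Example (illustrative sketch, with a hypothetical file path): scanning
+// all entries in an sst file.
+//
+//   SstFileReader reader(Options());
+//   Status s = reader.Open("/path/to/file.sst");
+//   if (s.ok()) {
+//     std::unique_ptr<Iterator> iter(reader.NewIterator(ReadOptions()));
+//     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+//       // use iter->key() and iter->value()
+//     }
+//   }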
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h
new file mode 100644
index 000000000..c85f097a5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_writer.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+
+// ExternalSstFileInfo includes information about sst files created
+// using SstFileWriter.
+struct ExternalSstFileInfo {
+ ExternalSstFileInfo()
+ : file_path(""),
+ smallest_key(""),
+ largest_key(""),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ file_checksum(""),
+ file_checksum_func_name(""),
+ sequence_number(0),
+ file_size(0),
+ num_entries(0),
+ num_range_del_entries(0),
+ version(0) {}
+
+ ExternalSstFileInfo(const std::string& _file_path,
+ const std::string& _smallest_key,
+ const std::string& _largest_key,
+ SequenceNumber _sequence_number, uint64_t _file_size,
+ int32_t _num_entries, int32_t _version)
+ : file_path(_file_path),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ file_checksum(""),
+ file_checksum_func_name(""),
+ sequence_number(_sequence_number),
+ file_size(_file_size),
+ num_entries(_num_entries),
+ num_range_del_entries(0),
+ version(_version) {}
+
+ std::string file_path; // external sst file path
+ std::string smallest_key; // smallest user key in file
+ std::string largest_key; // largest user key in file
+ std::string
+ smallest_range_del_key; // smallest range deletion user key in file
+ std::string largest_range_del_key; // largest range deletion user key in file
+ std::string file_checksum; // sst file checksum;
+ std::string file_checksum_func_name; // The name of file checksum function
+ SequenceNumber sequence_number; // sequence number of all keys in file
+ uint64_t file_size; // file size in bytes
+ uint64_t num_entries; // number of entries in file
+ uint64_t num_range_del_entries; // number of range deletion entries in file
+ int32_t version; // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to the database
+// later. All keys in files generated by SstFileWriter will have sequence
+// number = 0.
+class SstFileWriter {
+ public:
+  // Users can pass `column_family` to specify that the generated file will
+  // be ingested into this column_family; note that passing nullptr means that
+  // the column_family is unknown.
+  // If invalidate_page_cache is set to true, SstFileWriter will give the OS a
+  // hint that this file's pages are not needed every time we write 1MB to the
+  // file. To use the rate limiter, an io_priority smaller than IO_TOTAL can
+  // be passed.
+ // The `skip_filters` option is DEPRECATED and could be removed in the
+ // future. Use `BlockBasedTableOptions::filter_policy` to control filter
+ // generation.
+ SstFileWriter(const EnvOptions& env_options, const Options& options,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool invalidate_page_cache = true,
+ Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+ bool skip_filters = false)
+ : SstFileWriter(env_options, options, options.comparator, column_family,
+ invalidate_page_cache, io_priority, skip_filters) {}
+
+ // Deprecated API
+ SstFileWriter(const EnvOptions& env_options, const Options& options,
+ const Comparator* user_comparator,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool invalidate_page_cache = true,
+ Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+ bool skip_filters = false);
+
+ ~SstFileWriter();
+
+ // Prepare SstFileWriter to write into file located at "file_path".
+ Status Open(const std::string& file_path);
+
+ // Add a Put key with value to currently opened file (deprecated)
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: comparator is *not* timestamp-aware.
+ ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value);
+
+ // Add a Put key with value to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: comparator is *not* timestamp-aware.
+ Status Put(const Slice& user_key, const Slice& value);
+
+ // Add a Put (key with timestamp, value) to the currently opened file
+ // REQUIRES: key is after any previously added key according to the
+ // comparator.
+ // REQUIRES: the timestamp's size is equal to what is expected by
+ // the comparator.
+ Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value);
+
+ // Add a Merge key with value to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: comparator is *not* timestamp-aware.
+ Status Merge(const Slice& user_key, const Slice& value);
+
+ // Add a deletion key to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: comparator is *not* timestamp-aware.
+ Status Delete(const Slice& user_key);
+
+ // Add a deletion key with timestamp to the currently opened file
+ // REQUIRES: key is after any previously added key according to the
+ // comparator.
+ // REQUIRES: the timestamp's size is equal to what is expected by
+ // the comparator.
+ Status Delete(const Slice& user_key, const Slice& timestamp);
+
+ // Add a range deletion tombstone to currently opened file
+ // REQUIRES: comparator is *not* timestamp-aware.
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key);
+
+ // Add a range deletion tombstone to currently opened file.
+ // REQUIRES: begin_key and end_key are user keys without timestamp.
+ // REQUIRES: the timestamp's size is equal to what is expected by
+ // the comparator.
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key,
+ const Slice& timestamp);
+
+ // Finalize writing to sst file and close file.
+ //
+ // An optional ExternalSstFileInfo pointer can be passed to the function
+ // which will be populated with information about the created sst file.
+ Status Finish(ExternalSstFileInfo* file_info = nullptr);
+
+ // Return the current file size.
+ uint64_t FileSize();
+
+ private:
+ void InvalidatePageCache(bool closing);
+ struct Rep;
+ std::unique_ptr<Rep> rep_;
+};
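+
+// Example (illustrative sketch, assuming a DB* `db`,
+// DB::IngestExternalFile() from "rocksdb/db.h", and a hypothetical file
+// path): building a file and ingesting it into a live database.
+//
+//   SstFileWriter writer(EnvOptions(), options);
+//   Status s = writer.Open("/tmp/file.sst");
+//   if (s.ok()) s = writer.Put("key1", "value1");
+//   if (s.ok()) s = writer.Put("key2", "value2");
+//   if (s.ok()) s = writer.Finish();
+//   if (s.ok()) s = db->IngestExternalFile({"/tmp/file.sst"},
+//                                          IngestExternalFileOptions());
+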
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_partitioner.h b/src/rocksdb/include/rocksdb/sst_partitioner.h
new file mode 100644
index 000000000..3af8e9492
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_partitioner.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+enum PartitionerResult : char {
+  // Partitioner does not require creating a new file
+  kNotRequired = 0x0,
+  // Partitioner forcefully requests creating a new file
+  kRequired = 0x1
+ // Additional constants can be added
+};
+
+struct PartitionerRequest {
+ PartitionerRequest(const Slice& prev_user_key_,
+ const Slice& current_user_key_,
+ uint64_t current_output_file_size_)
+ : prev_user_key(&prev_user_key_),
+ current_user_key(&current_user_key_),
+ current_output_file_size(current_output_file_size_) {}
+ const Slice* prev_user_key;
+ const Slice* current_user_key;
+ uint64_t current_output_file_size;
+};
+
+/*
+ * An SstPartitioner is a generic pluggable way of defining the partition
+ * of SST files. The compaction job will split the SST files on partition
+ * boundaries to lower the write amplification when SST files are promoted
+ * to a higher level.
+ */
+class SstPartitioner {
+ public:
+ virtual ~SstPartitioner() {}
+
+ // Return the name of this partitioner.
+ virtual const char* Name() const = 0;
+
+  // Called for all keys in a compaction. When the partitioner wants to create
+  // a new SST file, it returns kRequired. The compaction job will then finish
+  // the current SST file, whose last key is the "prev_user_key" parameter,
+  // and start a new SST file whose first key is "current_user_key". Returns
+  // the decision whether a partition boundary was detected and compaction
+  // should create a new file.
+ virtual PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) = 0;
+
+  // Called with the smallest and largest keys in an SST file when the
+  // compaction tries to do a trivial move. Returns true if the partitioner
+  // allows the trivial move.
+ virtual bool CanDoTrivialMove(const Slice& smallest_user_key,
+ const Slice& largest_user_key) = 0;
+
+ // Context information of a compaction run
+ struct Context {
+ // Does this compaction run include all data files
+ bool is_full_compaction;
+ // Is this compaction requested by the client (true),
+ // or is it occurring as an automatic compaction process
+ bool is_manual_compaction;
+ // Output level for this compaction
+ int output_level;
+ // Smallest key for compaction
+ Slice smallest_user_key;
+ // Largest key for compaction
+ Slice largest_user_key;
+ };
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SstPartitionerFactory : public Customizable {
+ public:
+ ~SstPartitionerFactory() override {}
+ static const char* Type() { return "SstPartitionerFactory"; }
+ static Status CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<SstPartitionerFactory>* result);
+
+ virtual std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& context) const = 0;
+
+ // Returns a name that identifies this partitioner factory.
+ const char* Name() const override = 0;
+};
+
+/*
+ * Fixed key prefix partitioner. It splits the output SST files when the
+ * prefix of the defined length changes.
+ */
+class SstPartitionerFixedPrefix : public SstPartitioner {
+ public:
+ explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {}
+
+ virtual ~SstPartitionerFixedPrefix() override {}
+
+ const char* Name() const override { return "SstPartitionerFixedPrefix"; }
+
+ PartitionerResult ShouldPartition(const PartitionerRequest& request) override;
+
+ bool CanDoTrivialMove(const Slice& smallest_user_key,
+ const Slice& largest_user_key) override;
+
+ private:
+ size_t len_;
+};
+
+/*
+ * Factory for fixed prefix partitioner.
+ */
+class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory {
+ public:
+ explicit SstPartitionerFixedPrefixFactory(size_t len);
+
+ ~SstPartitionerFixedPrefixFactory() override {}
+
+ static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override;
+
+ private:
+ size_t len_;
+};
+
+extern std::shared_ptr<SstPartitionerFactory>
+NewSstPartitionerFixedPrefixFactory(size_t prefix_len);
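+
+// Example (illustrative sketch, assuming
+// ColumnFamilyOptions::sst_partitioner_factory from "rocksdb/options.h"):
+// cutting output files whenever the first four key bytes change.
+//
+//   Options options;
+//   options.sst_partitioner_factory =
+//       NewSstPartitionerFixedPrefixFactory(4);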
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h
new file mode 100644
index 000000000..42a938f30
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/statistics.h
@@ -0,0 +1,707 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * Keep adding tickers here.
+ * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking
+ * over its old value.
+ * 2. Add a readable string in TickersNameMap below for the newly added ticker.
+ * 3. Add a corresponding enum value to TickerType.java in the java API
+ * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType
+ * and toCppTickers
+ */
+enum Tickers : uint32_t {
+ // total block cache misses
+ // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+ // BLOCK_CACHE_FILTER_MISS +
+ // BLOCK_CACHE_DATA_MISS;
+ BLOCK_CACHE_MISS = 0,
+ // total block cache hit
+ // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+ // BLOCK_CACHE_FILTER_HIT +
+ // BLOCK_CACHE_DATA_HIT;
+ BLOCK_CACHE_HIT,
+ // # of blocks added to block cache.
+ BLOCK_CACHE_ADD,
+ // # of failures when adding blocks to block cache.
+ BLOCK_CACHE_ADD_FAILURES,
+ // # of times cache miss when accessing index block from block cache.
+ BLOCK_CACHE_INDEX_MISS,
+ // # of times cache hit when accessing index block from block cache.
+ BLOCK_CACHE_INDEX_HIT,
+ // # of index blocks added to block cache.
+ BLOCK_CACHE_INDEX_ADD,
+ // # of bytes of index blocks inserted into cache
+ BLOCK_CACHE_INDEX_BYTES_INSERT,
+ // # of bytes of index block erased from cache
+ BLOCK_CACHE_INDEX_BYTES_EVICT,
+ // # of times cache miss when accessing filter block from block cache.
+ BLOCK_CACHE_FILTER_MISS,
+ // # of times cache hit when accessing filter block from block cache.
+ BLOCK_CACHE_FILTER_HIT,
+ // # of filter blocks added to block cache.
+ BLOCK_CACHE_FILTER_ADD,
+ // # of bytes of bloom filter blocks inserted into cache
+ BLOCK_CACHE_FILTER_BYTES_INSERT,
+ // # of bytes of bloom filter block erased from cache
+ BLOCK_CACHE_FILTER_BYTES_EVICT,
+ // # of times cache miss when accessing data block from block cache.
+ BLOCK_CACHE_DATA_MISS,
+ // # of times cache hit when accessing data block from block cache.
+ BLOCK_CACHE_DATA_HIT,
+ // # of data blocks added to block cache.
+ BLOCK_CACHE_DATA_ADD,
+ // # of bytes of data blocks inserted into cache
+ BLOCK_CACHE_DATA_BYTES_INSERT,
+ // # of bytes read from cache.
+ BLOCK_CACHE_BYTES_READ,
+ // # of bytes written into cache.
+ BLOCK_CACHE_BYTES_WRITE,
+
+ // # of times bloom filter has avoided file reads, i.e., negatives.
+ BLOOM_FILTER_USEFUL,
+ // # of times bloom FullFilter has not avoided the reads.
+ BLOOM_FILTER_FULL_POSITIVE,
+ // # of times bloom FullFilter has not avoided the reads and data actually
+ // exist.
+ BLOOM_FILTER_FULL_TRUE_POSITIVE,
+
+ BLOOM_FILTER_MICROS,
+
+ // # persistent cache hit
+ PERSISTENT_CACHE_HIT,
+ // # persistent cache miss
+ PERSISTENT_CACHE_MISS,
+
+ // # total simulation block cache hits
+ SIM_BLOCK_CACHE_HIT,
+ // # total simulation block cache misses
+ SIM_BLOCK_CACHE_MISS,
+
+ // # of memtable hits.
+ MEMTABLE_HIT,
+ // # of memtable misses.
+ MEMTABLE_MISS,
+
+ // # of Get() queries served by L0
+ GET_HIT_L0,
+ // # of Get() queries served by L1
+ GET_HIT_L1,
+ // # of Get() queries served by L2 and up
+ GET_HIT_L2_AND_UP,
+
+ /**
+ * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction.
+ * There are 4 reasons currently.
+ */
+ COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
+ // Also includes keys dropped for range del.
+ COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
+ COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone.
+ COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
+ COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted.
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ // If a compaction was canceled in sfm to prevent ENOSPC
+ COMPACTION_CANCELLED,
+
+  // Number of keys written to the database via the Put and Write calls
+ NUMBER_KEYS_WRITTEN,
+  // Number of keys read
+ NUMBER_KEYS_READ,
+  // Number of keys updated, if in-place update is enabled
+ NUMBER_KEYS_UPDATED,
+ // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+ // DB::Merge(), and DB::Write().
+ BYTES_WRITTEN,
+ // The number of uncompressed bytes read from DB::Get(). It could be
+ // either from memtables, cache, or table files.
+ // For the number of logical bytes read from DB::MultiGet(),
+ // please use NUMBER_MULTIGET_BYTES_READ.
+ BYTES_READ,
+ // The number of calls to seek/next/prev
+ NUMBER_DB_SEEK,
+ NUMBER_DB_NEXT,
+ NUMBER_DB_PREV,
+ // The number of calls to seek/next/prev that returned data
+ NUMBER_DB_SEEK_FOUND,
+ NUMBER_DB_NEXT_FOUND,
+ NUMBER_DB_PREV_FOUND,
+ // The number of uncompressed bytes read from an iterator.
+ // Includes size of key and value.
+ ITER_BYTES_READ,
+ NO_FILE_CLOSES,
+ NO_FILE_OPENS,
+ NO_FILE_ERRORS,
+  // DEPRECATED Time system had to wait to do L0-L1 compactions
+ STALL_L0_SLOWDOWN_MICROS,
+ // DEPRECATED Time system had to wait to move memtable to L1.
+ STALL_MEMTABLE_COMPACTION_MICROS,
+ // DEPRECATED write throttle because of too many files in L0
+ STALL_L0_NUM_FILES_MICROS,
+ // Writer has to wait for compaction or flush to finish.
+ STALL_MICROS,
+ // The wait time for db mutex.
+ // Disabled by default. To enable it set stats level to kAll
+ DB_MUTEX_WAIT_MICROS,
+ RATE_LIMIT_DELAY_MILLIS,
+ // DEPRECATED number of iterators currently open
+ NO_ITERATORS,
+
+ // Number of MultiGet calls, keys read, and bytes read
+ NUMBER_MULTIGET_CALLS,
+ NUMBER_MULTIGET_KEYS_READ,
+ NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of delete records that were not required to be
+  // written to storage because the key did not exist
+ NUMBER_FILTERED_DELETES,
+ NUMBER_MERGE_FAILURES,
+
+  // Number of times bloom was checked before creating an iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+ BLOOM_FILTER_PREFIX_CHECKED,
+ BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over a large number of keys with the same userkey.
+ NUMBER_OF_RESEEKS_IN_ITERATION,
+
+ // Record the number of calls to GetUpdatesSince. Useful to keep track of
+ // transaction log iterator refreshes
+ GET_UPDATES_SINCE_CALLS,
+ BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
+ BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
+ // Number of blocks added to compressed block cache
+ BLOCK_CACHE_COMPRESSED_ADD,
+ // Number of failures when adding blocks to compressed block cache
+ BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+ WAL_FILE_SYNCED, // Number of times WAL sync is done
+ WAL_FILE_BYTES, // Number of bytes written to WAL
+
+ // Writes can be processed by requesting thread or by the thread at the
+ // head of the writers queue.
+ WRITE_DONE_BY_SELF,
+ WRITE_DONE_BY_OTHER, // Equivalent to writes done for others
+  WRITE_TIMEDOUT,       // Number of writes ending up timed out.
+ WRITE_WITH_WAL, // Number of Write calls that request WAL
+ COMPACT_READ_BYTES, // Bytes read during compaction
+ COMPACT_WRITE_BYTES, // Bytes written during compaction
+ FLUSH_WRITE_BYTES, // Bytes written during flush
+
+ // Compaction read and write statistics broken down by CompactionReason
+ COMPACT_READ_BYTES_MARKED,
+ COMPACT_READ_BYTES_PERIODIC,
+ COMPACT_READ_BYTES_TTL,
+ COMPACT_WRITE_BYTES_MARKED,
+ COMPACT_WRITE_BYTES_PERIODIC,
+ COMPACT_WRITE_BYTES_TTL,
+
+ // Number of table's properties loaded directly from file, without creating
+ // table reader object.
+ NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+ NUMBER_SUPERVERSION_ACQUIRES,
+ NUMBER_SUPERVERSION_RELEASES,
+ NUMBER_SUPERVERSION_CLEANUPS,
+
+ // # of compressions/decompressions executed
+ NUMBER_BLOCK_COMPRESSED,
+ NUMBER_BLOCK_DECOMPRESSED,
+
+ NUMBER_BLOCK_NOT_COMPRESSED,
+ MERGE_OPERATION_TOTAL_TIME,
+ FILTER_OPERATION_TOTAL_TIME,
+
+ // Row cache.
+ ROW_CACHE_HIT,
+ ROW_CACHE_MISS,
+
+ // Read amplification statistics.
+ // Read amplification can be calculated using this formula
+ // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+ //
+ // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
+ READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used.
+ READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks.
+
+ // Number of refill intervals where rate limiter's bytes are fully consumed.
+ NUMBER_RATE_LIMITER_DRAINS,
+
+ // Number of internal keys skipped by Iterator
+ NUMBER_ITER_SKIP,
+
+ // BlobDB specific stats
+ // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_PUT,
+ // # of Write to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_WRITE,
+ // # of Get to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_GET,
+ // # of MultiGet to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_MULTIGET,
+ // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only
+ // applicable to legacy BlobDB.
+ BLOB_DB_NUM_SEEK,
+ // # of Next to BlobDB iterator. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_NEXT,
+ // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_PREV,
+ // # of keys written to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_KEYS_WRITTEN,
+ // # of keys read from BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_NUM_KEYS_READ,
+ // # of bytes (key + value) written to BlobDB. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_BYTES_WRITTEN,
+ // # of bytes (keys + value) read from BlobDB. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_BYTES_READ,
+ // # of keys written by BlobDB as non-TTL inlined value. Only applicable to
+ // legacy BlobDB.
+ BLOB_DB_WRITE_INLINED,
+ // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_WRITE_INLINED_TTL,
+ // # of keys written by BlobDB as non-TTL blob value. Only applicable to
+ // legacy BlobDB.
+ BLOB_DB_WRITE_BLOB,
+ // # of keys written by BlobDB as TTL blob value. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_WRITE_BLOB_TTL,
+ // # of bytes written to blob file.
+ BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ // # of bytes read from blob file.
+ BLOB_DB_BLOB_FILE_BYTES_READ,
+  // # of times a blob file was synced.
+ BLOB_DB_BLOB_FILE_SYNCED,
+ // # of blob index evicted from base DB by BlobDB compaction filter because
+ // of expiration. Only applicable to legacy BlobDB.
+ BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
+ // size of blob index evicted from base DB by BlobDB compaction filter
+ // because of expiration. Only applicable to legacy BlobDB.
+ BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
+ // # of blob index evicted from base DB by BlobDB compaction filter because
+ // of corresponding file deleted. Only applicable to legacy BlobDB.
+ BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
+ // size of blob index evicted from base DB by BlobDB compaction filter
+ // because of corresponding file deleted. Only applicable to legacy BlobDB.
+ BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
+ // # of blob files that were obsoleted by garbage collection. Only applicable
+ // to legacy BlobDB.
+ BLOB_DB_GC_NUM_FILES,
+ // # of blob files generated by garbage collection. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_GC_NUM_NEW_FILES,
+ // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB.
+ BLOB_DB_GC_FAILURES,
+ // # of keys dropped by BlobDB garbage collection because they had been
+ // overwritten. DEPRECATED.
+ BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
+ // # of keys dropped by BlobDB garbage collection because of expiration.
+ // DEPRECATED.
+ BLOB_DB_GC_NUM_KEYS_EXPIRED,
+ // # of keys relocated to new blob file by garbage collection.
+ BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ // # of bytes dropped by BlobDB garbage collection because they had been
+ // overwritten. DEPRECATED.
+ BLOB_DB_GC_BYTES_OVERWRITTEN,
+ // # of bytes dropped by BlobDB garbage collection because of expiration.
+ // DEPRECATED.
+ BLOB_DB_GC_BYTES_EXPIRED,
+ // # of bytes relocated to new blob file by garbage collection.
+ BLOB_DB_GC_BYTES_RELOCATED,
+  // # of blob files evicted because BlobDB is full. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_FIFO_NUM_FILES_EVICTED,
+  // # of keys in the blob files evicted because BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+  // # of bytes in the blob files evicted because BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_BYTES_EVICTED,
+
+  // These counters indicate a performance issue in WritePrepared transactions.
+  // We should not see them ticking much.
+ // # of times prepare_mutex_ is acquired in the fast path.
+ TXN_PREPARE_MUTEX_OVERHEAD,
+ // # of times old_commit_map_mutex_ is acquired in the fast path.
+ TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ // # of times we checked a batch for duplicate keys.
+ TXN_DUPLICATE_KEY_OVERHEAD,
+ // # of times snapshot_mutex_ is acquired in the fast path.
+ TXN_SNAPSHOT_MUTEX_OVERHEAD,
+ // # of times ::Get returned TryAgain due to expired snapshot seq
+ TXN_GET_TRY_AGAIN,
+
+ // Number of keys actually found in MultiGet calls (vs number requested by
+ // caller)
+ // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
+ NUMBER_MULTIGET_KEYS_FOUND,
+
+ NO_ITERATOR_CREATED, // number of iterators created
+ NO_ITERATOR_DELETED, // number of iterators deleted
+
+ BLOCK_CACHE_COMPRESSION_DICT_MISS,
+ BLOCK_CACHE_COMPRESSION_DICT_HIT,
+ BLOCK_CACHE_COMPRESSION_DICT_ADD,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
+
+ // # of blocks redundantly inserted into block cache.
+ // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD
+ BLOCK_CACHE_ADD_REDUNDANT,
+ // # of index blocks redundantly inserted into block cache.
+ // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD
+ BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+ // # of filter blocks redundantly inserted into block cache.
+ // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD
+ BLOCK_CACHE_FILTER_ADD_REDUNDANT,
+ // # of data blocks redundantly inserted into block cache.
+ // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD
+ BLOCK_CACHE_DATA_ADD_REDUNDANT,
+ // # of dict blocks redundantly inserted into block cache.
+ // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT
+ // <= BLOCK_CACHE_COMPRESSION_DICT_ADD
+ BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
+
+  // # of files marked as trash by the sst file manager, to be deleted
+  // later by a background thread.
+ FILES_MARKED_TRASH,
+  // # of files deleted immediately by sst file manager through delete
+  // scheduler.
+ FILES_DELETED_IMMEDIATELY,
+
+  // Counters for the error handler. Note that bg_io_error is a subset of
+  // bg_error, and bg_retryable_io_error is a subset of bg_io_error.
+ ERROR_HANDLER_BG_ERROR_COUNT,
+ ERROR_HANDLER_BG_IO_ERROR_COUNT,
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+ ERROR_HANDLER_AUTORESUME_COUNT,
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+
+ // Statistics for memtable garbage collection:
+ // Raw bytes of data (payload) present on memtable at flush time.
+ MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+ // Outdated bytes of data present on memtable at flush time.
+ MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+
+ // Secondary cache statistics
+ SECONDARY_CACHE_HITS,
+
+ // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs.
+ VERIFY_CHECKSUM_READ_BYTES,
+
+ // Bytes read/written while creating backups
+ BACKUP_READ_BYTES,
+ BACKUP_WRITE_BYTES,
+
+ // Remote compaction read/write statistics
+ REMOTE_COMPACT_READ_BYTES,
+ REMOTE_COMPACT_WRITE_BYTES,
+
+ // Tiered storage related statistics
+ HOT_FILE_READ_BYTES,
+ WARM_FILE_READ_BYTES,
+ COLD_FILE_READ_BYTES,
+ HOT_FILE_READ_COUNT,
+ WARM_FILE_READ_COUNT,
+ COLD_FILE_READ_COUNT,
+
+ // Last level and non-last level read statistics
+ LAST_LEVEL_READ_BYTES,
+ LAST_LEVEL_READ_COUNT,
+ NON_LAST_LEVEL_READ_BYTES,
+ NON_LAST_LEVEL_READ_COUNT,
+
+ BLOCK_CHECKSUM_COMPUTE_COUNT,
+ MULTIGET_COROUTINE_COUNT,
+
+ // Integrated BlobDB specific stats
+ // # of times cache miss when accessing blob from blob cache.
+ BLOB_DB_CACHE_MISS,
+ // # of times cache hit when accessing blob from blob cache.
+ BLOB_DB_CACHE_HIT,
+  // # of blobs added to blob cache.
+ BLOB_DB_CACHE_ADD,
+ // # of failures when adding blobs to blob cache.
+ BLOB_DB_CACHE_ADD_FAILURES,
+ // # of bytes read from blob cache.
+ BLOB_DB_CACHE_BYTES_READ,
+ // # of bytes written into blob cache.
+ BLOB_DB_CACHE_BYTES_WRITE,
+
+ // Time spent in the ReadAsync file system call
+ READ_ASYNC_MICROS,
+ // Number of errors returned to the async read callback
+ ASYNC_READ_ERROR_COUNT,
+
+ TICKER_ENUM_MAX
+};
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
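+
+// Example (an illustrative sketch; `stats` stands for a non-null Statistics
+// object, e.g. options.statistics): dump every ticker by name.
+//   for (const auto& ticker : TickersNameMap) {
+//     printf("%s: %llu\n", ticker.second.c_str(),
+//            (unsigned long long)stats->getTickerCount(ticker.first));
+//   }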
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
+ * Add a new histogram by assigning it the current value of
+ * HISTOGRAM_ENUM_MAX, add a string representation in HistogramsNameMap
+ * below, and increment HISTOGRAM_ENUM_MAX.
+ * Also add a corresponding enum value to HistogramType.java in the Java API.
+ */
+enum Histograms : uint32_t {
+ DB_GET = 0,
+ DB_WRITE,
+ COMPACTION_TIME,
+ COMPACTION_CPU_TIME,
+ SUBCOMPACTION_SETUP_TIME,
+ TABLE_SYNC_MICROS,
+ COMPACTION_OUTFILE_SYNC_MICROS,
+ WAL_FILE_SYNC_MICROS,
+ MANIFEST_FILE_SYNC_MICROS,
+  // Time spent in IO during table open
+ TABLE_OPEN_IO_MICROS,
+ DB_MULTIGET,
+ READ_BLOCK_COMPACTION_MICROS,
+ READ_BLOCK_GET_MICROS,
+ WRITE_RAW_BLOCK_MICROS,
+ STALL_L0_SLOWDOWN_COUNT,
+ STALL_MEMTABLE_COMPACTION_COUNT,
+ STALL_L0_NUM_FILES_COUNT,
+ HARD_RATE_LIMIT_DELAY_COUNT,
+ SOFT_RATE_LIMIT_DELAY_COUNT,
+ NUM_FILES_IN_SINGLE_COMPACTION,
+ DB_SEEK,
+ WRITE_STALL,
+ SST_READ_MICROS,
+ // The number of subcompactions actually scheduled during a compaction
+ NUM_SUBCOMPACTIONS_SCHEDULED,
+ // Value size distribution in each operation
+ BYTES_PER_READ,
+ BYTES_PER_WRITE,
+ BYTES_PER_MULTIGET,
+
+  // Number of bytes compressed/decompressed.
+  // The byte count is the uncompressed size, i.e. before compression and
+  // after decompression, respectively.
+ BYTES_COMPRESSED,
+ BYTES_DECOMPRESSED,
+ COMPRESSION_TIMES_NANOS,
+ DECOMPRESSION_TIMES_NANOS,
+ // Number of merge operands passed to the merge operator in user read
+ // requests.
+ READ_NUM_MERGE_OPERANDS,
+
+ // BlobDB specific stats
+ // Size of keys written to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_KEY_SIZE,
+ // Size of values written to BlobDB. Only applicable to legacy BlobDB.
+ BLOB_DB_VALUE_SIZE,
+ // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy
+ // BlobDB.
+ BLOB_DB_WRITE_MICROS,
+ // BlobDB Get latency. Only applicable to legacy BlobDB.
+ BLOB_DB_GET_MICROS,
+ // BlobDB MultiGet latency. Only applicable to legacy BlobDB.
+ BLOB_DB_MULTIGET_MICROS,
+ // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to
+ // legacy BlobDB.
+ BLOB_DB_SEEK_MICROS,
+ // BlobDB Next latency. Only applicable to legacy BlobDB.
+ BLOB_DB_NEXT_MICROS,
+ // BlobDB Prev latency. Only applicable to legacy BlobDB.
+ BLOB_DB_PREV_MICROS,
+ // Blob file write latency.
+ BLOB_DB_BLOB_FILE_WRITE_MICROS,
+ // Blob file read latency.
+ BLOB_DB_BLOB_FILE_READ_MICROS,
+ // Blob file sync latency.
+ BLOB_DB_BLOB_FILE_SYNC_MICROS,
+ // BlobDB garbage collection time. DEPRECATED.
+ BLOB_DB_GC_MICROS,
+ // BlobDB compression time.
+ BLOB_DB_COMPRESSION_MICROS,
+ // BlobDB decompression time.
+ BLOB_DB_DECOMPRESSION_MICROS,
+ // Time spent flushing memtable to disk
+ FLUSH_TIME,
+ SST_BATCH_SIZE,
+
+ // MultiGet stats logged per level
+ // Num of index and filter blocks read from file system per level.
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ // Num of data blocks read from file system per level.
+ // Obsolete
+ NUM_DATA_BLOCKS_READ_PER_LEVEL,
+ // Num of sst files read from file system per level.
+ NUM_SST_READ_PER_LEVEL,
+
+ // Error handler statistics
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+
+ // Stats related to asynchronous read requests.
+ ASYNC_READ_BYTES,
+ POLL_WAIT_MICROS,
+
+ // Number of prefetched bytes discarded by RocksDB.
+ PREFETCHED_BYTES_DISCARDED,
+
+ // Number of IOs issued in parallel in a MultiGet batch
+ MULTIGET_IO_BATCH_SIZE,
+
+ // Number of levels requiring IO for MultiGet
+ NUM_LEVEL_READ_PER_MULTIGET,
+
+ // Wait time for aborting async read in FilePrefetchBuffer destructor
+ ASYNC_PREFETCH_ABORT_MICROS,
+
+ HISTOGRAM_ENUM_MAX,
+};
+
+extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
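+
+// Example (an illustrative sketch; `stats` stands for a non-null Statistics
+// object): report the p99 of every histogram by name.
+//   for (const auto& hist : HistogramsNameMap) {
+//     HistogramData data;
+//     stats->histogramData(hist.first, &data);
+//     printf("%s p99: %f\n", hist.second.c_str(), data.percentile99);
+//   }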
+
+struct HistogramData {
+ double median;
+ double percentile95;
+ double percentile99;
+ double average;
+ double standard_deviation;
+ // zero-initialize new members since old Statistics::histogramData()
+ // implementations won't write them.
+ double max = 0.0;
+ uint64_t count = 0;
+ uint64_t sum = 0;
+ double min = 0.0;
+};
+
+// StatsLevel can be used to reduce statistics overhead by skipping certain
+// types of stats in the stats collection process.
+// Usage:
+// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+enum StatsLevel : uint8_t {
+ // Disable all metrics
+ kDisableAll,
+ // Disable tickers
+ kExceptTickers = kDisableAll,
+ // Disable timer stats, and skip histogram stats
+ kExceptHistogramOrTimers,
+ // Skip timer stats
+ kExceptTimers,
+ // Collect all stats except time inside mutex lock AND time spent on
+ // compression.
+ kExceptDetailedTimers,
+ // Collect all stats except the counters requiring to get time inside the
+ // mutex lock.
+ kExceptTimeForMutex,
+  // Collect all stats, including measuring the duration of mutex operations.
+  // If getting the time is expensive on the platform, this can reduce
+  // scalability with more threads, especially for writes.
+ kAll,
+};
+
+// Analyze the performance of a db by providing cumulative stats over time.
+// Usage:
+// Options options;
+// options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+// Status s = DB::Open(options, kDBPath, &db);
+// ...
+// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+// HistogramData hist;
+// options.statistics->histogramData(FLUSH_TIME, &hist);
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Statistics : public Customizable {
+ public:
+ ~Statistics() override {}
+ static const char* Type() { return "Statistics"; }
+ static Status CreateFromString(const ConfigOptions& opts,
+ const std::string& value,
+ std::shared_ptr<Statistics>* result);
+  // The default name is empty, for backwards compatibility. Derived classes
+  // should override this method.
+  // This default implementation will likely be removed in a future release.
+ const char* Name() const override { return ""; }
+ virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+ virtual void histogramData(uint32_t type,
+ HistogramData* const data) const = 0;
+ virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
+ virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
+ virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+ virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
+ virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
+ if (get_stats_level() <= StatsLevel::kExceptTimers) {
+ return;
+ }
+ recordInHistogram(histogramType, time);
+ }
+  // This function is here only for backward compatibility reasons.
+  // Users implementing their own Statistics class should override
+  // recordInHistogram() instead and leave measureTime() as it is.
+ virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
+ // This is not supposed to be called.
+ assert(false);
+ }
+ virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
+    // measureTime() is the old and inaccurate function name. To remain
+    // backward compatible, if users implement their own statistics and
+    // override measureTime() but not this function, we forward to
+    // measureTime().
+ measureTime(histogramType, time);
+ }
+
+ // Resets all ticker and histogram stats
+ virtual Status Reset() { return Status::NotSupported("Not implemented"); }
+
+#ifndef ROCKSDB_LITE
+ using Customizable::ToString;
+#endif // ROCKSDB_LITE
+ // String representation of the statistic object. Must be thread-safe.
+ virtual std::string ToString() const {
+ // Do nothing by default
+ return std::string("ToString(): not implemented");
+ }
+
+ virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
+ // Do nothing by default
+ return false;
+ }
+
+ // Override this function to disable particular histogram collection
+ virtual bool HistEnabledForType(uint32_t type) const {
+ return type < HISTOGRAM_ENUM_MAX;
+ }
+ void set_stats_level(StatsLevel sl) {
+ stats_level_.store(sl, std::memory_order_relaxed);
+ }
+ StatsLevel get_stats_level() const {
+ return stats_level_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h
new file mode 100644
index 000000000..57e469295
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/stats_history.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// StatsHistoryIterator is the main interface for users to programmatically
+// access statistics snapshots that were automatically stored by RocksDB.
+// Depending on options, the stats can be in memory or on disk.
+// The stats snapshots are indexed by the time at which they were recorded,
+// and each snapshot contains the individual stat names and values at the
+// time of recording.
+// Example:
+// std::unique_ptr<StatsHistoryIterator> stats_iter;
+// Status s = db->GetStatsHistory(0 /* start_time */,
+// env->NowMicros() /* end_time*/,
+// &stats_iter);
+// if (s.ok()) {
+// for (; stats_iter->Valid(); stats_iter->Next()) {
+// uint64_t stats_time = stats_iter->GetStatsTime();
+// const std::map<std::string, uint64_t>& stats_map =
+// stats_iter->GetStatsMap();
+// process(stats_time, stats_map);
+// }
+// }
+class StatsHistoryIterator {
+ public:
+ StatsHistoryIterator() {}
+ virtual ~StatsHistoryIterator() {}
+
+ virtual bool Valid() const = 0;
+
+ // Moves to the next stats history record. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+  // Return the time stamp (in seconds) at which the stats history was
+  // recorded.
+ // REQUIRES: Valid()
+ virtual uint64_t GetStatsTime() const = 0;
+
+ // DEPRECATED (was never used)
+ virtual int GetFormatVersion() const { return -1; }
+
+  // Return the current stats history as an std::map, which specifies the
+  // mapping from stats name to stats value. The underlying storage
+ // for the returned map is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ virtual Status status() const = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
new file mode 100644
index 000000000..1ab3dc4cb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -0,0 +1,570 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#pragma once
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include <memory>
+#include <string>
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include "port/stack_trace.h"
+#endif
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status {
+ public:
+ // Create a success status.
+ Status()
+ : code_(kOk),
+ subcode_(kNone),
+ sev_(kNoError),
+ retryable_(false),
+ data_loss_(false),
+ scope_(0),
+ state_(nullptr) {}
+ ~Status() {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (!checked_) {
+ fprintf(stderr, "Failed to check Status %p\n", this);
+ port::PrintStack();
+ std::abort();
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ }
+
+ // Copy the specified status.
+ Status(const Status& s);
+ Status& operator=(const Status& s);
+ Status(Status&& s) noexcept;
+ Status& operator=(Status&& s) noexcept;
+ bool operator==(const Status& rhs) const;
+ bool operator!=(const Status& rhs) const;
+
+  // When intentionally swallowing an error, the user must explicitly call
+  // this function. That way we can easily search the code to find where
+  // error swallowing occurs.
+ inline void PermitUncheckedError() const { MarkChecked(); }
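+
+  // Example (illustrative; assumes an open DB handle `db`):
+  //   Status s = db->Close();
+  //   s.PermitUncheckedError();  // error intentionally swallowed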
+
+ inline void MustCheck() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ checked_ = false;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ }
+
+ enum Code : unsigned char {
+ kOk = 0,
+ kNotFound = 1,
+ kCorruption = 2,
+ kNotSupported = 3,
+ kInvalidArgument = 4,
+ kIOError = 5,
+ kMergeInProgress = 6,
+ kIncomplete = 7,
+ kShutdownInProgress = 8,
+ kTimedOut = 9,
+ kAborted = 10,
+ kBusy = 11,
+ kExpired = 12,
+ kTryAgain = 13,
+ kCompactionTooLarge = 14,
+ kColumnFamilyDropped = 15,
+ kMaxCode
+ };
+
+ Code code() const {
+ MarkChecked();
+ return code_;
+ }
+
+ enum SubCode : unsigned char {
+ kNone = 0,
+ kMutexTimeout = 1,
+ kLockTimeout = 2,
+ kLockLimit = 3,
+ kNoSpace = 4,
+ kDeadlock = 5,
+ kStaleFile = 6,
+ kMemoryLimit = 7,
+ kSpaceLimit = 8,
+ kPathNotFound = 9,
+ KMergeOperandsInsufficientCapacity = 10,
+ kManualCompactionPaused = 11,
+ kOverwritten = 12,
+ kTxnNotPrepared = 13,
+ kIOFenced = 14,
+ kMaxSubCode
+ };
+
+ SubCode subcode() const {
+ MarkChecked();
+ return subcode_;
+ }
+
+ enum Severity : unsigned char {
+ kNoError = 0,
+ kSoftError = 1,
+ kHardError = 2,
+ kFatalError = 3,
+ kUnrecoverableError = 4,
+ kMaxSeverity
+ };
+
+ Status(const Status& s, Severity sev);
+
+ Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg)
+ : Status(_code, _subcode, msg, "", _sev) {}
+
+ Severity severity() const {
+ MarkChecked();
+ return sev_;
+ }
+
+ // Returns a C style string indicating the message of the Status
+ const char* getState() const {
+ MarkChecked();
+ return state_.get();
+ }
+
+ // Return a success status.
+ static Status OK() { return Status(); }
+
+ // Successful, though an existing something was overwritten
+ // Note: using variants of OK status for program logic is discouraged,
+ // but it can be useful for communicating statistical information without
+ // changing public APIs.
+ static Status OkOverwritten() { return Status(kOk, kOverwritten); }
+
+ // Return error status of an appropriate type.
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotFound, msg, msg2);
+ }
+
+  // Fast path for NotFound without malloc.
+ static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); }
+
+ static Status NotFound(SubCode sc, const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kNotFound, sc, msg, msg2);
+ }
+
+ static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kCorruption, msg, msg2);
+ }
+ static Status Corruption(SubCode msg = kNone) {
+ return Status(kCorruption, msg);
+ }
+
+ static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotSupported, msg, msg2);
+ }
+ static Status NotSupported(SubCode msg = kNone) {
+ return Status(kNotSupported, msg);
+ }
+
+ static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kInvalidArgument, msg, msg2);
+ }
+ static Status InvalidArgument(SubCode msg = kNone) {
+ return Status(kInvalidArgument, msg);
+ }
+
+ static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, msg, msg2);
+ }
+ static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); }
+
+ static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kMergeInProgress, msg, msg2);
+ }
+ static Status MergeInProgress(SubCode msg = kNone) {
+ return Status(kMergeInProgress, msg);
+ }
+
+ static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIncomplete, msg, msg2);
+ }
+ static Status Incomplete(SubCode msg = kNone) {
+ return Status(kIncomplete, msg);
+ }
+
+ static Status ShutdownInProgress(SubCode msg = kNone) {
+ return Status(kShutdownInProgress, msg);
+ }
+ static Status ShutdownInProgress(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kShutdownInProgress, msg, msg2);
+ }
+ static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); }
+ static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kAborted, msg, msg2);
+ }
+
+ static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); }
+ static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kBusy, msg, msg2);
+ }
+
+ static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); }
+ static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kTimedOut, msg, msg2);
+ }
+
+ static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); }
+ static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kExpired, msg, msg2);
+ }
+
+ static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); }
+ static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kTryAgain, msg, msg2);
+ }
+
+ static Status CompactionTooLarge(SubCode msg = kNone) {
+ return Status(kCompactionTooLarge, msg);
+ }
+ static Status CompactionTooLarge(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kCompactionTooLarge, msg, msg2);
+ }
+
+ static Status ColumnFamilyDropped(SubCode msg = kNone) {
+ return Status(kColumnFamilyDropped, msg);
+ }
+
+ static Status ColumnFamilyDropped(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kColumnFamilyDropped, msg, msg2);
+ }
+
+ static Status NoSpace() { return Status(kIOError, kNoSpace); }
+ static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kNoSpace, msg, msg2);
+ }
+
+ static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); }
+ static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kAborted, kMemoryLimit, msg, msg2);
+ }
+
+ static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); }
+ static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kSpaceLimit, msg, msg2);
+ }
+
+ static Status PathNotFound() { return Status(kIOError, kPathNotFound); }
+ static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kPathNotFound, msg, msg2);
+ }
+
+ static Status TxnNotPrepared() {
+ return Status(kInvalidArgument, kTxnNotPrepared);
+ }
+ static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2);
+ }
+
+ // Returns true iff the status indicates success.
+ bool ok() const {
+ MarkChecked();
+ return code() == kOk;
+ }
+
+ // Returns true iff the status indicates success *with* something
+ // overwritten
+ bool IsOkOverwritten() const {
+ MarkChecked();
+ return code() == kOk && subcode() == kOverwritten;
+ }
+
+ // Returns true iff the status indicates a NotFound error.
+ bool IsNotFound() const {
+ MarkChecked();
+ return code() == kNotFound;
+ }
+
+ // Returns true iff the status indicates a Corruption error.
+ bool IsCorruption() const {
+ MarkChecked();
+ return code() == kCorruption;
+ }
+
+ // Returns true iff the status indicates a NotSupported error.
+ bool IsNotSupported() const {
+ MarkChecked();
+ return code() == kNotSupported;
+ }
+
+ // Returns true iff the status indicates an InvalidArgument error.
+ bool IsInvalidArgument() const {
+ MarkChecked();
+ return code() == kInvalidArgument;
+ }
+
+ // Returns true iff the status indicates an IOError.
+ bool IsIOError() const {
+ MarkChecked();
+ return code() == kIOError;
+ }
+
+  // Returns true iff the status indicates a MergeInProgress error.
+ bool IsMergeInProgress() const {
+ MarkChecked();
+ return code() == kMergeInProgress;
+ }
+
+ // Returns true iff the status indicates Incomplete
+ bool IsIncomplete() const {
+ MarkChecked();
+ return code() == kIncomplete;
+ }
+
+  // Returns true iff the status indicates a shutdown is in progress.
+ bool IsShutdownInProgress() const {
+ MarkChecked();
+ return code() == kShutdownInProgress;
+ }
+
+ bool IsTimedOut() const {
+ MarkChecked();
+ return code() == kTimedOut;
+ }
+
+ bool IsAborted() const {
+ MarkChecked();
+ return code() == kAborted;
+ }
+
+ bool IsLockLimit() const {
+ MarkChecked();
+ return code() == kAborted && subcode() == kLockLimit;
+ }
+
+ // Returns true iff the status indicates that a resource is Busy and
+ // temporarily could not be acquired.
+ bool IsBusy() const {
+ MarkChecked();
+ return code() == kBusy;
+ }
+
+ bool IsDeadlock() const {
+ MarkChecked();
+ return code() == kBusy && subcode() == kDeadlock;
+ }
+
+  // Returns true iff the status indicates that the operation has expired.
+ bool IsExpired() const {
+ MarkChecked();
+ return code() == kExpired;
+ }
+
+ // Returns true iff the status indicates a TryAgain error.
+ // This usually means that the operation failed, but may succeed if
+ // re-attempted.
+ bool IsTryAgain() const {
+ MarkChecked();
+ return code() == kTryAgain;
+ }
+
+ // Returns true iff the status indicates the proposed compaction is too large
+ bool IsCompactionTooLarge() const {
+ MarkChecked();
+ return code() == kCompactionTooLarge;
+ }
+
+ // Returns true iff the status indicates Column Family Dropped
+ bool IsColumnFamilyDropped() const {
+ MarkChecked();
+ return code() == kColumnFamilyDropped;
+ }
+
+  // Returns true iff the status indicates a NoSpace error.
+  // This is caused by an I/O error returning the specific "out of space"
+  // error condition. Strictly speaking, a NoSpace error is an I/O error
+  // with a specific subcode, enabling users to take the appropriate action
+  // if needed.
+ bool IsNoSpace() const {
+ MarkChecked();
+ return (code() == kIOError) && (subcode() == kNoSpace);
+ }
+
+ // Returns true iff the status indicates a memory limit error. There may be
+ // cases where we limit the memory used in certain operations (eg. the size
+ // of a write batch) in order to avoid out of memory exceptions.
+ bool IsMemoryLimit() const {
+ MarkChecked();
+ return (code() == kAborted) && (subcode() == kMemoryLimit);
+ }
+
+ // Returns true iff the status indicates a PathNotFound error
+ // This is caused by an I/O error returning the specific "no such file or
+ // directory" error condition. A PathNotFound error is an I/O error with
+ // a specific subcode, enabling users to take appropriate action if necessary
+ bool IsPathNotFound() const {
+ MarkChecked();
+ return (code() == kIOError || code() == kNotFound) &&
+ (subcode() == kPathNotFound);
+ }
+
+ // Returns true iff the status indicates manual compaction paused. This
+ // is caused by a call to PauseManualCompaction
+ bool IsManualCompactionPaused() const {
+ MarkChecked();
+ return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
+ }
+
+ // Returns true iff the status indicates a TxnNotPrepared error.
+ bool IsTxnNotPrepared() const {
+ MarkChecked();
+ return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared);
+ }
+
+  // Returns true iff the status indicates an IOFenced error.
+ bool IsIOFenced() const {
+ MarkChecked();
+ return (code() == kIOError) && (subcode() == kIOFenced);
+ }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ std::string ToString() const;
+
+ protected:
+ Code code_;
+ SubCode subcode_;
+ Severity sev_;
+ bool retryable_;
+ bool data_loss_;
+ unsigned char scope_;
+ // A nullptr state_ (which is at least the case for OK) means the extra
+ // message is empty.
+ std::unique_ptr<const char[]> state_;
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ mutable bool checked_ = false;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+
+ explicit Status(Code _code, SubCode _subcode = kNone)
+ : code_(_code),
+ subcode_(_subcode),
+ sev_(kNoError),
+ retryable_(false),
+ data_loss_(false),
+ scope_(0) {}
+
+ explicit Status(Code _code, SubCode _subcode, bool retryable, bool data_loss,
+ unsigned char scope)
+ : code_(_code),
+ subcode_(_subcode),
+ sev_(kNoError),
+ retryable_(retryable),
+ data_loss_(data_loss),
+ scope_(scope) {}
+
+ Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2,
+ Severity sev = kNoError);
+ Status(Code _code, const Slice& msg, const Slice& msg2)
+ : Status(_code, kNone, msg, msg2) {}
+
+ static std::unique_ptr<const char[]> CopyState(const char* s);
+
+ inline void MarkChecked() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ checked_ = true;
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ }
+};
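+
+// Example (an illustrative sketch; `db`, `key`, and `value` are assumed to
+// exist):
+//   Status s = db->Get(ReadOptions(), key, &value);
+//   if (s.IsNotFound()) {
+//     // absent key: often expected, not an error
+//   } else if (!s.ok()) {
+//     fprintf(stderr, "Get failed: %s\n", s.ToString().c_str());
+//   }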
+
+inline Status::Status(const Status& s)
+ : code_(s.code_),
+ subcode_(s.subcode_),
+ sev_(s.sev_),
+ retryable_(s.retryable_),
+ data_loss_(s.data_loss_),
+ scope_(s.scope_) {
+ s.MarkChecked();
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status::Status(const Status& s, Severity sev)
+ : code_(s.code_),
+ subcode_(s.subcode_),
+ sev_(sev),
+ retryable_(s.retryable_),
+ data_loss_(s.data_loss_),
+ scope_(s.scope_) {
+ s.MarkChecked();
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status& Status::operator=(const Status& s) {
+ if (this != &s) {
+ s.MarkChecked();
+ MustCheck();
+ code_ = s.code_;
+ subcode_ = s.subcode_;
+ sev_ = s.sev_;
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+ }
+ return *this;
+}
+
+inline Status::Status(Status&& s) noexcept : Status() {
+ s.MarkChecked();
+ *this = std::move(s);
+}
+
+inline Status& Status::operator=(Status&& s) noexcept {
+ if (this != &s) {
+ s.MarkChecked();
+ MustCheck();
+ code_ = std::move(s.code_);
+ s.code_ = kOk;
+ subcode_ = std::move(s.subcode_);
+ s.subcode_ = kNone;
+ sev_ = std::move(s.sev_);
+ s.sev_ = kNoError;
+ retryable_ = std::move(s.retryable_);
+ s.retryable_ = false;
+ data_loss_ = std::move(s.data_loss_);
+ s.data_loss_ = false;
+ scope_ = std::move(s.scope_);
+ s.scope_ = 0;
+ state_ = std::move(s.state_);
+ }
+ return *this;
+}
+
+inline bool Status::operator==(const Status& rhs) const {
+ MarkChecked();
+ rhs.MarkChecked();
+ return (code_ == rhs.code_);
+}
+
+inline bool Status::operator!=(const Status& rhs) const {
+ MarkChecked();
+ rhs.MarkChecked();
+ return !(*this == rhs);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/system_clock.h b/src/rocksdb/include/rocksdb/system_clock.h
new file mode 100644
index 000000000..486183d60
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/system_clock.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+
+// A SystemClock is an interface used by the rocksdb implementation to access
+// operating system time-related functionality.
+class SystemClock : public Customizable {
+ public:
+ ~SystemClock() override {}
+
+ static const char* Type() { return "SystemClock"; }
+ static Status CreateFromString(const ConfigOptions& options,
+ const std::string& value,
+ std::shared_ptr<SystemClock>* result);
+ // The name of this system clock
+ virtual const char* Name() const override = 0;
+
+ // The name/nickname for the Default SystemClock. This name can be used
+ // to determine if the clock is the default one.
+ static const char* kDefaultName() { return "DefaultClock"; }
+
+ // Return a default SystemClock suitable for the current operating
+ // system.
+ static const std::shared_ptr<SystemClock>& Default();
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time, e.g. in GenericRateLimiter and other
+  // places, so a port needs to return system time in order to work.
+ virtual uint64_t NowMicros() = 0;
+
+ // Returns the number of nano-seconds since some fixed point in time. Only
+ // useful for computing deltas of time in one run.
+ // Default implementation simply relies on NowMicros.
+ // In platform-specific implementations, NowNanos() should return time points
+ // that are MONOTONIC.
+ virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+ // Returns the number of micro-seconds of CPU time used by the current thread.
+ // 0 indicates not supported.
+ virtual uint64_t CPUMicros() { return 0; }
+
+ // Returns the number of nano-seconds of CPU time used by the current thread.
+ // Default implementation simply relies on CPUMicros.
+ // 0 indicates not supported.
+ virtual uint64_t CPUNanos() { return CPUMicros() * 1000; }
+
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
+ virtual void SleepForMicroseconds(int micros) = 0;
+
+ // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+ // Only overwrites *unix_time on success.
+ virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+ // Converts seconds-since-Jan-01-1970 to a printable string
+ virtual std::string TimeToString(uint64_t time) = 0;
+};
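+
+// Example (an illustrative sketch): measuring elapsed wall-clock time with
+// the default clock.
+//   const std::shared_ptr<SystemClock>& clock = SystemClock::Default();
+//   uint64_t start_us = clock->NowMicros();
+//   // ... do some work ...
+//   uint64_t elapsed_us = clock->NowMicros() - start_us;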
+
+// Wrapper class for a SystemClock. Redirects all methods (except Name)
+// of the SystemClock interface to the target/wrapped class.
+class SystemClockWrapper : public SystemClock {
+ public:
+ explicit SystemClockWrapper(const std::shared_ptr<SystemClock>& t);
+
+ uint64_t NowMicros() override { return target_->NowMicros(); }
+
+ uint64_t NowNanos() override { return target_->NowNanos(); }
+
+ uint64_t CPUMicros() override { return target_->CPUMicros(); }
+
+ uint64_t CPUNanos() override { return target_->CPUNanos(); }
+
+ virtual void SleepForMicroseconds(int micros) override {
+ return target_->SleepForMicroseconds(micros);
+ }
+
+ Status GetCurrentTime(int64_t* unix_time) override {
+ return target_->GetCurrentTime(unix_time);
+ }
+
+ std::string TimeToString(uint64_t time) override {
+ return target_->TimeToString(time);
+ }
+
+ Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+ std::string SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const override;
+#endif // ROCKSDB_LITE
+ const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+ std::shared_ptr<SystemClock> target_;
+};
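+
+// Example (an illustrative sketch, not part of RocksDB): a wrapper only
+// needs to override the methods it changes, e.g. a fixed clock for tests.
+//   class FixedTimeClock : public SystemClockWrapper {
+//    public:
+//     explicit FixedTimeClock(const std::shared_ptr<SystemClock>& t)
+//         : SystemClockWrapper(t) {}
+//     const char* Name() const override { return "FixedTimeClock"; }
+//     uint64_t NowMicros() override { return 42; }
+//   };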
+
+} // end namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
new file mode 100644
index 000000000..3a2bf2629
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -0,0 +1,940 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+// LevelDB, which was designed for storing data in hard disk or flash
+// device.
+// 2. Plain table: one of RocksDB's SST file formats, optimized
+// for low query latency on pure-memory or very low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Block-based Table
+class Cache;
+class FilterPolicy;
+class FlushBlockPolicyFactory;
+class PersistentCache;
+class RandomAccessFile;
+struct TableReaderOptions;
+struct TableBuilderOptions;
+class TableBuilder;
+class TableFactory;
+class TableReader;
+class WritableFileWriter;
+struct ConfigOptions;
+struct EnvOptions;
+
+// Types of checksums to use for checking integrity of logical blocks within
+// files. All checksums currently use 32 bits of checking power (1 in 4B
+// chance of failing to detect random corruption).
+enum ChecksumType : char {
+ kNoChecksum = 0x0,
+ kCRC32c = 0x1,
+ kxxHash = 0x2,
+ kxxHash64 = 0x3,
+ kXXH3 = 0x4, // Supported since RocksDB 6.27
+};
+
+// `PinningTier` is used to specify which tier of block-based tables should
+// be affected by a block cache pinning setting (see
+// `MetadataCacheOptions` below).
+enum class PinningTier {
+ // For compatibility, this value specifies to fallback to the behavior
+ // indicated by the deprecated options,
+ // `pin_l0_filter_and_index_blocks_in_cache` and
+ // `pin_top_level_index_and_filter`.
+ kFallback,
+
+ // This tier contains no block-based tables.
+ kNone,
+
+ // This tier contains block-based tables that may have originated from a
+ // memtable flush. In particular, it includes tables from L0 that are smaller
+ // than 1.5 times the current `write_buffer_size`. Note these criteria imply
+ // it can include intra-L0 compaction outputs and ingested files, as long as
+ // they are not abnormally large compared to flushed files in L0.
+ kFlushedAndSimilar,
+
+ // This tier contains all block-based tables.
+ kAll,
+};
+
+// `MetadataCacheOptions` contains members indicating the desired caching
+// behavior for the different categories of metadata blocks.
+struct MetadataCacheOptions {
+ // The tier of block-based tables whose top-level index into metadata
+ // partitions will be pinned. Currently indexes and filters may be
+ // partitioned.
+ //
+ // Note `cache_index_and_filter_blocks` must be true for this option to have
+ // any effect. Otherwise any top-level index into metadata partitions would be
+ // held in table reader memory, outside the block cache.
+ PinningTier top_level_index_pinning = PinningTier::kFallback;
+
+ // The tier of block-based tables whose metadata partitions will be pinned.
+ // Currently indexes and filters may be partitioned.
+ PinningTier partition_pinning = PinningTier::kFallback;
+
+ // The tier of block-based tables whose unpartitioned metadata blocks will be
+ // pinned.
+ //
+ // Note `cache_index_and_filter_blocks` must be true for this option to have
+ // any effect. Otherwise the unpartitioned meta-blocks would be held in table
+ // reader memory, outside the block cache.
+ PinningTier unpartitioned_pinning = PinningTier::kFallback;
+};
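+
+// Example (an illustrative sketch): pin all top-level index blocks, and pin
+// metadata partitions only for flush-like files.
+//   BlockBasedTableOptions bbto;
+//   bbto.cache_index_and_filter_blocks = true;
+//   bbto.metadata_cache_options.top_level_index_pinning = PinningTier::kAll;
+//   bbto.metadata_cache_options.partition_pinning =
+//       PinningTier::kFlushedAndSimilar;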
+
+struct CacheEntryRoleOptions {
+ enum class Decision {
+ kEnabled,
+ kDisabled,
+ kFallback,
+ };
+ Decision charged = Decision::kFallback;
+ bool operator==(const CacheEntryRoleOptions& other) const {
+ return charged == other.charged;
+ }
+};
+
+struct CacheUsageOptions {
+ CacheEntryRoleOptions options;
+ std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+// For advanced user only
+struct BlockBasedTableOptions {
+ static const char* kName() { return "BlockTableOptions"; };
+  // @flush_block_policy_factory creates instances of the flush block policy,
+  // which provides a configurable way to determine when to flush a block in
+  // the block-based tables. If not set, the table builder will use the
+  // default block flush policy, which cuts blocks by block size (please
+  // refer to `FlushBlockBySizePolicy`).
+ std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+ // TODO(kailiu) Temporarily disable this feature by making the default value
+ // to be false.
+ //
+ // TODO(ajkr) we need to update names of variables controlling meta-block
+ // caching as they should now apply to range tombstone and compression
+ // dictionary meta-blocks, in addition to index and filter meta-blocks.
+ //
+ // Whether to put index/filter blocks in the block cache. When false,
+ // each "table reader" object will pre-load index/filter blocks during
+ // table initialization. Index and filter partition blocks always use
+ // block cache regardless of this option.
+ bool cache_index_and_filter_blocks = false;
+
+ // If cache_index_and_filter_blocks is enabled, cache index and filter
+ // blocks with high priority. If set to true, depending on implementation of
+ // block cache, index, filter, and other metadata blocks may be less likely
+ // to be evicted than data blocks.
+ bool cache_index_and_filter_blocks_with_high_priority = true;
+
+ // DEPRECATED: This option will be removed in a future version. For now, this
+ // option still takes effect by updating each of the following variables that
+ // has the default value, `PinningTier::kFallback`:
+ //
+ // - `MetadataCacheOptions::partition_pinning`
+ // - `MetadataCacheOptions::unpartitioned_pinning`
+ //
+ // The updated value is chosen as follows:
+ //
+ // - `pin_l0_filter_and_index_blocks_in_cache == false` ->
+ // `PinningTier::kNone`
+ // - `pin_l0_filter_and_index_blocks_in_cache == true` ->
+ // `PinningTier::kFlushedAndSimilar`
+ //
+ // To migrate away from this flag, explicitly configure
+ // `MetadataCacheOptions` as described above.
+ //
+ // if cache_index_and_filter_blocks is true and the below is true, then
+ // filter and index blocks are stored in the cache, but a reference is
+ // held in the "table reader" object so the blocks are pinned and only
+ // evicted from cache when the table reader is freed.
+ bool pin_l0_filter_and_index_blocks_in_cache = false;
+
+ // DEPRECATED: This option will be removed in a future version. For now, this
+ // option still takes effect by updating
+ // `MetadataCacheOptions::top_level_index_pinning` when it has the
+ // default value, `PinningTier::kFallback`.
+ //
+ // The updated value is chosen as follows:
+ //
+ // - `pin_top_level_index_and_filter == false` ->
+ // `PinningTier::kNone`
+ // - `pin_top_level_index_and_filter == true` ->
+ // `PinningTier::kAll`
+ //
+ // To migrate away from this flag, explicitly configure
+ // `MetadataCacheOptions` as described above.
+ //
+ // If cache_index_and_filter_blocks is true and the below is true, then
+ // the top-level index of partitioned filter and index blocks are stored in
+ // the cache, but a reference is held in the "table reader" object so the
+ // blocks are pinned and only evicted from cache when the table reader is
+ // freed. This is not limited to l0 in LSM tree.
+ bool pin_top_level_index_and_filter = true;
+
+ // The desired block cache pinning behavior for the different categories of
+ // metadata blocks. While pinning can reduce block cache contention, users
+ // must take care not to pin excessive amounts of data, which risks
+ // overflowing block cache.
+ MetadataCacheOptions metadata_cache_options;
+
+ // The index type that will be used for this table.
+ enum IndexType : char {
+ // A space efficient index block that is optimized for
+ // binary-search-based index.
+ kBinarySearch = 0x00,
+
+ // The hash index, if enabled, will do the hash lookup when
+ // `Options.prefix_extractor` is provided.
+ kHashSearch = 0x01,
+
+ // A two-level index implementation. Both levels are binary search indexes.
+ // Second level index blocks ("partitions") use block cache even when
+ // cache_index_and_filter_blocks=false.
+ kTwoLevelIndexSearch = 0x02,
+
+ // Like kBinarySearch, but index also contains first key of each block.
+ // This allows iterators to defer reading the block until it's actually
+ // needed. May significantly reduce read amplification of short range scans.
+ // Without it, iterator seek usually reads one block from each level-0 file
+ // and from each level, which may be expensive.
+ // Works best in combination with:
+ // - IndexShorteningMode::kNoShortening,
+ // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+ // e.g. when prefix changes.
+ // Makes the index significantly bigger (2x or more), especially when keys
+ // are long.
+ kBinarySearchWithFirstKey = 0x03,
+ };
+
+ IndexType index_type = kBinarySearch;
+
+ // The index type that will be used for the data block.
+ enum DataBlockIndexType : char {
+ kDataBlockBinarySearch = 0, // traditional block type
+ kDataBlockBinaryAndHash = 1, // additional hash index
+ };
+
+ DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+  // #entries/#buckets. It is valid only when data_block_index_type is
+  // kDataBlockBinaryAndHash.
+ double data_block_hash_table_util_ratio = 0.75;
+
+ // Option hash_index_allow_collision is now deleted.
+ // It will behave as if hash_index_allow_collision=true.
+
+ // Use the specified checksum type. Newly created table files will be
+ // protected with this checksum type. Old table files will still be readable,
+ // even though they have different checksum type.
+ ChecksumType checksum = kXXH3;
+
+ // Disable block cache. If this is set to true,
+ // then no block cache should be used, and the block_cache should
+ // point to a nullptr object.
+ bool no_block_cache = false;
+
+ // If non-NULL use the specified cache for blocks.
+ // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+ std::shared_ptr<Cache> block_cache = nullptr;
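+
+  // Example (illustrative; assumes an Options object named `options`):
+  //   BlockBasedTableOptions bbto;
+  //   bbto.block_cache = NewLRUCache(512 << 20 /* 512MB */);
+  //   options.table_factory.reset(NewBlockBasedTableFactory(bbto));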
+
+  // If non-NULL, use the specified cache for pages read from device.
+  // If NULL, no page cache is used.
+ std::shared_ptr<PersistentCache> persistent_cache = nullptr;
+
+ // DEPRECATED: This feature is planned for removal in a future release.
+ // Use SecondaryCache instead.
+ //
+ // If non-NULL use the specified cache for compressed blocks.
+ // If NULL, rocksdb will not use a compressed block cache.
+ // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
+ // same type of object there.
+ std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ uint64_t block_size = 4 * 1024;
+
+ // This is used to close a block before it reaches the configured
+ // 'block_size'. If the percentage of free space in the current block is less
+ // than this specified number and adding a new record to the block will
+ // exceed the configured block size, then this block will be closed and the
+ // new record will be written to the next block.
+ int block_size_deviation = 10;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone. The minimum value allowed is 1. Any smaller
+ // value will be silently overwritten with 1.
+ int block_restart_interval = 16;
+
+ // Same as block_restart_interval but used for the index block.
+ int index_block_restart_interval = 1;
+
+ // Block size for partitioned metadata. Currently applied to indexes when
+ // kTwoLevelIndexSearch is used and to filters when partition_filters is used.
+ // Note: Since in the current implementation the filters and index partitions
+ // are aligned, an index/filter block is created when either index or filter
+ // block size reaches the specified limit.
+ // Note: this limit is currently applied to only index blocks; a filter
+ // partition is cut right after an index block is cut
+ // TODO(myabandeh): remove the note above when filter partitions are cut
+ // separately
+ uint64_t metadata_block_size = 4096;
+
+ // `cache_usage_options` allows users to specify the default
+ // options (`cache_usage_options.options`) and the overriding
+ // options (`cache_usage_options.options_overrides`)
+ // for different `CacheEntryRole` under various features related to cache
+ // usage.
+ //
+ // For a certain `CacheEntryRole role` and a certain feature `f` of
+ // `CacheEntryRoleOptions`:
+ // 1. If `options_overrides` has an entry for `role` and
+ // `options_overrides[role].f != kFallback`, we use
+ // `options_overrides[role].f`
+ // 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
+ // 3. Otherwise, we follow the compatible existing behavior for `f` (see
+ // each feature's comment for more)
+ //
+ // `cache_usage_options` currently supports specifying options for the
+ // following features:
+ //
+  // 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
+  // Memory charging is a feature that accounts the memory usage of a specific
+  // area (represented by `CacheEntryRole`) toward usage in the block cache
+  // (if available), by updating a dynamic charge to the block cache loosely
+  // based on the actual memory usage of that area.
+ //
+ // (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
+ // (i) If kEnabled:
+ // Charge memory usage of the buffered data used as training samples for
+ // dictionary compression.
+  // If such memory usage exceeds the available space left in the block cache
+  // at some point (i.e., causing a cache full under
+  // `LRUCacheOptions::strict_capacity_limit` = true), the data will then be
+  // unbuffered.
+ // (ii) If kDisabled:
+ // Does not charge the memory usage mentioned above.
+ // (iii) Compatible existing behavior:
+ // Same as kEnabled.
+ //
+ // (b) CacheEntryRole::kFilterConstruction
+ // (i) If kEnabled:
+ // Charge memory usage of Bloom Filter
+ // (format_version >= 5) and Ribbon Filter construction.
+  // If the additional temporary memory of the Ribbon Filter exceeds the
+  // available space left in the block cache at some point (i.e., causing a
+  // cache full under `LRUCacheOptions::strict_capacity_limit` = true),
+  // construction will fall back to Bloom Filter.
+ // (ii) If kDisabled:
+ // Does not charge the memory usage mentioned above.
+ // (iii) Compatible existing behavior:
+ // Same as kDisabled.
+ //
+ // (c) CacheEntryRole::kBlockBasedTableReader
+ // (i) If kEnabled:
+ // Charge memory usage of table properties +
+ // index block/filter block/uncompression dictionary (when stored in table
+ // reader i.e, BlockBasedTableOptions::cache_index_and_filter_blocks ==
+ // false) + some internal data structures during table reader creation.
+ // If such a table reader exceeds
+  // the available space left in the block cache at some point (i.e., causing
+ // a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
+ // creation will fail with Status::MemoryLimit().
+ // (ii) If kDisabled:
+ // Does not charge the memory usage mentioned above.
+ // (iii) Compatible existing behavior:
+ // Same as kDisabled.
+ //
+ // (d) CacheEntryRole::kFileMetadata
+ // (i) If kEnabled:
+ // Charge memory usage of file metadata. RocksDB holds one file metadata
+ // structure in-memory per on-disk table file.
+  // If such file metadata's
+  // memory exceeds the available space left in the block cache at some point
+  // (i.e., causing a cache full under
+  // `LRUCacheOptions::strict_capacity_limit` = true), creation will fail
+  // with Status::MemoryLimit().
+ // (ii) If kDisabled:
+ // Does not charge the memory usage mentioned above.
+ // (iii) Compatible existing behavior:
+ // Same as kDisabled.
+ //
+ // (e) Other CacheEntryRole
+ // Not supported.
+ // `Status::kNotSupported` will be returned if
+ // `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
+ //
+ //
+ // 2. More to come ...
+ //
+ CacheUsageOptions cache_usage_options;
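+
+  // Example (an illustrative sketch; `bbto` is an assumed
+  // BlockBasedTableOptions object): charge filter construction memory to the
+  // block cache, overriding only that one role.
+  //   CacheEntryRoleOptions role_options;
+  //   role_options.charged = CacheEntryRoleOptions::Decision::kEnabled;
+  //   bbto.cache_usage_options.options_overrides.insert(
+  //       {CacheEntryRole::kFilterConstruction, role_options});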
+
+ // Note: currently this option requires kTwoLevelIndexSearch to be set as
+ // well.
+ // TODO(myabandeh): remove the note above once the limitation is lifted
+ // Use partitioned full filters for each SST file. This option is
+ // incompatible with block-based filters. Filter partition blocks use
+ // block cache even when cache_index_and_filter_blocks=false.
+ bool partition_filters = false;
+
+ // Option to generate Bloom/Ribbon filters that minimize memory
+ // internal fragmentation.
+ //
+ // When false, malloc_usable_size is not available, or format_version < 5,
+ // filters are generated without regard to internal fragmentation when
+ // loaded into memory (historical behavior). When true (and
+ // malloc_usable_size is available and format_version >= 5), then
+ // filters are generated to "round up" and "round down" their sizes to
+ // minimize internal fragmentation when loaded into memory, assuming the
+ // reading DB has the same memory allocation characteristics as the
+ // generating DB. This option does not break forward or backward
+ // compatibility.
+ //
+ // While individual filters will vary in bits/key and false positive rate
+ // when setting is true, the implementation attempts to maintain a weighted
+ // average FP rate for filters consistent with this option set to false.
+ //
+ // With Jemalloc for example, this setting is expected to save about 10% of
+ // the memory footprint and block cache charge of filters, while increasing
+ // disk usage of filters by about 1-2% due to encoding efficiency losses
+ // with variance in bits/key.
+ //
+ // NOTE: Because some memory counted by block cache might be unmapped pages
+ // within internal fragmentation, this option can increase observed RSS
+  // memory usage. With cache_index_and_filter_blocks=true, this option makes
+  // the block cache better at using the space it is allowed. (These issues
+ // should not arise with partitioned filters.)
+ //
+ // NOTE: Do not set to true if you do not trust malloc_usable_size. With
+ // this option, RocksDB might access an allocated memory object beyond its
+ // original size if malloc_usable_size says it is safe to do so. While this
+ // can be considered bad practice, it should not produce undefined behavior
+ // unless malloc_usable_size is buggy or broken.
+ bool optimize_filters_for_memory = false;
+
+ // Use delta encoding to compress keys in blocks.
+ // ReadOptions::pin_data requires this option to be disabled.
+ //
+ // Default: true
+ bool use_delta_encoding = true;
+
+ // If non-nullptr, use the specified filter policy to reduce disk reads.
+ // Many applications will benefit from passing the result of
+ // NewBloomFilterPolicy() here.
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
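+
+  // Example (illustrative; `bbto` is an assumed BlockBasedTableOptions):
+  //   bbto.filter_policy.reset(NewBloomFilterPolicy(10 /* bits per key */));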
+
+ // If true, place whole keys in the filter (not just prefixes).
+ // This must generally be true for gets to be efficient.
+ bool whole_key_filtering = true;
+
+ // If true, detect corruption during Bloom Filter (format_version >= 5)
+ // and Ribbon Filter construction.
+ //
+ // This is an extra check that is only
+ // useful in detecting software bugs or CPU+memory malfunction.
+ // Turning on this feature increases filter construction time by 30%.
+ //
+ // This parameter can be changed dynamically by
+ // DB::SetOptions({{"block_based_table_factory",
+ // "{detect_filter_construct_corruption=true;}"}});
+ //
+ // TODO: optimize this performance
+ bool detect_filter_construct_corruption = false;
+
+ // Verify that decompressing the compressed block gives back the input. This
+ // is a verification mode that we use to detect bugs in compression
+ // algorithms.
+ bool verify_compression = false;
+
+ // If used, for every data block we load into memory, we will create a bitmap
+ // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
+ // will be used to figure out what percentage of each block we actually read.
+ //
+ // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
+ // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
+ // read amplification using this formula
+ // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+ //
+ // value => memory usage (percentage of loaded blocks memory)
+ // 1 => 12.50 %
+ // 2 => 06.25 %
+ // 4 => 03.12 %
+ // 8 => 01.56 %
+ // 16 => 00.78 %
+ //
+ // Note: This number must be a power of 2; if not, it will be sanitized
+ // to the next lowest power of 2. For example, a value of 7 will be
+ // treated as 4 and a value of 19 will be treated as 16.
+ //
+ // Default: 0 (disabled)
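+ //
+ // Example of computing the read amplification from the tickers above (an
+ // illustrative sketch, not from the original header; assumes
+ // `options.statistics` was set):
+ //
+ //   std::shared_ptr<Statistics> stats = options.statistics;
+ //   uint64_t useful = stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ //   uint64_t total = stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+ //   double read_amp =
+ //       useful > 0 ? static_cast<double>(total) / useful : 0.0;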
+ uint32_t read_amp_bytes_per_bit = 0;
+
+ // We currently have these versions:
+ // 0 -- This version can be read by very old versions of RocksDB. Doesn't
+ // support changing the checksum type (default is CRC32).
+ // 1 -- Can be read by RocksDB versions 3.0 and later. Supports non-default
+ // checksums, like xxHash. It is written by RocksDB when
+ // BlockBasedTableOptions::checksum is something other than kCRC32c. (Version
+ // 0 is silently upconverted.)
+ // 2 -- Can be read by RocksDB versions 3.10 and later. Changes the way we
+ // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
+ // don't plan to run RocksDB before version 3.10, you should probably use
+ // this.
+ // 3 -- Can be read by RocksDB versions 5.15 and later. Changes the way we
+ // encode the keys in index blocks. If you don't plan to run RocksDB before
+ // version 5.15, you should probably use this.
+ // This option only affects newly written tables. When reading existing
+ // tables, the version information is read from the footer.
+ // 4 -- Can be read by RocksDB versions 5.16 and later. Changes the way we
+ // encode the values in index blocks. If you don't plan to run RocksDB
+ // before version 5.16 and you are using index_block_restart_interval > 1,
+ // you should probably use this, as it would reduce the index size.
+ // This option only affects newly written tables. When reading existing
+ // tables, the version information is read from the footer.
+ // 5 -- Can be read by RocksDB versions 6.6.0 and later. Full and partitioned
+ // filters use a generally faster and more accurate Bloom filter
+ // implementation, with a different schema.
+ uint32_t format_version = 5;
+
+ // Store index blocks on disk in compressed format. Changing this option to
+ // false will avoid the overhead of decompression if index blocks are evicted
+ // and read back.
+ bool enable_index_compression = true;
+
+ // Align data blocks on the lesser of page size and block size.
+ bool block_align = false;
+
+ // This enum allows trading off increased index size for improved iterator
+ // seek performance in some situations, particularly when block cache is
+ // disabled (ReadOptions::fill_cache = false) and direct IO is
+ // enabled (DBOptions::use_direct_reads = true).
+ // The default mode is the best tradeoff for most use cases.
+ // This option only affects newly written tables.
+ //
+ // The index contains a key separating each pair of consecutive blocks.
+ // Let A be the highest key in one block, B the lowest key in the next block,
+ // and I the index entry separating these two blocks:
+ // [ ... A] I [B ...]
+ // I is allowed to be anywhere in [A, B).
+ // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
+ // first block, then immediately fall through to the second block.
+ // However, if I=A, this can't happen, and we'll read only the second block.
+ // In kNoShortening mode, we use I=A. In other modes, we use the shortest
+ // key in [A, B), which usually significantly reduces index size.
+ //
+ // There's a similar story for the last index entry, which is an upper bound
+ // of the highest key in the file. If it's shortened and therefore
+ // overestimated, the iterator is likely to unnecessarily read the last data
+ // block from each file on each seek.
+ enum class IndexShorteningMode : char {
+ // Use full keys.
+ kNoShortening,
+ // Shorten index keys between blocks, but use full key for the last index
+ // key, which is the upper bound of the whole file.
+ kShortenSeparators,
+ // Shorten both keys between blocks and key after last block.
+ kShortenSeparatorsAndSuccessor,
+ };
+
+ IndexShorteningMode index_shortening =
+ IndexShorteningMode::kShortenSeparators;
+
+ // RocksDB does auto-readahead for iterators when it notices more than two
+ // reads for a table file and the user doesn't provide a readahead_size. The
+ // readahead starts at BlockBasedTableOptions.initial_auto_readahead_size
+ // (default: 8 KB) and doubles on every additional read, up to
+ // max_auto_readahead_size, which can be configured.
+ //
+ // Special value: if max_auto_readahead_size is set to 0, implicit auto
+ // prefetching is disabled.
+ // If the max_auto_readahead_size provided is less
+ // than initial_auto_readahead_size, then RocksDB will sanitize
+ // initial_auto_readahead_size and set it to max_auto_readahead_size.
+ //
+ // The value should be provided in bytes, e.g. 256 * 1024 for 256 KB of
+ // prefetched blocks.
+ //
+ // Based on experiments, a 256 KB readahead size provides the best
+ // performance for auto readahead. Experiment data is in PR #3282.
+ //
+ // This parameter can be changed dynamically by
+ // DB::SetOptions({{"block_based_table_factory",
+ // "{max_auto_readahead_size=0;}"}});
+ //
+ // Changing the value dynamically will only affect files opened after the
+ // change.
+ //
+ // Default: 256 KB (256 * 1024).
+ size_t max_auto_readahead_size = 256 * 1024;
+
+ // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and
+ // filter blocks) which are already in memory into the block cache at the time
+ // of flush. During a flush, the data in memory (in memtables) gets flushed to
+ // the device. If using Direct IO, additional IO is incurred to read this
+ // data back into memory again, which is avoided by enabling this option. This
+ // further helps if the workload exhibits high temporal locality, where most
+ // of the reads go to recently written data. This also helps in the case of a
+ // distributed file system.
+ //
+ // This parameter can be changed dynamically by
+ // DB::SetOptions({{"block_based_table_factory",
+ // "{prepopulate_block_cache=kFlushOnly;}"}});
+ enum class PrepopulateBlockCache : char {
+ // Disable prepopulate block cache.
+ kDisable,
+ // Prepopulate blocks during flush only.
+ kFlushOnly,
+ };
+
+ PrepopulateBlockCache prepopulate_block_cache =
+ PrepopulateBlockCache::kDisable;
+
+ // RocksDB does auto-readahead for iterators when it notices more than two
+ // reads for a table file and the user doesn't provide a readahead_size. The
+ // readahead size starts at initial_auto_readahead_size and doubles on every
+ // additional read, up to BlockBasedTableOptions.max_auto_readahead_size,
+ // which can also be configured.
+ //
+ // Scenarios:
+ // - If initial_auto_readahead_size is set to 0, it disables the implicit
+ // auto prefetching irrespective of max_auto_readahead_size.
+ // - If max_auto_readahead_size is set to 0, it disables the internal
+ // prefetching irrespective of initial_auto_readahead_size.
+ // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB
+ // will sanitize the value of initial_auto_readahead_size to
+ // max_auto_readahead_size and readahead_size will be
+ // max_auto_readahead_size.
+ //
+ // The value should be provided in bytes, e.g. 8 * 1024 for 8 KB of
+ // prefetched blocks.
+ //
+ // This parameter can be changed dynamically by
+ // DB::SetOptions({{"block_based_table_factory",
+ // "{initial_auto_readahead_size=0;}"}});
+ //
+ // Changing the value dynamically will only affect files opened after the
+ // change.
+ //
+ // Default: 8 KB (8 * 1024).
+ size_t initial_auto_readahead_size = 8 * 1024;
+
+ // RocksDB does auto-readahead for iterators when it notices more than two
+ // reads for a table file, the user doesn't provide a readahead_size, and the
+ // reads are sequential.
+ // num_file_reads_for_auto_readahead indicates after how many
+ // sequential reads internal auto prefetching should start.
+ //
+ // For example, if the value is 2, then after reading 2 sequential data
+ // blocks, prefetching will start on the third data block.
+ // If set to 0, prefetching starts from the first read.
+ //
+ // This parameter can be changed dynamically by
+ // DB::SetOptions({{"block_based_table_factory",
+ // "{num_file_reads_for_auto_readahead=0;}"}});
+ //
+ // Changing the value dynamically will only affect files opened after the
+ // change.
+ //
+ // Default: 2
+ uint64_t num_file_reads_for_auto_readahead = 2;
+};
+
+// Table property names that are specific to block-based tables.
+struct BlockBasedTablePropertyNames {
+ // The value of this property is a fixed int32 number.
+ static const std::string kIndexType;
+ // value is "1" for true and "0" for false.
+ static const std::string kWholeKeyFiltering;
+ // value is "1" for true and "0" for false.
+ static const std::string kPrefixFiltering;
+};
+
+// Create default block based table factory.
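+//
+// Example usage (an illustrative sketch, not from the original header;
+// `options` is a hypothetical Options instance):
+//
+//   BlockBasedTableOptions table_options;
+//   table_options.block_size = 16 * 1024;
+//   table_options.cache_index_and_filter_blocks = true;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));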
+extern TableFactory* NewBlockBasedTableFactory(
+ const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+
+enum EncodingType : char {
+ // Always write full keys without any special encoding.
+ kPlain,
+ // Find opportunity to write the same prefix once for multiple rows.
+ // In some cases, when a key follows a previous key with the same prefix,
+ // instead of writing out the full key, it just writes out the size of the
+ // shared prefix, as well as other bytes, to save some bytes.
+ //
+ // When using this option, the user is required to use the same prefix
+ // extractor to make sure the same prefix will be extracted from the same key.
+ // The Name() value of the prefix extractor will be stored in the file. When
+ // reopening the file, the name of the options.prefix_extractor given will be
+ // bitwise compared to the prefix extractors stored in the file. An error
+ // will be returned if the two don't match.
+ kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+ static const std::string kEncodingType;
+ static const std::string kBloomVersion;
+ static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+ static const char* kName() { return "PlainTableOptions"; }
+ // @user_key_len: plain table has optimization for fix-sized keys, which can
+ // be specified via user_key_len. Alternatively, you can pass
+ // `kPlainTableVariableLength` if your keys have variable
+ // lengths.
+ uint32_t user_key_len = kPlainTableVariableLength;
+
+ // @bloom_bits_per_key: the number of bits used for the bloom filter per
+ // prefix. You may disable it by passing a zero.
+ int bloom_bits_per_key = 10;
+
+ // @hash_table_ratio: the desired utilization of the hash table used for
+ // prefix hashing.
+ // hash_table_ratio = number of prefixes / #buckets in the
+ // hash table
+ double hash_table_ratio = 0.75;
+
+ // @index_sparseness: inside each prefix, the number of keys covered by one
+ // index record built for binary search inside each hash
+ // bucket. For encoding type kPrefix, the value will be
+ // used when writing to determine an interval to rewrite
+ // the full key. It will also be used as a suggestion and
+ // satisfied when possible.
+ size_t index_sparseness = 16;
+
+ // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+ // Otherwise from huge page TLB. The user needs to
+ // reserve huge pages for it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ size_t huge_page_tlb_size = 0;
+
+ // @encoding_type: how to encode the keys. See enum EncodingType above for
+ // the choices. The value will determine how to encode keys
+ // when writing to a new SST file. This value will be stored
+ // inside the SST file which will be used when reading from
+ // the file, which makes it possible for users to choose
+ // different encoding type when reopening a DB. Files with
+ // different encoding types can co-exist in the same DB and
+ // can be read.
+ EncodingType encoding_type = kPlain;
+
+ // @full_scan_mode: mode for reading the whole file record by record without
+ // using the index.
+ bool full_scan_mode = false;
+
+ // @store_index_in_file: compute the plain table index and bloom filter
+ // during file building and store them in the file.
+ // When reading the file, the index will be memory-mapped
+ // instead of recomputed.
+ bool store_index_in_file = false;
+};
+
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extractor properly to make
+// it work. Lookup starts with a prefix hash lookup for the key prefix. Inside
+// the hash bucket found, a binary search is executed among hash conflicts.
+// Finally, a linear search is used.
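+//
+// Example (an illustrative sketch, not from the original header; the prefix
+// length 8 is arbitrary):
+//
+//   Options options;
+//   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+//   PlainTableOptions plain_options;
+//   plain_options.user_key_len = kPlainTableVariableLength;
+//   options.table_factory.reset(NewPlainTableFactory(plain_options));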
+
+extern TableFactory* NewPlainTableFactory(
+ const PlainTableOptions& options = PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+ // The key that is used to fill empty buckets.
+ static const std::string kEmptyKey;
+ // Fixed length of value.
+ static const std::string kValueLength;
+ // Number of hash functions used in Cuckoo Hash.
+ static const std::string kNumHashFunc;
+ // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+ // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+ // where the starting bucket id is given by the hash function on the key. In
+ // case of a collision while inserting the key, the builder tries to insert
+ // the key in other locations of the cuckoo block before using the next hash
+ // function. This reduces cache misses during read operations in case of
+ // collisions.
+ static const std::string kCuckooBlockSize;
+ // Size of the hash table. Use this number to compute the modulo of the hash
+ // function. The actual number of buckets will be kMaxHashTableSize +
+ // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+ // accommodate a Cuckoo Block spilling over the end of the hash table, due to
+ // the cache-friendly implementation.
+ static const std::string kHashTableSize;
+ // Denotes whether the keys sorted in the file are internal keys (if false)
+ // or user keys only (if true).
+ static const std::string kIsLastLevel;
+ // Indicates if the identity function is used for the first hash function.
+ static const std::string kIdentityAsFirstHash;
+ // Indicates if modulo or bitwise AND is used to calculate the hash value.
+ static const std::string kUseModuleHash;
+ // Fixed user key length
+ static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+ static const char* kName() { return "CuckooTableOptions"; }
+
+ // Determines the utilization of hash tables. Smaller values
+ // result in larger hash tables with fewer collisions.
+ double hash_table_ratio = 0.9;
+ // A property used by the builder to determine the depth to go to
+ // when searching for a path to displace elements in case of
+ // collision. See Builder.MakeSpaceForKey method. Higher
+ // values result in more efficient hash tables with fewer
+ // lookups but take more time to build.
+ uint32_t max_search_depth = 100;
+ // In case of collision while inserting, the builder
+ // attempts to insert in the next cuckoo_block_size
+ // locations before skipping over to the next Cuckoo hash
+ // function. This makes lookups more cache friendly in case
+ // of collisions.
+ uint32_t cuckoo_block_size = 5;
+ // If this option is enabled, the user key is treated as uint64_t and its
+ // value is used as the hash value directly. This option changes the
+ // builder's behavior. Readers ignore this option and behave according to
+ // what is specified in the table property.
+ bool identity_as_first_hash = false;
+ // If this option is set to true, modulo is used during hash calculation.
+ // This often yields better space efficiency at the cost of performance.
+ // If this option is set to false, the number of entries in the table is
+ // constrained to be a power of two, and bitwise AND is used to calculate
+ // the hash, which is faster in general.
+ bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
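+//
+// Example (an illustrative sketch, not from the original header):
+//
+//   Options options;
+//   CuckooTableOptions cuckoo_options;
+//   cuckoo_options.hash_table_ratio = 0.9;
+//   options.table_factory.reset(NewCuckooTableFactory(cuckoo_options));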
+extern TableFactory* NewCuckooTableFactory(
+ const CuckooTableOptions& table_options = CuckooTableOptions());
+
+#endif // ROCKSDB_LITE
+
+class RandomAccessFileReader;
+
+// A base class for table factories.
+class TableFactory : public Customizable {
+ public:
+ virtual ~TableFactory() override {}
+
+ static const char* kBlockCacheOpts() { return "BlockCache"; }
+ static const char* kBlockBasedTableName() { return "BlockBasedTable"; }
+ static const char* kPlainTableName() { return "PlainTable"; }
+ static const char* kCuckooTableName() { return "CuckooTable"; }
+
+ // Creates and configures a new TableFactory from the input options and id.
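+ //
+ // For example (an illustrative sketch; `config_options` uses defaults):
+ //
+ //   ConfigOptions config_options;
+ //   std::shared_ptr<TableFactory> factory;
+ //   Status s = TableFactory::CreateFromString(
+ //       config_options, TableFactory::kBlockBasedTableName(), &factory);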
+ static Status CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<TableFactory>* factory);
+
+ static const char* Type() { return "TableFactory"; }
+
+ // Returns a Table object that can fetch data from the file specified
+ // in parameter file. It's the caller's responsibility to make sure the
+ // file is in the correct format.
+ //
+ // NewTableReader() is called in three places:
+ // (1) TableCache::FindTable() calls the function on a table cache miss
+ // and caches the table object returned.
+ // (2) SstFileDumper (for SST Dump) opens the table and dumps the table
+ // contents using the iterator of the table.
+ // (3) DBImpl::IngestExternalFile() calls this function to read the contents
+ // of the sst file it's attempting to add.
+ //
+ // table_reader_options is a TableReaderOptions which contain all the
+ // needed parameters and configuration to open the table.
+ // file is a file handler to handle the file for the table.
+ // file_size is the physical file size of the file.
+ // table_reader is the output table reader.
+ virtual Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache = true) const {
+ ReadOptions ro;
+ return NewTableReader(ro, table_reader_options, std::move(file), file_size,
+ table_reader, prefetch_index_and_filter_in_cache);
+ }
+
+ // Overload of the above function that allows the caller to pass in a
+ // ReadOptions
+ virtual Status NewTableReader(
+ const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache) const = 0;
+
+ // Return a table builder to write to a file for this table type.
+ //
+ // It is called in several places:
+ // (1) When flushing memtable to a level-0 output file, it creates a table
+ // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+ // (2) During compaction, it gets the builder for writing compaction output
+ // files in DBImpl::OpenCompactionOutputFile().
+ // (3) When recovering from transaction logs, it creates a table builder to
+ // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+ // by calling BuildTable())
+ // (4) When running Repairer, it creates a table builder to convert logs to
+ // SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+ //
+ // Multiple configured parameters can be accessed from there, including but
+ // not limited to compression options. file is a handle of a writable file.
+ // It is the caller's responsibility to keep the file open and close the file
+ // after closing the table builder. compression_type is the compression type
+ // to use in this table.
+ virtual TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const = 0;
+
+ // Returns whether delete range is supported.
+ virtual bool IsDeleteRangeSupported() const { return false; }
+};
+
+#ifndef ROCKSDB_LITE
+// Create a special table factory that can open either of the supported
+// table formats, based on setting inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory: block based table factory to use. If NULL, use
+// a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
+// one.
+extern TableFactory* NewAdaptiveTableFactory(
+ std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+ std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+ std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+ std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
new file mode 100644
index 000000000..cbe87fa3a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -0,0 +1,327 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have user-collected
+// properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+// ++pos) {
+// ...
+// }
+using UserCollectedProperties = std::map<std::string, std::string>;
+
+// Table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+ static const std::string kDbId;
+ static const std::string kDbSessionId;
+ static const std::string kDbHostId;
+ static const std::string kOriginalFileNumber;
+ static const std::string kDataSize;
+ static const std::string kIndexSize;
+ static const std::string kIndexPartitions;
+ static const std::string kTopLevelIndexSize;
+ static const std::string kIndexKeyIsUserKey;
+ static const std::string kIndexValueIsDeltaEncoded;
+ static const std::string kFilterSize;
+ static const std::string kRawKeySize;
+ static const std::string kRawValueSize;
+ static const std::string kNumDataBlocks;
+ static const std::string kNumEntries;
+ static const std::string kNumFilterEntries;
+ static const std::string kDeletedKeys;
+ static const std::string kMergeOperands;
+ static const std::string kNumRangeDeletions;
+ static const std::string kFormatVersion;
+ static const std::string kFixedKeyLen;
+ static const std::string kFilterPolicy;
+ static const std::string kColumnFamilyName;
+ static const std::string kColumnFamilyId;
+ static const std::string kComparator;
+ static const std::string kMergeOperator;
+ static const std::string kPrefixExtractorName;
+ static const std::string kPropertyCollectors;
+ static const std::string kCompression;
+ static const std::string kCompressionOptions;
+ static const std::string kCreationTime;
+ static const std::string kOldestKeyTime;
+ static const std::string kFileCreationTime;
+ static const std::string kSlowCompressionEstimatedDataSize;
+ static const std::string kFastCompressionEstimatedDataSize;
+ static const std::string kSequenceNumberTimeMapping;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The methods
+// don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially.
+//
+// Statuses from these callbacks are currently logged when not OK, but
+// otherwise ignored by RocksDB.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
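+//
+// Example of a minimal collector (an illustrative sketch, not from the
+// original header; `CountingCollector` and the property name are
+// hypothetical):
+//
+//   class CountingCollector : public TablePropertiesCollector {
+//    public:
+//     Status AddUserKey(const Slice& /*key*/, const Slice& /*value*/,
+//                       EntryType /*type*/, SequenceNumber /*seq*/,
+//                       uint64_t /*file_size*/) override {
+//       ++count_;  // count every entry added to the table
+//       return Status::OK();
+//     }
+//     Status Finish(UserCollectedProperties* props) override {
+//       props->insert({"example.entry.count", std::to_string(count_)});
+//       return Status::OK();
+//     }
+//     UserCollectedProperties GetReadableProperties() const override {
+//       return {{"example.entry.count", std::to_string(count_)}};
+//     }
+//     const char* Name() const override { return "CountingCollector"; }
+//
+//    private:
+//     uint64_t count_ = 0;
+//   };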
+class TablePropertiesCollector {
+ public:
+ virtual ~TablePropertiesCollector() {}
+
+ // DEPRECATED: User-defined collectors should implement AddUserKey(), though
+ // this old function still works for backward compatibility.
+ // Add() will be called when a new key/value pair is inserted into the table.
+ // @params key the user key that is inserted into the table.
+ // @params value the value that is inserted into the table.
+ virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
+ return Status::InvalidArgument(
+ "TablePropertiesCollector::Add() deprecated.");
+ }
+
+ // AddUserKey() will be called when a new key/value pair is inserted into the
+ // table.
+ // @params key the user key that is inserted into the table.
+ // @params value the value that is inserted into the table.
+ virtual Status AddUserKey(const Slice& key, const Slice& value,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) {
+ // For backwards-compatibility.
+ return Add(key, value);
+ }
+
+ // Called after each new block is cut
+ virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t /* block_compressed_bytes_fast */,
+ uint64_t /* block_compressed_bytes_slow */) {
+ // Nothing to do here. Registered callbacks can override.
+ return;
+ }
+
+ // Finish() will be called when a table has already been built and is ready
+ // for writing the properties block.
+ // @params properties Users will add their collected statistics to
+ // `properties`.
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ // Return the human-readable properties, where the key is property name and
+ // the value is the human-readable form of value.
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const = 0;
+
+ // EXPERIMENTAL Return whether the output file should be further compacted
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Constructs a TablePropertiesCollector. Internally, RocksDB creates a new
+// TablePropertiesCollector for each new table.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollectorFactory : public Customizable {
+ public:
+ struct Context {
+ uint32_t column_family_id;
+ // The level at which the SST file (i.e., table) whose properties are
+ // being collected was created.
+ int level_at_creation = kUnknownLevelAtCreation;
+ static const uint32_t kUnknownColumnFamily;
+ static const int kUnknownLevelAtCreation = -1;
+ };
+
+ ~TablePropertiesCollectorFactory() override {}
+ static const char* Type() { return "TablePropertiesCollectorFactory"; }
+ static Status CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<TablePropertiesCollectorFactory>* result);
+
+ // has to be thread-safe
+ virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ const char* Name() const override = 0;
+
+ // Can be overridden by sub-classes to return the Name, followed by
+ // configuration info that will be logged to the info log when the
+ // DB is opened.
+ virtual std::string ToString() const { return Name(); }
+};
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+ // the file number at creation time, or 0 for unknown. When known, combined
+ // with db_session_id, it must uniquely identify an SST file.
+ uint64_t orig_file_number = 0;
+ // the total size of all data blocks.
+ uint64_t data_size = 0;
+ // the size of index block.
+ uint64_t index_size = 0;
+ // Total number of index partitions if kTwoLevelIndexSearch is used
+ uint64_t index_partitions = 0;
+ // Size of the top-level index if kTwoLevelIndexSearch is used
+ uint64_t top_level_index_size = 0;
+ // Whether the index key is the user key. Otherwise it includes 8 bytes of
+ // sequence number added by the internal key format.
+ uint64_t index_key_is_user_key = 0;
+ // Whether delta encoding is used to encode the index values.
+ uint64_t index_value_is_delta_encoded = 0;
+ // the size of filter block.
+ uint64_t filter_size = 0;
+ // total raw (uncompressed, undelineated) key size
+ uint64_t raw_key_size = 0;
+ // total raw (uncompressed, undelineated) value size
+ uint64_t raw_value_size = 0;
+ // the number of blocks in this table
+ uint64_t num_data_blocks = 0;
+ // the number of entries in this table
+ uint64_t num_entries = 0;
+ // the number of unique entries (keys or prefixes) added to filters
+ uint64_t num_filter_entries = 0;
+ // the number of deletions in the table
+ uint64_t num_deletions = 0;
+ // the number of merge operands in the table
+ uint64_t num_merge_operands = 0;
+ // the number of range deletions in this table
+ uint64_t num_range_deletions = 0;
+ // format version, reserved for backward compatibility
+ uint64_t format_version = 0;
+ // If 0, key is variable length. Otherwise number of bytes for each key.
+ uint64_t fixed_key_len = 0;
+ // ID of column family for this SST file, corresponding to the CF identified
+ // by column_family_name.
+ uint64_t column_family_id = ROCKSDB_NAMESPACE::
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+ // Timestamp of the latest key. 0 means unknown.
+ // TODO(sagar0): Should be changed to latest_key_time ... but don't know the
+ // full implications of backward compatibility. Hence retaining for now.
+ uint64_t creation_time = 0;
+
+ // Timestamp of the earliest key. 0 means unknown.
+ uint64_t oldest_key_time = 0;
+ // Actual SST file creation time. 0 means unknown.
+ uint64_t file_creation_time = 0;
+ // Estimated size of data blocks if compressed using a relatively slower
+ // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+ // 0 means unknown.
+ uint64_t slow_compression_estimated_data_size = 0;
+ // Estimated size of data blocks if compressed using a relatively faster
+ // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+ // 0 means unknown.
+ uint64_t fast_compression_estimated_data_size = 0;
+ // Offset of the value of the property "external sst file global seqno" in
+ // the file, if the property exists.
+ // 0 means it does not exist.
+ uint64_t external_sst_file_global_seqno_offset = 0;
+
+ // DB identity
+ // db_id is an identifier generated the first time the DB is created
+ // If DB identity is unset or unassigned, `db_id` will be an empty string.
+ std::string db_id;
+
+ // DB session identity
+ // db_session_id is an identifier that gets reset every time the DB is opened
+ // If DB session identity is unset or unassigned, `db_session_id` will be an
+ // empty string.
+ std::string db_session_id;
+
+ // Location of the machine hosting the DB instance
+ // db_host_id identifies the location of the host in some form
+ // (hostname by default, but can also be any string of the user's choosing).
+ // It can potentially change whenever the DB is opened.
+ std::string db_host_id;
+
+ // Name of the column family with which this SST file is associated.
+ // If column family is unknown, `column_family_name` will be an empty string.
+ std::string column_family_name;
+
+ // The name of the filter policy used in this table.
+ // If no filter policy is used, `filter_policy_name` will be an empty string.
+ std::string filter_policy_name;
+
+ // The name of the comparator used in this table.
+ std::string comparator_name;
+
+ // The name of the merge operator used in this table.
+ // If no merge operator is used, `merge_operator_name` will be "nullptr".
+ std::string merge_operator_name;
+
+ // The name of the prefix extractor used in this table
+ // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr".
+ std::string prefix_extractor_name;
+
+ // The names of the property collectors factories used in this table
+ // separated by commas
+ // {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
+ std::string property_collectors_names;
+
+ // The compression algo used to compress the SST files.
+ std::string compression_name;
+
+ // Compression options used to compress the SST files.
+ std::string compression_options;
+
+ // Sequence number to time mapping, delta encoded.
+ std::string seqno_to_time_mapping;
+
+ // user collected properties
+ UserCollectedProperties user_collected_properties;
+ UserCollectedProperties readable_properties;
+
+ // convert this object to a human readable form
+ // @prop_delim: delimiter for each property.
+ std::string ToString(const std::string& prop_delim = "; ",
+ const std::string& kv_delim = "=") const;
+
+ // Aggregate the numerical member variables of the specified
+ // TableProperties.
+ void Add(const TableProperties& tp);
+
+ // Subset of properties that make sense when added together
+ // between tables. Keys match field names in this class instead
+ // of using full property names.
+ std::map<std::string, uint64_t> GetAggregatablePropertiesAsMap() const;
+
+ // Return the approximated memory usage of this TableProperties object,
+ // including memory used by the string properties and UserCollectedProperties
+ std::size_t ApproximateMemoryUsage() const;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by the database
+// itself, especially some properties regarding the internal keys (which are
+// unknown to `table`).
+//
+// DEPRECATED: these properties now belong as TableProperties members. Please
+// use TableProperties::num_deletions and TableProperties::num_merge_operands,
+// respectively.
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+extern uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_reader_caller.h b/src/rocksdb/include/rocksdb/table_reader_caller.h
new file mode 100644
index 000000000..10ec08130
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_reader_caller.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses on a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or the table reader is called in the test environment, e.g., unit test, table
+// reader benchmark, etc.
+enum TableReaderCaller : char {
+ kUserGet = 1,
+ kUserMultiGet = 2,
+ kUserIterator = 3,
+ kUserApproximateSize = 4,
+ kUserVerifyChecksum = 5,
+ kSSTDumpTool = 6,
+ kExternalSSTIngestion = 7,
+ kRepair = 8,
+ kPrefetch = 9,
+ kCompaction = 10,
+ // A compaction job may refill the block cache with blocks in the new SST
+ // files if paranoid_file_checks is true.
+ kCompactionRefill = 11,
+ // After building a table, it may load all its blocks into the block cache if
+ // paranoid_file_checks is true.
+ kFlush = 12,
+ // sst_file_reader.
+ kSSTFileReader = 13,
+ // A list of callers that are either not interesting for analysis or are
+ // calling from a test environment, e.g., unit test, benchmark, etc.
+ kUncategorized = 14,
+ // All callers should be added before kMaxBlockCacheLookupCaller.
+ kMaxBlockCacheLookupCaller
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
new file mode 100644
index 000000000..1b5f8c046
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for exposing run-time status of any
+// rocksdb-related thread. Such run-time status can be obtained via
+// GetThreadList() API.
+//
+// Note that all thread-status features are still under development, and
+// thus APIs and class definitions might be subject to change at this point.
+// This comment will be removed once the APIs have been finalized.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS)
+#define ROCKSDB_USING_THREAD_STATUS
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(yhchiang): remove this function once c++14 is available
+// as std::max will be able to cover this.
+// Current MS compiler does not support constexpr
+template <int A, int B>
+struct constexpr_max {
+ static const int result = (A > B) ? A : B;
+};
+
+// A structure that describes the current status of a thread.
+// The status of active threads can be fetched using
+// ROCKSDB_NAMESPACE::GetThreadList().
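+//
+// Example (an illustrative sketch, not from the original header; assumes
+// `db` is an open DB instance and Options::enable_thread_tracking was set):
+//
+//   std::vector<ThreadStatus> thread_list;
+//   Status s = db->GetEnv()->GetThreadList(&thread_list);
+//   for (const auto& ts : thread_list) {
+//     // Inspect ts.thread_type, ts.operation_type, ts.op_elapsed_micros, ...
+//   }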
+struct ThreadStatus {
+ // The type of a thread.
+ enum ThreadType : int {
+ HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool
+ LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool
+ USER, // User thread (Non-RocksDB BG thread)
+ BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool
+ NUM_THREAD_TYPES
+ };
+
+ // The type used to refer to a thread operation.
+ // A thread operation describes high-level action of a thread.
+ // Examples include compaction and flush.
+ enum OperationType : int {
+ OP_UNKNOWN = 0,
+ OP_COMPACTION,
+ OP_FLUSH,
+ NUM_OP_TYPES
+ };
+
+ enum OperationStage : int {
+ STAGE_UNKNOWN = 0,
+ STAGE_FLUSH_RUN,
+ STAGE_FLUSH_WRITE_L0,
+ STAGE_COMPACTION_PREPARE,
+ STAGE_COMPACTION_RUN,
+ STAGE_COMPACTION_PROCESS_KV,
+ STAGE_COMPACTION_INSTALL,
+ STAGE_COMPACTION_SYNC_FILE,
+ STAGE_PICK_MEMTABLES_TO_FLUSH,
+ STAGE_MEMTABLE_ROLLBACK,
+ STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+ NUM_OP_STAGES
+ };
+
+ enum CompactionPropertyType : int {
+ COMPACTION_JOB_ID = 0,
+ COMPACTION_INPUT_OUTPUT_LEVEL,
+ COMPACTION_PROP_FLAGS,
+ COMPACTION_TOTAL_INPUT_BYTES,
+ COMPACTION_BYTES_READ,
+ COMPACTION_BYTES_WRITTEN,
+ NUM_COMPACTION_PROPERTIES
+ };
+
+ enum FlushPropertyType : int {
+ FLUSH_JOB_ID = 0,
+ FLUSH_BYTES_MEMTABLES,
+ FLUSH_BYTES_WRITTEN,
+ NUM_FLUSH_PROPERTIES
+ };
+
+ // The maximum number of properties of an operation.
+ // This number should be set to the biggest NUM_XXX_PROPERTIES.
+ static const int kNumOperationProperties =
+ constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+
+ // The type used to refer to a thread state.
+ // A state describes lower-level action of a thread
+ // such as reading / writing a file or waiting for a mutex.
+ enum StateType : int {
+ STATE_UNKNOWN = 0,
+ STATE_MUTEX_WAIT = 1,
+ NUM_STATE_TYPES
+ };
+
+ ThreadStatus(const uint64_t _id, const ThreadType _thread_type,
+ const std::string& _db_name, const std::string& _cf_name,
+ const OperationType _operation_type,
+ const uint64_t _op_elapsed_micros,
+ const OperationStage _operation_stage,
+ const uint64_t _op_props[], const StateType _state_type)
+ : thread_id(_id),
+ thread_type(_thread_type),
+ db_name(_db_name),
+ cf_name(_cf_name),
+ operation_type(_operation_type),
+ op_elapsed_micros(_op_elapsed_micros),
+ operation_stage(_operation_stage),
+ state_type(_state_type) {
+ for (int i = 0; i < kNumOperationProperties; ++i) {
+ op_properties[i] = _op_props[i];
+ }
+ }
+
+ // A unique ID for the thread.
+ const uint64_t thread_id;
+
+ // The type of the thread; it could be HIGH_PRIORITY,
+ // LOW_PRIORITY, BOTTOM_PRIORITY, or USER
+ const ThreadType thread_type;
+
+ // The name of the DB instance that the thread is currently
+ // involved with. It is set to an empty string if the thread
+ // is not involved in any DB operation.
+ const std::string db_name;
+
+ // The name of the column family that the thread is currently
+ // involved with. It is set to an empty string if the thread is not
+ // involved in any column family.
+ const std::string cf_name;
+
+ // The operation (high-level action) that the current thread is involved in.
+ const OperationType operation_type;
+
+ // The elapsed time of the current thread operation in microseconds.
+ const uint64_t op_elapsed_micros;
+
+ // An integer showing the current stage where the thread is involved
+ // in the current operation.
+ const OperationStage operation_stage;
+
+ // A list of properties that describe some details about the current
+ // operation. The same field in op_properties[] might have different
+ // meanings for different operations.
+ uint64_t op_properties[kNumOperationProperties];
+
+ // The state (lower-level action) that the current thread is involved in.
+ const StateType state_type;
+
+ // The following are a set of utility functions for interpreting
+ // the information in ThreadStatus.
+
+ static std::string GetThreadTypeName(ThreadType thread_type);
+
+ // Obtain the name of an operation given its type.
+ static const std::string& GetOperationName(OperationType op_type);
+
+ static const std::string MicrosToString(uint64_t op_elapsed_time);
+
+ // Obtain a human-readable string describing the specified operation stage.
+ static const std::string& GetOperationStageName(OperationStage stage);
+
+ // Obtain the name of the "i"th operation property of the
+ // specified operation.
+ static const std::string& GetOperationPropertyName(OperationType op_type,
+ int i);
+
+ // Translate the "i"th property of the specified operation given
+ // a property value.
+ static std::map<std::string, uint64_t> InterpretOperationProperties(
+ OperationType op_type, const uint64_t* op_properties);
+
+ // Obtain the name of a state given its type.
+ static const std::string& GetStateName(StateType state_type);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h
new file mode 100644
index 000000000..f1cc55752
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/threadpool.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * ThreadPool is a component that will spawn N background threads that will
+ * be used to execute scheduled work. The number of background threads can
+ * be modified by calling SetBackgroundThreads().
+ * */
+class ThreadPool {
+ public:
+ virtual ~ThreadPool() {}
+
+ // Wait for all threads to finish.
+ // Discard those threads that did not start
+ // executing.
+ virtual void JoinAllThreads() = 0;
+
+ // Set the number of background threads that will be executing the
+ // scheduled jobs.
+ virtual void SetBackgroundThreads(int num) = 0;
+ virtual int GetBackgroundThreads() = 0;
+
+ // Get the number of jobs scheduled in the ThreadPool queue.
+ virtual unsigned int GetQueueLen() const = 0;
+
+ // Waits for all jobs to complete: both those
+ // that have already started running and those that have not
+ // started yet. This ensures that everything that was submitted
+ // to the thread pool runs, even though
+ // we may not have specified enough threads for the amount
+ // of jobs.
+ virtual void WaitForJobsAndJoinAllThreads() = 0;
+
+ // Submit a fire-and-forget job.
+ // This allows submitting the same job multiple times.
+ virtual void SubmitJob(const std::function<void()>&) = 0;
+ // This moves the function in for efficiency
+ virtual void SubmitJob(std::function<void()>&&) = 0;
+
+ // Reserve available background threads. This function does not guarantee
+ // that the requested number of threads can be reserved; instead it returns
+ // the number of threads actually reserved, relative to the desired one. In
+ // other words, the number of available threads could be less than the input.
+ virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; }
+
+ // Release a specific number of reserved threads
+ virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; }
+};
+
+// NewThreadPool() is a function that can be used to create a ThreadPool
+// with `num_threads` background threads.
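+//
+// Example (an illustrative sketch, not from the original header):
+//
+//   std::unique_ptr<ThreadPool> pool(NewThreadPool(4 /* num_threads */));
+//   pool->SubmitJob([] { /* do some background work */ });
+//   pool->WaitForJobsAndJoinAllThreads();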
+extern ThreadPool* NewThreadPool(int num_threads);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h
new file mode 100644
index 000000000..335e091dc
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Allow custom implementations of TraceWriter and TraceReader.
+// By default, RocksDB provides a way to capture the traces to a file using the
+// factory NewFileTraceWriter(). But users could also choose to export traces to
+// any other system by providing custom implementations of TraceWriter and
+// TraceReader.
+
+// TraceWriter allows exporting RocksDB traces to any system, one operation at
+// a time.
+class TraceWriter {
+ public:
+ virtual ~TraceWriter() = default;
+
+ virtual Status Write(const Slice& data) = 0;
+ virtual Status Close() = 0;
+ virtual uint64_t GetFileSize() = 0;
+};
+
+// TraceReader allows reading RocksDB traces from any system, one operation at
+// a time. A RocksDB Replayer could depend on this to replay operations.
+class TraceReader {
+ public:
+ virtual ~TraceReader() = default;
+
+ virtual Status Read(std::string* data) = 0;
+ virtual Status Close() = 0;
+
+ // Seek back to the trace header. Replayer can call this method to restart
+ // replaying. Note this method may fail if the reader is already closed.
+ virtual Status Reset() = 0;
+};
+
+// Factory methods to write/read traces to/from a file.
+// The implementations may not be thread-safe.
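+//
+// Example of capturing a query trace to a file (an illustrative sketch, not
+// from the original header; "/tmp/rocksdb.trace" is an arbitrary path, and
+// StartTrace()/EndTrace() are declared in rocksdb/db.h):
+//
+//   std::unique_ptr<TraceWriter> trace_writer;
+//   Status s = NewFileTraceWriter(Env::Default(), EnvOptions(),
+//                                 "/tmp/rocksdb.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... issue queries to `db` ...
+//   s = db->EndTrace();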
+Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceWriter>* trace_writer);
+Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceReader>* trace_reader);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_record.h b/src/rocksdb/include/rocksdb/trace_record.h
new file mode 100644
index 000000000..c00f5cafb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_record.h
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class DB;
+
+// Supported trace record types.
+enum TraceType : char {
+ kTraceNone = 0,
+ kTraceBegin = 1,
+ kTraceEnd = 2,
+ // Query level tracing related trace types.
+ kTraceWrite = 3,
+ kTraceGet = 4,
+ kTraceIteratorSeek = 5,
+ kTraceIteratorSeekForPrev = 6,
+ // Block cache tracing related trace types.
+ kBlockTraceIndexBlock = 7,
+ // TODO: split out kinds of filter blocks?
+ kBlockTraceFilterBlock = 8,
+ kBlockTraceDataBlock = 9,
+ kBlockTraceUncompressionDictBlock = 10,
+ kBlockTraceRangeDeletionBlock = 11,
+ // IO tracing related trace type.
+ kIOTracer = 12,
+ // Query level tracing related trace type.
+ kTraceMultiGet = 13,
+ // All trace types should be added before kTraceMax
+ kTraceMax,
+};
+
+class GetQueryTraceRecord;
+class IteratorSeekQueryTraceRecord;
+class MultiGetQueryTraceRecord;
+class TraceRecordResult;
+class WriteQueryTraceRecord;
+
+// Base class for all types of trace records.
+class TraceRecord {
+ public:
+ explicit TraceRecord(uint64_t timestamp);
+
+ virtual ~TraceRecord() = default;
+
+ // Type of the trace record.
+ virtual TraceType GetTraceType() const = 0;
+
+ // Timestamp (in microseconds) of this trace.
+ virtual uint64_t GetTimestamp() const;
+
+ class Handler {
+ public:
+ virtual ~Handler() = default;
+
+ virtual Status Handle(const WriteQueryTraceRecord& record,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+
+ virtual Status Handle(const GetQueryTraceRecord& record,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+
+ virtual Status Handle(const IteratorSeekQueryTraceRecord& record,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+
+ virtual Status Handle(const MultiGetQueryTraceRecord& record,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+ };
+
+ // Accept the handler and report the corresponding result in `result`.
+ virtual Status Accept(Handler* handler,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+
+ // Create a handler for the execution of a TraceRecord.
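+ //
+ // Example (an illustrative sketch; `db`, `handles` and `record` are
+ // hypothetical):
+ //
+ //   std::unique_ptr<TraceRecord::Handler> handler(
+ //       TraceRecord::NewExecutionHandler(db, handles));
+ //   std::unique_ptr<TraceRecordResult> result;
+ //   Status s = record->Accept(handler.get(), &result);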
+ static Handler* NewExecutionHandler(
+ DB* db, const std::vector<ColumnFamilyHandle*>& handles);
+
+ private:
+ uint64_t timestamp_;
+};
+
+// Base class for all query types of trace records.
+class QueryTraceRecord : public TraceRecord {
+ public:
+ explicit QueryTraceRecord(uint64_t timestamp);
+};
+
+// Trace record for DB::Write() operation.
+class WriteQueryTraceRecord : public QueryTraceRecord {
+ public:
+ WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp);
+
+ WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp);
+
+ virtual ~WriteQueryTraceRecord() override;
+
+ TraceType GetTraceType() const override { return kTraceWrite; }
+
+ // rep string for the WriteBatch.
+ virtual Slice GetWriteBatchRep() const;
+
+ Status Accept(Handler* handler,
+ std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+ PinnableSlice rep_;
+};
+
+// Trace record for DB::Get() operation
+class GetQueryTraceRecord : public QueryTraceRecord {
+ public:
+ GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key,
+ uint64_t timestamp);
+
+ GetQueryTraceRecord(uint32_t column_family_id, const std::string& key,
+ uint64_t timestamp);
+
+ virtual ~GetQueryTraceRecord() override;
+
+ TraceType GetTraceType() const override { return kTraceGet; }
+
+ // Column family ID.
+ virtual uint32_t GetColumnFamilyID() const;
+
+ // Key to get.
+ virtual Slice GetKey() const;
+
+ Status Accept(Handler* handler,
+ std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+ uint32_t cf_id_;
+ PinnableSlice key_;
+};
+
+// Base class for all Iterator related operations.
+class IteratorQueryTraceRecord : public QueryTraceRecord {
+ public:
+ explicit IteratorQueryTraceRecord(uint64_t timestamp);
+
+ IteratorQueryTraceRecord(PinnableSlice&& lower_bound,
+ PinnableSlice&& upper_bound, uint64_t timestamp);
+
+ IteratorQueryTraceRecord(const std::string& lower_bound,
+ const std::string& upper_bound, uint64_t timestamp);
+
+ virtual ~IteratorQueryTraceRecord() override;
+
+ // Get the iterator's lower/upper bound. They may be used in ReadOptions to
+ // create an Iterator instance.
+ virtual Slice GetLowerBound() const;
+ virtual Slice GetUpperBound() const;
+
+ private:
+ PinnableSlice lower_;
+ PinnableSlice upper_;
+};
+
+// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation.
+class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord {
+ public:
+ // Currently we only support Seek() and SeekForPrev().
+ enum SeekType {
+ kSeek = kTraceIteratorSeek,
+ kSeekForPrev = kTraceIteratorSeekForPrev
+ };
+
+ IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+ PinnableSlice&& key, uint64_t timestamp);
+
+ IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+ const std::string& key, uint64_t timestamp);
+
+ IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+ PinnableSlice&& key, PinnableSlice&& lower_bound,
+ PinnableSlice&& upper_bound, uint64_t timestamp);
+
+ IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+ const std::string& key,
+ const std::string& lower_bound,
+ const std::string& upper_bound,
+ uint64_t timestamp);
+
+ virtual ~IteratorSeekQueryTraceRecord() override;
+
+ // Trace type matches the seek type.
+ TraceType GetTraceType() const override;
+
+ // Type of seek, Seek or SeekForPrev.
+ virtual SeekType GetSeekType() const;
+
+ // Column family ID.
+ virtual uint32_t GetColumnFamilyID() const;
+
+ // Key to seek to.
+ virtual Slice GetKey() const;
+
+ Status Accept(Handler* handler,
+ std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+ SeekType type_;
+ uint32_t cf_id_;
+ PinnableSlice key_;
+};
+
+// Trace record for DB::MultiGet() operation.
+class MultiGetQueryTraceRecord : public QueryTraceRecord {
+ public:
+ MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+ std::vector<PinnableSlice>&& keys,
+ uint64_t timestamp);
+
+ MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+ const std::vector<std::string>& keys,
+ uint64_t timestamp);
+
+ virtual ~MultiGetQueryTraceRecord() override;
+
+ TraceType GetTraceType() const override { return kTraceMultiGet; }
+
+ // Column family IDs.
+ virtual std::vector<uint32_t> GetColumnFamilyIDs() const;
+
+ // Keys to get.
+ virtual std::vector<Slice> GetKeys() const;
+
+ Status Accept(Handler* handler,
+ std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+ std::vector<uint32_t> cf_ids_;
+ std::vector<PinnableSlice> keys_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_record_result.h b/src/rocksdb/include/rocksdb/trace_record_result.h
new file mode 100644
index 000000000..0cd0004a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_record_result.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IteratorTraceExecutionResult;
+class MultiValuesTraceExecutionResult;
+class SingleValueTraceExecutionResult;
+class StatusOnlyTraceExecutionResult;
+
+// Base class for the results of all types of trace records.
+// These classes can be used to report the execution result of
+// TraceRecord::Handler::Handle() or TraceRecord::Accept().
+class TraceRecordResult {
+ public:
+ explicit TraceRecordResult(TraceType trace_type);
+
+ virtual ~TraceRecordResult() = default;
+
+ // Trace type of the corresponding TraceRecord.
+ virtual TraceType GetTraceType() const;
+
+ class Handler {
+ public:
+ virtual ~Handler() = default;
+
+ virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0;
+
+ virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0;
+
+ virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0;
+
+ virtual Status Handle(const IteratorTraceExecutionResult& result) = 0;
+ };
+
+ // Accept the handler.
+ virtual Status Accept(Handler* handler) = 0;
+
+ private:
+ TraceType trace_type_;
+};
+
+// Base class for the results from the trace record execution handler (created
+// by TraceRecord::NewExecutionHandler()).
+//
+// The actual execution status or returned values may be hidden from
+// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a
+// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may
+// return Status::NotFound() but TraceRecord::Handler::Handle() or
+// TraceRecord::Accept() will still return Status::OK(). The actual status from
+// DB::Get() and the returned value string may be saved in a
+// SingleValueTraceExecutionResult.
+class TraceExecutionResult : public TraceRecordResult {
+ public:
+ TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp,
+ TraceType trace_type);
+
+ // Execution start/end timestamps and request latency in microseconds.
+ virtual uint64_t GetStartTimestamp() const;
+ virtual uint64_t GetEndTimestamp() const;
+ inline uint64_t GetLatency() const {
+ return GetEndTimestamp() - GetStartTimestamp();
+ }
+
+ private:
+ uint64_t ts_start_;
+ uint64_t ts_end_;
+};
+
+// Result for operations that only return a single Status.
+// Example operation: DB::Write()
+class StatusOnlyTraceExecutionResult : public TraceExecutionResult {
+ public:
+ StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp,
+ uint64_t end_timestamp, TraceType trace_type);
+
+ virtual ~StatusOnlyTraceExecutionResult() override = default;
+
+ // Return value of DB::Write(), etc.
+ virtual const Status& GetStatus() const;
+
+ virtual Status Accept(Handler* handler) override;
+
+ private:
+ Status status_;
+};
+
+// Result for operations that return a Status and a value.
+// Example operation: DB::Get()
+class SingleValueTraceExecutionResult : public TraceExecutionResult {
+ public:
+ SingleValueTraceExecutionResult(Status status, const std::string& value,
+ uint64_t start_timestamp,
+ uint64_t end_timestamp, TraceType trace_type);
+
+ SingleValueTraceExecutionResult(Status status, std::string&& value,
+ uint64_t start_timestamp,
+ uint64_t end_timestamp, TraceType trace_type);
+
+ virtual ~SingleValueTraceExecutionResult() override;
+
+ // Return status of DB::Get().
+ virtual const Status& GetStatus() const;
+
+ // Value for the searched key.
+ virtual const std::string& GetValue() const;
+
+ virtual Status Accept(Handler* handler) override;
+
+ private:
+ Status status_;
+ std::string value_;
+};
+
+// Result for operations that return multiple Status(es) and values as vectors.
+// Example operation: DB::MultiGet()
+class MultiValuesTraceExecutionResult : public TraceExecutionResult {
+ public:
+ MultiValuesTraceExecutionResult(std::vector<Status> multi_status,
+ std::vector<std::string> values,
+ uint64_t start_timestamp,
+ uint64_t end_timestamp, TraceType trace_type);
+
+ virtual ~MultiValuesTraceExecutionResult() override;
+
+ // Returned Status(es) of DB::MultiGet().
+ virtual const std::vector<Status>& GetMultiStatus() const;
+
+ // Returned values for the searched keys.
+ virtual const std::vector<std::string>& GetValues() const;
+
+ virtual Status Accept(Handler* handler) override;
+
+ private:
+ std::vector<Status> multi_status_;
+ std::vector<std::string> values_;
+};
+
+// Result for Iterator operations.
+// Example operations: Iterator::Seek(), Iterator::SeekForPrev()
+class IteratorTraceExecutionResult : public TraceExecutionResult {
+ public:
+ IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key,
+ PinnableSlice&& value, uint64_t start_timestamp,
+ uint64_t end_timestamp, TraceType trace_type);
+
+ IteratorTraceExecutionResult(bool valid, Status status,
+ const std::string& key, const std::string& value,
+ uint64_t start_timestamp, uint64_t end_timestamp,
+ TraceType trace_type);
+
+ virtual ~IteratorTraceExecutionResult() override;
+
+ // Return whether the Iterator is valid.
+ virtual bool GetValid() const;
+
+ // Return the status of the Iterator.
+ virtual const Status& GetStatus() const;
+
+ // Key of the current entry, empty if GetValid() is false.
+ virtual Slice GetKey() const;
+
+ // Value of the current entry, empty if GetValid() is false.
+ virtual Slice GetValue() const;
+
+ virtual Status Accept(Handler* handler) override;
+
+ private:
+ bool valid_;
+ Status status_;
+ PinnableSlice key_;
+ PinnableSlice value_;
+};
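+
+// A minimal sketch (illustrative only) of a TraceRecordResult::Handler that
+// prints the latency of each executed record; all names below are
+// hypothetical, and PRIu64 comes from <cinttypes>:
+//
+//   class LatencyPrinter : public TraceRecordResult::Handler {
+//    public:
+//     Status Handle(const StatusOnlyTraceExecutionResult& res) override {
+//       printf("status-only: %" PRIu64 " us\n", res.GetLatency());
+//       return Status::OK();
+//     }
+//     Status Handle(const SingleValueTraceExecutionResult& res) override {
+//       printf("get: %" PRIu64 " us\n", res.GetLatency());
+//       return Status::OK();
+//     }
+//     Status Handle(const MultiValuesTraceExecutionResult& res) override {
+//       printf("multiget: %" PRIu64 " us\n", res.GetLatency());
+//       return Status::OK();
+//     }
+//     Status Handle(const IteratorTraceExecutionResult& res) override {
+//       printf("iterator: %" PRIu64 " us\n", res.GetLatency());
+//       return Status::OK();
+//     }
+//   };
+//
+//   // Usage, given a TraceRecordResult* result from TraceRecord::Accept():
+//   //   LatencyPrinter printer;
+//   //   result->Accept(&printer);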
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h
new file mode 100644
index 000000000..e13ad8f80
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/transaction_log.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFile;
+using VectorLogPtr = std::vector<std::unique_ptr<LogFile>>;
+
+enum WalFileType {
+ /* Indicates that WAL file is in archive directory. WAL files are moved from
+ * the main db directory to archive directory once they are not live and stay
+ * there until cleaned up. Files are cleaned depending on archive size
+ * (Options::WAL_size_limit_MB) and time since last cleaning
+ * (Options::WAL_ttl_seconds).
+ */
+ kArchivedLogFile = 0,
+
+ /* Indicates that WAL file is live and resides in the main db directory */
+ kAliveLogFile = 1
+};
+
+class LogFile {
+ public:
+ LogFile() {}
+ virtual ~LogFile() {}
+
+ // Returns the log file's pathname relative to the main db dir
+ // E.g. for a live log file:       /000003.log
+ //      for an archived log file:  /archive/000003.log
+ virtual std::string PathName() const = 0;
+
+ // Primary identifier for log file.
+ // It increases monotonically with the creation time of the log file.
+ virtual uint64_t LogNumber() const = 0;
+
+ // Log file can be either alive or archived
+ virtual WalFileType Type() const = 0;
+
+ // Starting sequence number of writebatch written in this log file
+ virtual SequenceNumber StartSequence() const = 0;
+
+ // Size of log file on disk in Bytes
+ virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+ SequenceNumber sequence = 0;
+ std::unique_ptr<WriteBatch> writeBatchPtr;
+
+ // Add an empty constructor and destructor for the rule of five,
+ // but preserve the original semantics and prohibit copying, as the
+ // std::unique_ptr member does not copy.
+ BatchResult() {}
+
+ ~BatchResult() {}
+
+ BatchResult(const BatchResult&) = delete;
+
+ BatchResult& operator=(const BatchResult&) = delete;
+
+ BatchResult(BatchResult&& bResult)
+ : sequence(std::move(bResult.sequence)),
+ writeBatchPtr(std::move(bResult.writeBatchPtr)) {}
+
+ BatchResult& operator=(BatchResult&& bResult) {
+ sequence = std::move(bResult.sequence);
+ writeBatchPtr = std::move(bResult.writeBatchPtr);
+ return *this;
+ }
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequence numbers.
+class TransactionLogIterator {
+ public:
+ TransactionLogIterator() {}
+ virtual ~TransactionLogIterator() {}
+
+ // An iterator is either positioned at a WriteBatch or not valid.
+ // This method returns true if the iterator is valid.
+ // Can read data from a valid iterator.
+ virtual bool Valid() = 0;
+
+ // Moves the iterator to the next WriteBatch.
+ // REQUIRES: Valid() to be true.
+ virtual void Next() = 0;
+
+ // Returns OK if the iterator is valid.
+ // Returns an error status when something has gone wrong.
+ virtual Status status() = 0;
+
+ // If valid, returns the current write batch and the sequence number of the
+ // earliest transaction contained in the batch.
+ // ONLY use if Valid() is true and status() is OK.
+ virtual BatchResult GetBatch() = 0;
+
+ // The read options for TransactionLogIterator.
+ struct ReadOptions {
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ // Default: true
+ bool verify_checksums_;
+
+ ReadOptions() : verify_checksums_(true) {}
+
+ explicit ReadOptions(bool verify_checksums)
+ : verify_checksums_(verify_checksums) {}
+ };
+};
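+
+// A minimal usage sketch (illustrative only; assumes an open DB* `db` and
+// error handling elided). DB::GetUpdatesSince() returns an iterator starting
+// at the given sequence number:
+//
+//   std::unique_ptr<TransactionLogIterator> iter;
+//   Status s = db->GetUpdatesSince(1 /* seq_number */, &iter);
+//   while (s.ok() && iter->Valid()) {
+//     BatchResult batch = iter->GetBatch();
+//     // batch.sequence is the earliest sequence number in the batch;
+//     // batch.writeBatchPtr holds the WriteBatch itself.
+//     iter->Next();
+//   }
+//   if (s.ok()) s = iter->status();  // Detects gaps and read errors.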
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h
new file mode 100644
index 000000000..6fb53d846
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/types.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Define all public custom types here.
+
+using ColumnFamilyId = uint32_t;
+
+// Represents a sequence number in a WAL file.
+using SequenceNumber = uint64_t;
+
+const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed
+
+enum class TableFileCreationReason {
+ kFlush,
+ kCompaction,
+ kRecovery,
+ kMisc,
+};
+
+enum class BlobFileCreationReason {
+ kFlush,
+ kCompaction,
+ kRecovery,
+};
+
+// The types of files RocksDB uses in a DB directory. (Available for
+// advanced options.)
+enum FileType {
+ kWalFile,
+ kDBLockFile,
+ kTableFile,
+ kDescriptorFile,
+ kCurrentFile,
+ kTempFile,
+ kInfoLogFile, // Either the current one, or an old one
+ kMetaDatabase,
+ kIdentityFile,
+ kOptionsFile,
+ kBlobFile
+};
+
+// User-oriented representation of internal key types.
+// The ordering of these enum entries should not change.
+enum EntryType {
+ kEntryPut,
+ kEntryDelete,
+ kEntrySingleDelete,
+ kEntryMerge,
+ kEntryRangeDeletion,
+ kEntryBlobIndex,
+ kEntryDeleteWithTimestamp,
+ kEntryWideColumnEntity,
+ kEntryOther,
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/unique_id.h b/src/rocksdb/include/rocksdb/unique_id.h
new file mode 100644
index 000000000..eb0c77826
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/unique_id.h
@@ -0,0 +1,55 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Computes a stable, universally unique 128-bit (16 binary char) identifier
+// for an SST file from TableProperties. This is supported for table (SST)
+// files created with RocksDB 6.24 and later. NotSupported will be returned
+// for other cases. The first 16 bytes (128 bits) are of sufficient quality
+// for almost all applications, and shorter prefixes are usable as a
+// hash of the full unique id.
+//
+// Note: .c_str() is not compatible with binary char strings, so using
+// .c_str() on the result will often result in information loss and very
+// poor uniqueness probability.
+//
+// More detail: the value is *guaranteed* unique for SST files
+// generated in the same process (even different DBs, RocksDB >= 6.26),
+// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26)
+// so that the "all zeros" value can be used reliably for a null ID.
+// These IDs are more than sufficient for SST uniqueness within each of
+// many DBs or hosts. For an extreme example assuming random IDs, consider
+// 10^9 hosts each with 10^9 live SST files being replaced at 10^6/second.
+// Such a service would need to run for 10 million years to see an ID
+// collision among live SST files on any host.
+//
+// And assuming one generates many SST files in the lifetime of each process,
+// the probability of ID collisions is much "better than random"; see
+// https://github.com/pdillinger/unique_id
+Status GetUniqueIdFromTableProperties(const TableProperties &props,
+ std::string *out_id);
+
+// Computes a 192-bit (24 binary char) stable, universally unique ID
+// with an extra 64 bits of uniqueness compared to the standard ID. It is only
+// appropriate to use this ID instead of the 128-bit ID if ID collisions
+// between files among any hosts in a vast fleet are a problem, such as a shared
+// global namespace for SST file backups. Under that criterion, the extreme
+// example above would expect a global file ID collision every 4 days with
+// 128-bit IDs (using some worst-case assumptions about process lifetime).
+// It's 10^17 years with 192-bit IDs.
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
+ std::string *out_id);
+
+// Converts a binary string (unique id) to hexadecimal, with each 64 bits
+// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
+// Also works on unique id prefix.
+std::string UniqueIdToHumanString(const std::string &id);
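+
+// A minimal usage sketch (illustrative only; assumes `props` holds the
+// TableProperties of an SST file, e.g. obtained via
+// DB::GetPropertiesOfAllTables()):
+//
+//   std::string id;
+//   Status s = GetUniqueIdFromTableProperties(props, &id);
+//   if (s.ok()) {
+//     // `id` is a binary string that may contain '\0' bytes, so convert it
+//     // for display instead of using id.c_str().
+//     printf("SST unique id: %s\n", UniqueIdToHumanString(id).c_str());
+//   }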
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
new file mode 100644
index 000000000..0b0a85e1c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+ kCompactionStopStyleSimilarSize, // pick files of similar size
+ kCompactionStopStyleTotalSize // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+ // Percentage flexibility while comparing file sizes. If the candidate
+ // file(s) size is 1% smaller than the next file's size, then include the
+ // next file into this candidate set.
+ // Default: 1
+ unsigned int size_ratio;
+
+ // The minimum number of files in a single compaction run. Default: 2
+ unsigned int min_merge_width;
+
+ // The maximum number of files in a single compaction run. Default: UINT_MAX
+ unsigned int max_merge_width;
+
+ // The size amplification is defined as the amount (in percentage) of
+ // additional storage needed to store a single byte of data in the database.
+ // For example, a size amplification of 2% means that a database that
+ // contains 100 bytes of user-data may occupy up to 102 bytes of
+ // physical storage. By this definition, a fully compacted database has
+ // a size amplification of 0%. RocksDB uses the following heuristic
+ // to calculate size amplification: it assumes that all files excluding
+ // the earliest file contribute to the size amplification.
+ // Default: 200, which means that a 100 byte database could require up to
+ // 300 bytes of storage.
+ unsigned int max_size_amplification_percent;
+
+ // If this option is set to -1 (the default value), all the output files
+ // will use the compression type specified.
+ //
+ // If this option is not negative, we will try to make sure the compressed
+ // size is just above this value. In normal cases, at least this percentage
+ // of data will be compressed.
+ // When we are compacting to a new file, the criterion for whether it
+ // needs to be compressed is as follows. Assuming the list of files sorted
+ // by generation time is
+ //   A1...An B1...Bm C1...Ct
+ // where A1 is the newest and Ct is the oldest, and we are going to compact
+ // B1...Bm, we calculate the total size of all the files as total_size, as
+ // well as the total size of C1...Ct as total_C; the compaction output file
+ // will be compressed iff
+ //   total_C / total_size < this percentage
+ // Default: -1
+ int compression_size_percent;
+
+ // The algorithm used to stop picking files into a single compaction run
+ // Default: kCompactionStopStyleTotalSize
+ CompactionStopStyle stop_style;
+
+ // Option to optimize the universal multi-level compaction by enabling
+ // trivial moves for non-overlapping files.
+ // Default: false
+ bool allow_trivial_move;
+
+ // EXPERIMENTAL
+ // If true, try to limit compaction size under max_compaction_bytes.
+ // This might cause higher write amplification, but can prevent some
+ // problems caused by large compactions.
+ // Default: false
+ bool incremental;
+
+ // Default set of parameters
+ CompactionOptionsUniversal()
+ : size_ratio(1),
+ min_merge_width(2),
+ max_merge_width(UINT_MAX),
+ max_size_amplification_percent(200),
+ compression_size_percent(-1),
+ stop_style(kCompactionStopStyleTotalSize),
+ allow_trivial_move(false),
+ incremental(false) {}
+};
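+
+// A minimal configuration sketch (illustrative only):
+//
+//   Options options;
+//   options.compaction_style = kCompactionStyleUniversal;
+//   options.compaction_options_universal.size_ratio = 10;
+//   options.compaction_options_universal.max_size_amplification_percent = 50;
+//   options.compaction_options_universal.stop_style =
+//       kCompactionStopStyleSimilarSize;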
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/agg_merge.h b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
new file mode 100644
index 000000000..4e21082db
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The feature is still in development so the encoding format is subject
+// to change.
+//
+// Aggregation Merge Operator is a merge operator that allows users to
+// aggregate merge operands of different keys with different registered
+// aggregation functions. The aggregation can also change for the same
+// key if the functions store the data in the same format.
+// The target application highly overlaps with the merge operator in general,
+// but we try to provide a better interface so that users are more likely
+// to use pre-implemented plug-in functions and connect with existing
+// third-party aggregation functions (such as those from SQL engines).
+// In this case, the need for users to write customized C++ plug-in code
+// is reduced.
+// If the idea proves to be useful, we might consider moving it to be
+// a core functionality of RocksDB, and reducing the support of merge
+// operators.
+//
+// Users can implement aggregation functions by implementing abstract
+// class Aggregator, and register it using AddAggregator().
+// The merge operator can be retrieved from GetAggMergeOperator() and
+// it is a singleton.
+//
+// Users can push values to be updated with a merge operand encoded with
+// registered function name and payload using EncodeAggFuncAndPayload(),
+// and the merge operator will invoke the aggregation function.
+// An example:
+//
+// // Assume class ExampleSumAggregator is implemented to do simple sum.
+// AddAggregator("sum", std::make_unique<ExampleSumAggregator>());
+// std::shared_ptr<MergeOperator> mp_guard = CreateAggMergeOperator();
+// options.merge_operator = mp_guard;
+// ...... // Creating DB
+//
+//
+// std::string encoded_value;
+// Status s = EncodeAggFuncAndPayload(kUnnamedFuncName, "200", encoded_value);
+// assert(s.ok());
+// db->Put(WriteOptions(), "foo", encoded_value);
+// s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+// assert(s.ok());
+// db->Merge(WriteOptions(), "foo", encoded_value);
+// s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+// assert(s.ok());
+// db->Merge(WriteOptions(), "foo", encoded_value);
+//
+// std::string value;
+// s = db->Get(ReadOptions(), "foo", &value);
+// assert(s.ok());
+// Slice func, aggregated_value;
+// assert(ExtractAggFuncAndValue(value, func, aggregated_value));
+// assert(func == "sum");
+// assert(aggregated_value == "600");
+//
+//
+// DB::Put() can also be used to add a payload in the same way as Merge().
+//
+// kUnnamedFuncName can be used as a placeholder function name. It will
+// be aggregated with merge operands inserted later, based on the function
+// name given there.
+//
+// If the aggregation function is not registered, or an error is returned by
+// the aggregation function, the result will be encoded with a fake
+// aggregation function kErrorFuncName, with each merge operand encoded
+// into a list that can be extracted using ExtractList().
+//
+// If users add a merge operand using a different aggregation function from
+// the previous one, the merge operands for the previous one are aggregated,
+// and the payload part of the result is treated as the first payload of
+// the items for the new aggregation function. For example, users can
+// Merge("plus, 1"), Merge("plus, 2"), Merge("minus, 3"), and the aggregation
+// result would be "minus, 0".
+//
+
+// A class used to aggregate data per key/value. The plug-in function is
+// implemented and registered using AddAggregator(), and then used with the
+// merge operator created using CreateAggMergeOperator().
+class Aggregator {
+ public:
+ virtual ~Aggregator() {}
+ // The input list is in reverse insertion order, with values[0] being the
+ // operand inserted last and values.back() the one inserted first.
+ // The oldest one might be from Get().
+ // Return whether aggregation succeeded. False for aggregation error.
+ virtual bool Aggregate(const std::vector<Slice>& values,
+ std::string& result) const = 0;
+
+ // True if a partial aggregation should be invoked. Some aggregators
+ // might opt to skip partial aggregation if possible.
+ virtual bool DoPartialAggregate() const { return true; }
+};
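+
+// A minimal sketch (illustrative only) of the ExampleSumAggregator mentioned
+// in the example above, summing decimal-encoded integer payloads:
+//
+//   class ExampleSumAggregator : public Aggregator {
+//    public:
+//     bool Aggregate(const std::vector<Slice>& values,
+//                    std::string& result) const override {
+//       long long sum = 0;
+//       for (const Slice& v : values) {
+//         // A production version would validate the payload and return
+//         // false (aggregation error) on a parse failure.
+//         sum += std::stoll(v.ToString());
+//       }
+//       result = std::to_string(sum);
+//       return true;
+//     }
+//   };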
+
+// This function adds an aggregation plug-in by function name. It is used
+// by all the aggregation merge operators created using
+// CreateAggMergeOperator(). It is currently not thread-safe to run
+// concurrently with the aggregation merge operator. It is recommended that
+// all aggregation functions are added before calling CreateAggMergeOperator().
+Status AddAggregator(const std::string& function_name,
+ std::unique_ptr<Aggregator>&& agg);
+
+// Get the singleton instance of the merge operator for aggregation.
+// The same instance is always returned; a shared_ptr to it is held as a
+// static variable by the function.
+// This is done because options.merge_operator is a shared_ptr.
+std::shared_ptr<MergeOperator> GetAggMergeOperator();
+
+// Encode aggregation function and payload that can be consumed by aggregation
+// merge operator.
+Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload,
+ std::string& output);
+// Helper function to extract aggregation function name and payload.
+// Return false if it fails to decode.
+bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value);
+
+// Extract encoded list. This can be used to extract error merge operands when
+// the returned function name is kErrorFuncName.
+bool ExtractList(const Slice& encoded_list, std::vector<Slice>& decoded_list);
+
+// Special placeholder function name that allows the operand to be aggregated
+// using the function name of a subsequent merge operand.
+extern const std::string kUnnamedFuncName;
+
+// Special error function name reserved for merging or aggregation error.
+extern const std::string kErrorFuncName;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/backup_engine.h b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
new file mode 100644
index 000000000..f28ad9618
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
@@ -0,0 +1,631 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The default DB file checksum function name.
+constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+// The default BackupEngine file checksum function name.
+constexpr char kBackupFileChecksumFuncName[] = "crc32c";
+
+struct BackupEngineOptions {
+ // Where to keep the backup files. Has to be different from dbname_.
+ // Best to set this to dbname_ + "/backups"
+ // Required
+ std::string backup_dir;
+
+ // Backup Env object. It will be used for backup file I/O. If it's
+ // nullptr, backups will be written out using the DB's Env. If it's
+ // non-nullptr, backup I/O will be performed using this object.
+ // Default: nullptr
+ Env* backup_env;
+
+ // share_table_files supports table and blob files.
+ //
+ // If share_table_files == true, the backup directory will share table and
+ // blob files among backups, to save space among backups of the same DB and to
+ // enable incremental backups by only copying new files.
+ // If share_table_files == false, each backup will be on its own and will not
+ // share any data with other backups.
+ //
+ // default: true
+ bool share_table_files;
+
+ // Backup info and error messages will be written to info_log
+ // if non-nullptr.
+ // Default: nullptr
+ Logger* info_log;
+
+ // If sync == true, we can guarantee you'll get consistent backup and
+ // restore even on a machine crash/reboot. Backup and restore processes are
+ // slower with sync enabled. If sync == false, we can only guarantee that
+ // other previously synced backups and restores are not modified while
+ // creating a new one.
+ // Default: true
+ bool sync;
+
+ // If true, it will delete whatever backups there are already
+ // Default: false
+ bool destroy_old_data;
+
+ // If false, we won't back up log files. This option can be useful for
+ // backing up in-memory databases where log files are persisted, but table
+ // files are in memory.
+ // Default: true
+ bool backup_log_files;
+
+ // Max bytes that can be transferred in a second during backup.
+ // If 0, go as fast as you can.
+ // This limit only applies to writes. To also limit reads,
+ // a rate limiter that can also limit reads (e.g., one whose mode = kAllIo)
+ // has to be passed in through the option "backup_rate_limiter".
+ // Default: 0
+ uint64_t backup_rate_limit;
+
+ // Backup rate limiter. Used to control transfer speed for backup. If this is
+ // not null, backup_rate_limit is ignored.
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> backup_rate_limiter{nullptr};
+
+ // Max bytes that can be transferred in a second during restore.
+ // If 0, go as fast as you can.
+ // This limit only applies to writes. To also limit reads,
+ // a rate limiter that can also limit reads (e.g., one whose mode = kAllIo)
+ // has to be passed in through the option "restore_rate_limiter".
+ // Default: 0
+ uint64_t restore_rate_limit;
+
+ // Restore rate limiter. Used to control transfer speed during restore. If
+ // this is not null, restore_rate_limit is ignored.
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr};
+
+ // share_files_with_checksum supports table and blob files.
+ //
+ // Only used if share_table_files is set to true. Setting to false is
+ // DEPRECATED and potentially dangerous because in that case BackupEngine
+ // can lose data if backing up databases with distinct or divergent
+ // history, for example if restoring from a backup other than the latest,
+ // writing to the DB, and creating another backup. Setting to true (default)
+ // prevents these issues by ensuring that different table files (SSTs) and
+ // blob files with the same number are treated as distinct. See
+ // share_files_with_checksum_naming and ShareFilesNaming.
+ //
+ // Default: true
+ bool share_files_with_checksum;
+
+ // Up to this many background threads will copy files for CreateNewBackup()
+ // and RestoreDBFromBackup()
+ // Default: 1
+ int max_background_operations;
+
+ // During backup, the user can get a callback every time the next
+ // callback_trigger_interval_size bytes have been copied.
+ // Default: 4194304
+ uint64_t callback_trigger_interval_size;
+
+ // For BackupEngineReadOnly, Open() will open at most this many of the
+ // latest non-corrupted backups.
+ //
+ // Note: this setting is ignored (behaves like INT_MAX) for any kind of
+ // writable BackupEngine because it would inhibit accounting for shared
+ // files for proper backup deletion, including purging any incompletely
+ // created backups on creation of a new backup.
+ //
+ // Default: INT_MAX
+ int max_valid_backups_to_open;
+
+ // ShareFilesNaming describes possible naming schemes for backup
+ // table and blob file names when they are stored in the
+ // shared_checksum directory (i.e., both share_table_files and
+ // share_files_with_checksum are true).
+ enum ShareFilesNaming : uint32_t {
+ // Backup blob filenames are <file_number>_<crc32c>_<file_size>.blob and
+ // backup SST filenames are <file_number>_<crc32c>_<file_size>.sst
+ // where <crc32c> is an unsigned decimal integer. This is the
+ // original/legacy naming scheme for share_files_with_checksum,
+ // with two problems:
+ // * At massive scale, collisions on this triple with different file
+ // contents is plausible.
+ // * Determining the name to use requires computing the checksum,
+ // so generally requires reading the whole file even if the file
+ // is already backed up.
+ //
+ // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR **
+ kLegacyCrc32cAndFileSize = 1U,
+
+ // Backup SST filenames are <file_number>_s<db_session_id>.sst. This
+ // pair of values should be very strongly unique for a given SST file
+ // and easily determined before computing a checksum. The 's' indicates
+ // the value is a DB session id, not a checksum.
+ //
+ // Exceptions:
+ // * For blob files, kLegacyCrc32cAndFileSize is used as currently
+ // db_session_id is not supported by the blob file format.
+ // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize
+ // will be used instead, matching the names assigned by RocksDB versions
+ // not supporting the newer naming scheme.
+ // * See also flags below.
+ kUseDbSessionId = 2U,
+
+ kMaskNoNamingFlags = 0xffffU,
+
+ // If not already part of the naming scheme, insert
+ // _<file_size>
+ // before .sst and .blob in the name. In case of user code actually parsing
+ // the last _<whatever> before the .sst and .blob as the file size, this
+ // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this
+ // option makes official that unofficial feature of the backup metadata.
+ //
+ // We do not consider SST and blob file sizes to have sufficient entropy to
+ // contribute significantly to naming uniqueness.
+ kFlagIncludeFileSize = 1U << 31,
+
+ kMaskNamingFlags = ~kMaskNoNamingFlags,
+ };
+
+ // Naming option for share_files_with_checksum table and blob files. See
+ // ShareFilesNaming for details.
+ //
+ // Modifying this option cannot introduce a downgrade compatibility issue
+ // because RocksDB can read, restore, and delete backups using different file
+ // names, and it's OK for a backup directory to use a mixture of table and
+ // blob files naming schemes.
+ //
+ // However, modifying this option and saving more backups to the same
+ // directory can lead to the same file getting saved again to that
+ // directory, under the new shared name in addition to the old shared
+ // name.
+ //
+ // Default: kUseDbSessionId | kFlagIncludeFileSize
+ //
+ // Note: This option comes into effect only if both share_files_with_checksum
+ // and share_table_files are true.
+ ShareFilesNaming share_files_with_checksum_naming;
+
+ // Major schema version to use when writing backup meta files
+ // 1 (default) - compatible with very old versions of RocksDB.
+ // 2 - can be read by RocksDB versions >= 6.19.0. Minimum schema version for
+ // * (Experimental) saving and restoring file temperature metadata
+ int schema_version = 1;
+
+ // (Experimental - subject to change or removal) When taking a backup and
+ // saving file temperature info (minimum schema_version is 2), there are
+ // two potential sources of truth for the placement of files into temperature
+ // tiers: (a) the current file temperature reported by the FileSystem or
+ // (b) the expected file temperature recorded in DB manifest. When this
+ // option is false (default), (b) overrides (a) if both are not UNKNOWN.
+ // When true, (a) overrides (b) if both are not UNKNOWN. Regardless of this
+ // setting, a known temperature overrides UNKNOWN.
+ bool current_temperatures_override_manifest = false;
+
+ void Dump(Logger* logger) const;
+
+ explicit BackupEngineOptions(
+ const std::string& _backup_dir, Env* _backup_env = nullptr,
+ bool _share_table_files = true, Logger* _info_log = nullptr,
+ bool _sync = true, bool _destroy_old_data = false,
+ bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+ uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+ uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024,
+ int _max_valid_backups_to_open = INT_MAX,
+ ShareFilesNaming _share_files_with_checksum_naming =
+ static_cast<ShareFilesNaming>(kUseDbSessionId | kFlagIncludeFileSize))
+ : backup_dir(_backup_dir),
+ backup_env(_backup_env),
+ share_table_files(_share_table_files),
+ info_log(_info_log),
+ sync(_sync),
+ destroy_old_data(_destroy_old_data),
+ backup_log_files(_backup_log_files),
+ backup_rate_limit(_backup_rate_limit),
+ restore_rate_limit(_restore_rate_limit),
+ share_files_with_checksum(true),
+ max_background_operations(_max_background_operations),
+ callback_trigger_interval_size(_callback_trigger_interval_size),
+ max_valid_backups_to_open(_max_valid_backups_to_open),
+ share_files_with_checksum_naming(_share_files_with_checksum_naming) {
+ assert(share_table_files || !share_files_with_checksum);
+ assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0);
+ }
+};
+
+inline BackupEngineOptions::ShareFilesNaming operator&(
+ BackupEngineOptions::ShareFilesNaming lhs,
+ BackupEngineOptions::ShareFilesNaming rhs) {
+ uint32_t l = static_cast<uint32_t>(lhs);
+ uint32_t r = static_cast<uint32_t>(rhs);
+ assert(r == BackupEngineOptions::kMaskNoNamingFlags ||
+ (r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+ return static_cast<BackupEngineOptions::ShareFilesNaming>(l & r);
+}
+
+inline BackupEngineOptions::ShareFilesNaming operator|(
+ BackupEngineOptions::ShareFilesNaming lhs,
+ BackupEngineOptions::ShareFilesNaming rhs) {
+ uint32_t l = static_cast<uint32_t>(lhs);
+ uint32_t r = static_cast<uint32_t>(rhs);
+ assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+ return static_cast<BackupEngineOptions::ShareFilesNaming>(l | r);
+}
+
+struct CreateBackupOptions {
+ // Flush will always trigger if 2PC is enabled.
+ // If write-ahead logs are disabled, set flush_before_backup=true to
+ // avoid losing unflushed key/value pairs from the memtable.
+ bool flush_before_backup = false;
+
+ // Callback for reporting progress, based on callback_trigger_interval_size.
+ //
+ // RocksDB callbacks are NOT exception-safe. A callback completing with an
+ // exception can lead to undefined behavior in RocksDB, including data loss,
+ // unreported corruption, deadlocks, and more.
+ std::function<void()> progress_callback = []() {};
+
+ // If false, background_thread_cpu_priority is ignored.
+ // Otherwise, the CPU priority can be decreased;
+ // if you try to increase the priority, the priority will not change.
+ // The initial priority of the threads is CpuPriority::kNormal,
+ // so you can decrease to priorities lower than kNormal.
+ bool decrease_background_thread_cpu_priority = false;
+ CpuPriority background_thread_cpu_priority = CpuPriority::kNormal;
+};
+
+struct RestoreOptions {
+ // If true, restore won't overwrite the existing log files in wal_dir. It will
+ // also move all log files from archive directory to wal_dir. Use this option
+ // in combination with BackupEngineOptions::backup_log_files = false for
+ // persisting in-memory databases.
+ // Default: false
+ bool keep_log_files;
+
+ explicit RestoreOptions(bool _keep_log_files = false)
+ : keep_log_files(_keep_log_files) {}
+};
+
+using BackupID = uint32_t;
+
+using BackupFileInfo = FileStorageInfo;
+
+struct BackupInfo {
+ BackupID backup_id = 0U;
+ // Creation time, according to GetCurrentTime
+ int64_t timestamp = 0;
+
+ // Total size in bytes (based on file payloads, not including filesystem
+ // overheads or backup meta file)
+ uint64_t size = 0U;
+
+ // Number of backed up files, some of which might be shared with other
+ // backups. Does not include backup meta file.
+ uint32_t number_files = 0U;
+
+ // Backup API user metadata
+ std::string app_metadata;
+
+ // Backup file details, if requested with include_file_details=true
+ std::vector<BackupFileInfo> file_details;
+
+ // DB "name" (a directory in the backup_env) for opening this backup as a
+ // read-only DB. This should also be used as the DBOptions::wal_dir, such
+ // as by default setting wal_dir="". See also env_for_open.
+ // This field is only set if include_file_details=true
+ std::string name_for_open;
+
+ // An Env(+FileSystem) for opening this backup as a read-only DB, with
+ // DB::OpenForReadOnly or similar. This field is only set if
+ // include_file_details=true. (The FileSystem in this Env takes care
+ // of making shared backup files openable from the `name_for_open` DB
+ // directory.) See also name_for_open.
+ //
+ // This Env might or might not be shared with other backups. To work
+ // around DBOptions::env being a raw pointer, this is a shared_ptr so
+ // that keeping either this BackupInfo, the BackupEngine, or a copy of
+ // this shared_ptr alive is sufficient to keep the Env alive for use by
+ // a read-only DB.
+ std::shared_ptr<Env> env_for_open;
+
+ BackupInfo() {}
+
+ BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+ uint32_t _number_files, const std::string& _app_metadata)
+ : backup_id(_backup_id),
+ timestamp(_timestamp),
+ size(_size),
+ number_files(_number_files),
+ app_metadata(_app_metadata) {}
+};
+
+class BackupStatistics {
+ public:
+ BackupStatistics() {
+ number_success_backup = 0;
+ number_fail_backup = 0;
+ }
+
+ BackupStatistics(uint32_t _number_success_backup,
+ uint32_t _number_fail_backup)
+ : number_success_backup(_number_success_backup),
+ number_fail_backup(_number_fail_backup) {}
+
+ ~BackupStatistics() {}
+
+ void IncrementNumberSuccessBackup();
+ void IncrementNumberFailBackup();
+
+ uint32_t GetNumberSuccessBackup() const;
+ uint32_t GetNumberFailBackup() const;
+
+ std::string ToString() const;
+
+ private:
+ uint32_t number_success_backup;
+ uint32_t number_fail_backup;
+};
+
+// Read-only functions of a BackupEngine. (Restore writes to another directory
+// not the backup directory.) See BackupEngine comments for details on
+// safe concurrent operations.
+class BackupEngineReadOnlyBase {
+ public:
+ virtual ~BackupEngineReadOnlyBase() {}
+
+ // Returns info about the latest good backup in backup_info, or NotFound if
+ // no good backup exists.
+ // Setting include_file_details=true provides information about each
+ // backed-up file in BackupInfo::file_details and more.
+ virtual Status GetLatestBackupInfo(
+ BackupInfo* backup_info, bool include_file_details = false) const = 0;
+
+ // Returns info about a specific backup in backup_info, or NotFound
+ // or Corruption status if the requested backup id does not exist or is
+ // known corrupt.
+ // Setting include_file_details=true provides information about each
+ // backed-up file in BackupInfo::file_details and more.
+ virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+ bool include_file_details = false) const = 0;
+
+ // Returns info about non-corrupt backups in backup_infos.
+ // Setting include_file_details=true provides information about each
+ // backed-up file in BackupInfo::file_details and more.
+ virtual void GetBackupInfo(std::vector<BackupInfo>* backup_infos,
+ bool include_file_details = false) const = 0;
+
+ // Returns info about corrupt backups in corrupt_backups.
+ // WARNING: Any write to the BackupEngine could trigger automatic
+ // GarbageCollect(), which could delete files that would be needed to
+ // manually recover a corrupt backup or to preserve an unrecognized (e.g.
+ // incompatible future version) backup.
+ virtual void GetCorruptedBackups(
+ std::vector<BackupID>* corrupt_backup_ids) const = 0;
+
+ // Restore to specified db_dir and wal_dir from backup_id.
+ virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+ BackupID backup_id,
+ const std::string& db_dir,
+ const std::string& wal_dir) const = 0;
+
+ // keep for backward compatibility.
+ virtual IOStatus RestoreDBFromBackup(
+ BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& options = RestoreOptions()) const {
+ return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+ }
+
+ // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id
+ virtual IOStatus RestoreDBFromLatestBackup(
+ const RestoreOptions& options, const std::string& db_dir,
+ const std::string& wal_dir) const = 0;
+
+ // keep for backward compatibility.
+ virtual IOStatus RestoreDBFromLatestBackup(
+ const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& options = RestoreOptions()) const {
+ return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
+ }
+
+ // If verify_with_checksum is true, this function
+ // inspects the current checksums and file sizes of backup files to see if
+ // they match our expectation.
+ //
+ // If verify_with_checksum is false, this function
+ // checks that each file exists and that the size of the file matches our
+ // expectation. It does not check file checksum.
+ //
+ // If this BackupEngine created the backup, it compares the files' current
+ // sizes (and current checksum) against the number of bytes written to
+ // them (and the checksum calculated) during creation.
+ // Otherwise, it compares the files' current sizes (and checksums) against
+ // their sizes (and checksums) when the BackupEngine was opened.
+ //
+ // Returns Status::OK() if all checks are good
+ virtual IOStatus VerifyBackup(BackupID backup_id,
+ bool verify_with_checksum = false) const = 0;
+};
+
+// Append-only functions of a BackupEngine. See BackupEngine comment for
+// details on distinction between Append and Write operations and safe
+// concurrent operations.
+class BackupEngineAppendOnlyBase {
+ public:
+ virtual ~BackupEngineAppendOnlyBase() {}
+
+ // same as CreateNewBackup, but stores extra application metadata.
+ virtual IOStatus CreateNewBackupWithMetadata(
+ const CreateBackupOptions& options, DB* db,
+ const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0;
+
+ // keep here for backward compatibility.
+ virtual IOStatus CreateNewBackupWithMetadata(
+ DB* db, const std::string& app_metadata, bool flush_before_backup = false,
+ std::function<void()> progress_callback = []() {}) {
+ CreateBackupOptions options;
+ options.flush_before_backup = flush_before_backup;
+ options.progress_callback = progress_callback;
+ return CreateNewBackupWithMetadata(options, db, app_metadata);
+ }
+
+ // Captures the state of the database by creating a new (latest) backup.
+ // On success (OK status), the BackupID of the new backup is saved to
+ // *new_backup_id when not nullptr.
+ // NOTE: db_paths and cf_paths are not supported for creating backups,
+ // and NotSupported will be returned when the DB (without WALs) uses more
+ // than one directory.
+ virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db,
+ BackupID* new_backup_id = nullptr) {
+ return CreateNewBackupWithMetadata(options, db, "", new_backup_id);
+ }
+
+ // keep here for backward compatibility.
+ virtual IOStatus CreateNewBackup(
+ DB* db, bool flush_before_backup = false,
+ std::function<void()> progress_callback = []() {}) {
+ CreateBackupOptions options;
+ options.flush_before_backup = flush_before_backup;
+ options.progress_callback = progress_callback;
+ return CreateNewBackup(options, db);
+ }
+
+ // Call this from another thread if you want to stop the backup
+ // that is currently happening. It returns immediately and will
+ // not wait for the backup to stop.
+ // The backup will stop ASAP and the call to CreateNewBackup will
+ // return Status::Incomplete(). It will not clean up after itself, but
+ // the state will remain consistent. The state will be cleaned up the
+ // next time you call CreateNewBackup or GarbageCollect.
+ virtual void StopBackup() = 0;
+
+ // Will delete any files left over from incomplete creation or deletion of
+ // a backup. This is not normally needed as those operations also clean up
+ // after prior incomplete calls to the same kind of operation (create or
+ // delete). This does not delete corrupt backups but can delete files that
+ // would be needed to manually recover a corrupt backup or to preserve an
+ // unrecognized (e.g. incompatible future version) backup.
+ // NOTE: This is not designed to delete arbitrary files added to the backup
+ // directory outside of BackupEngine, and clean-up is always subject to
+ // permissions on and availability of the underlying filesystem.
+ // NOTE2: For concurrency and interference purposes (see BackupEngine
+ // comment), GarbageCollect (GC) is like other Append operations, even
+ // though it seems different. Although GC can delete physical data, it does
+ // not delete any logical data read by Read operations. GC can interfere
+ // with Append or Write operations in another BackupEngine on the same
+ // backup_dir, because temporary files will be treated as obsolete and
+ // deleted.
+ virtual IOStatus GarbageCollect() = 0;
+};
+
+// A backup engine for organizing and managing backups.
+// This class is not user-extensible.
+//
+// This class declaration adds "Write" operations in addition to the
+// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase.
+//
+// # Concurrency between threads on the same BackupEngine* object
+//
+// As of version 6.20, BackupEngine* operations are generally thread-safe,
+// using a read-write lock, though single-thread operation is still
+// recommended to avoid TOCTOU bugs. Specifically, particular kinds of
+// concurrent operations behave like this:
+//
+// op1\op2| Read | Append | Write
+// -------|-------|--------|--------
+// Read | conc | block | block
+// Append | block | block | block
+// Write | block | block | block
+//
+// conc = operations safely proceed concurrently
+// block = one of the operations safely blocks until the other completes.
+// There is generally no guarantee as to which completes first.
+//
+// StopBackup is the only operation that affects an ongoing operation.
+//
+// # Interleaving operations between BackupEngine* objects open on the
+// same backup_dir
+//
+// It is recommended only to have one BackupEngine* object open for a given
+// backup_dir, but it is possible to mix / interleave some operations
+// (regardless of whether they are concurrent) with these caveats:
+//
+// op1\op2| Open | Read | Append | Write
+// -------|--------|--------|--------|--------
+// Open | conc | conc | atomic | unspec
+// Read | conc | conc | old | unspec
+// Append | atomic | old | unspec | unspec
+// Write | unspec | unspec | unspec | unspec
+//
+// Special case: Open with destroy_old_data=true is really a Write
+//
+// conc = operations safely proceed, concurrently when applicable
+// atomic = operations are effectively atomic; if a concurrent Append
+// operation has not completed at some key point during Open, the
+// opened BackupEngine* will never see the result of the Append op.
+// old = Read operations do not include any state changes from other
+// BackupEngine* objects; they return the state at their Open time.
+// unspec = Behavior is unspecified, including possibly trashing the
+// backup_dir, but is "memory safe" (no C++ undefined behavior)
+//
+class BackupEngine : public BackupEngineReadOnlyBase,
+ public BackupEngineAppendOnlyBase {
+ public:
+ virtual ~BackupEngine() {}
+
+ // BackupEngineOptions have to be the same as the ones used in previous
+ // BackupEngines for the same backup directory.
+ static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+ BackupEngine** backup_engine_ptr);
+
+ // keep for backward compatibility.
+ static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+ BackupEngine** backup_engine_ptr) {
+ return BackupEngine::Open(options, db_env, backup_engine_ptr);
+ }
+
+ // Deletes old backups, keeping latest num_backups_to_keep alive.
+ // See also DeleteBackup.
+ virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+
+ // Deletes a specific backup. If this operation (or PurgeOldBackups)
+ // is not completed due to crash, power failure, etc. the state
+ // will be cleaned up the next time you call DeleteBackup,
+ // PurgeOldBackups, or GarbageCollect.
+ virtual IOStatus DeleteBackup(BackupID backup_id) = 0;
+};
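+
+// A minimal usage sketch (illustrative only; assumes an open DB* `db` and
+// error handling reduced to asserts):
+//
+//   BackupEngine* backup_engine;
+//   IOStatus s = BackupEngine::Open(BackupEngineOptions("/path/to/backups"),
+//                                   Env::Default(), &backup_engine);
+//   assert(s.ok());
+//   s = backup_engine->CreateNewBackup(CreateBackupOptions(), db);
+//   assert(s.ok());
+//   s = backup_engine->PurgeOldBackups(5);  // Keep only the 5 latest backups.
+//   assert(s.ok());
+//   delete backup_engine;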
+
+// A variant of BackupEngine that only allows "Read" operations. See
+// BackupEngine comment for details. This class is not user-extensible.
+class BackupEngineReadOnly : public BackupEngineReadOnlyBase {
+ public:
+ virtual ~BackupEngineReadOnly() {}
+
+ static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+ BackupEngineReadOnly** backup_engine_ptr);
+ // keep for backward compatibility.
+ static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+ BackupEngineReadOnly** backup_engine_ptr) {
+ return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr);
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
new file mode 100644
index 000000000..fde03db7e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
@@ -0,0 +1,142 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The classes and functions in this header file are used for dumping out the
+// blocks in a block cache, storing or transferring the blocks to another
+// destination host, and loading these blocks into the secondary cache at the
+// destination host.
+// NOTE: the classes, functions, and data structures are EXPERIMENTAL! They
+// may be changed in the future when the development continues.
+
+// The major and minor version number of the data format to be stored or
+// transferred via CacheDumpWriter and read out via CacheDumpReader.
+static const int kCacheDumpMajorVersion = 0;
+static const int kCacheDumpMinorVersion = 1;
+
+// NOTE: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to write or transfer the data that is created by
+// CacheDumper. We pack one block with its block type, dump time, block key in
+// the block cache, block length, block crc32c checksum, and the block itself
+// as a unit, and it is stored via WritePacket. Before we call WritePacket, we
+// must call WriteMetadata once, which stores the sequence number, block unit
+// checksum, and block unit size.
+// We provide a file-based CacheDumpWriter that stores the metadata and its
+// packets sequentially in a file as the default implementation. Users can
+// implement their own CacheDumpWriter to store/transfer the data. For
+// example, a user can create a subclass which transfers the metadata and
+// packets on the fly.
+class CacheDumpWriter {
+ public:
+ virtual ~CacheDumpWriter() = default;
+
+ // Called ONCE before the calls to WritePacket
+ virtual IOStatus WriteMetadata(const Slice& metadata) = 0;
+ virtual IOStatus WritePacket(const Slice& data) = 0;
+ virtual IOStatus Close() = 0;
+};
+
+// NOTE: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to read or receive the data that is stored
+// or transferred by CacheDumpWriter. Note that ReadMetadata must be called
+// once before we call ReadPacket.
+class CacheDumpReader {
+ public:
+ virtual ~CacheDumpReader() = default;
+ // Called ONCE before the calls to ReadPacket
+ virtual IOStatus ReadMetadata(std::string* metadata) = 0;
+ // Sets data to empty string on EOF
+ virtual IOStatus ReadPacket(std::string* data) = 0;
+ // (Close not needed)
+};
+
+// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any
+// dump- or load-related control variables can be added here.
+struct CacheDumpOptions {
+ SystemClock* clock;
+};
+
+// NOTE: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to dump out the blocks in the block cache and
+// store/transfer them via a CacheDumpWriter. In order to dump out the blocks
+// belonging to a certain DB or a list of DBs (a block cache can be shared by
+// many DBs), the user needs to call SetDumpFilter to specify a list of DBs so
+// that blocks not belonging to those DBs are filtered out.
+// A typical use case is migrating a DB instance from host A to host B. We
+// need to reopen the DB at host B after all the files are copied to host B.
+// At this moment, the block cache at host B does not have any block from the
+// migrated DB, so the read performance can be low due to cache warm-up. By
+// using CacheDumper before we shut down the DB at host A, and
+// CacheDumpedLoader at host B before we reopen the DB, we can warm up the
+// cache ahead of time. This mechanism can be used in other scenarios as well.
+class CacheDumper {
+ public:
+ virtual ~CacheDumper() = default;
+ // Only dump the blocks in the block cache that belong to the DBs in this list
+ virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+ (void)db_list;
+ return Status::NotSupported("SetDumpFilter is not supported");
+ }
+ // The main function to dump out all the blocks that satisfy the filter
+ // condition from block cache to a certain CacheDumpWriter in one shot. This
+ // process may take some time.
+ virtual IOStatus DumpCacheEntriesToWriter() {
+ return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported");
+ }
+};
+
+// NOTE: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to load the dumped blocks into the destination cache. For
+// now we only load the blocks into the SecondaryCache. In the future, we may
+// plan to support loading into the block cache.
+class CacheDumpedLoader {
+ public:
+ virtual ~CacheDumpedLoader() = default;
+ virtual IOStatus RestoreCacheEntriesToSecondaryCache() {
+ return IOStatus::NotSupported(
+ "RestoreCacheEntriesToSecondaryCache is not supported");
+ }
+};
+
+// Returns a writer that stores the metadata and data sequentially in a file
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_opts,
+ const std::string& file_name,
+ std::unique_ptr<CacheDumpWriter>* writer);
+
+// Returns a reader that reads the metadata and data sequentially from a file
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_opts,
+ const std::string& file_name,
+ std::unique_ptr<CacheDumpReader>* reader);
+
+// Returns the default cache dumper
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+ const std::shared_ptr<Cache>& cache,
+ std::unique_ptr<CacheDumpWriter>&& writer,
+ std::unique_ptr<CacheDumper>* cache_dumper);
+
+// Returns the default cache dump loader
+Status NewDefaultCacheDumpedLoader(
+ const CacheDumpOptions& dump_options,
+ const BlockBasedTableOptions& toptions,
+ const std::shared_ptr<SecondaryCache>& secondary_cache,
+ std::unique_ptr<CacheDumpReader>&& reader,
+ std::unique_ptr<CacheDumpedLoader>* cache_dump_loader);
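+
+// Example (an illustrative sketch, not part of the API): dump the block
+// cache of a DB to a file before shutting it down, then load the dump into
+// a secondary cache before reopening. The file path is hypothetical, and
+// `db` (DB*), `cache` (shared_ptr<Cache>), `toptions`, and `secondary_cache`
+// are assumed to exist in the caller's context.
+//
+//   CacheDumpOptions dump_opts;
+//   dump_opts.clock = SystemClock::Default().get();
+//
+//   // On the source host: dump the filtered cache contents to a file.
+//   std::unique_ptr<CacheDumpWriter> writer;
+//   IOStatus ios = NewToFileCacheDumpWriter(
+//       FileSystem::Default(), FileOptions(), "/tmp/cache_dump", &writer);
+//   std::unique_ptr<CacheDumper> dumper;
+//   Status s =
+//       NewDefaultCacheDumper(dump_opts, cache, std::move(writer), &dumper);
+//   s = dumper->SetDumpFilter({db});
+//   ios = dumper->DumpCacheEntriesToWriter();
+//
+//   // On the destination host: restore the dump into the secondary cache.
+//   std::unique_ptr<CacheDumpReader> reader;
+//   ios = NewFromFileCacheDumpReader(
+//       FileSystem::Default(), FileOptions(), "/tmp/cache_dump", &reader);
+//   std::unique_ptr<CacheDumpedLoader> loader;
+//   s = NewDefaultCacheDumpedLoader(dump_opts, toptions, secondary_cache,
+//                                   std::move(reader), &loader);
+//   ios = loader->RestoreCacheEntriesToSecondaryCache();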
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 000000000..ecf920616
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;
+
+class Checkpoint {
+ public:
+ // Creates a Checkpoint object to be used for creating openable snapshots
+ static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+ // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an
+ // absolute path. The specified directory should not exist, since it will be
+ // created by the API.
+ // When a checkpoint is created,
+ // (1) SST and blob files are hard linked if the output directory is on the
+ // same filesystem as the database, and copied otherwise.
+ // (2) other required files (like MANIFEST) are always copied.
+  // log_size_for_flush: if the total log file size is equal to or larger
+  // than this value, then a flush is triggered for all the column families.
+  // The default value is 0, which means a flush is always triggered. If you
+  // move away from the default, the checkpoint may not contain up-to-date
+  // data if WAL writing is not always enabled.
+  // A flush is always triggered if the DB uses two-phase commit (2PC).
+ // sequence_number_ptr: if it is not nullptr, the value it points to will be
+ // set to a sequence number guaranteed to be part of the DB, not necessarily
+ // the latest. The default value of this parameter is nullptr.
+ // NOTE: db_paths and cf_paths are not supported for creating checkpoints
+ // and NotSupported will be returned when the DB (without WALs) uses more
+ // than one directory.
+ virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
+ uint64_t log_size_for_flush = 0,
+ uint64_t* sequence_number_ptr = nullptr);
+
+ // Exports all live SST files of a specified Column Family onto export_dir,
+ // returning SST files information in metadata.
+ // - SST files will be created as hard links when the directory specified
+ // is in the same partition as the db directory, copied otherwise.
+ // - export_dir should not already exist and will be created by this API.
+ // - Always triggers a flush.
+ virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
+ const std::string& export_dir,
+ ExportImportFilesMetaData** metadata);
+
+ virtual ~Checkpoint() {}
+};
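+
+// Example (an illustrative sketch): creating an openable snapshot of a
+// running DB. The destination directory below is hypothetical and must not
+// already exist; `db` is assumed to be an open DB*.
+//
+//   Checkpoint* checkpoint = nullptr;
+//   Status s = Checkpoint::Create(db, &checkpoint);
+//   if (s.ok()) {
+//     s = checkpoint->CreateCheckpoint("/path/to/checkpoint_dir");
+//   }
+//   delete checkpoint;
+//   // The checkpoint directory can now be opened as a regular DB.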
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
new file mode 100644
index 000000000..f61afd69e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// This file was moved to rocksdb/convenience.h
+
+#include "rocksdb/convenience.h"
diff --git a/src/rocksdb/include/rocksdb/utilities/customizable_util.h b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
new file mode 100644
index 000000000..62240763b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// The methods in this file are used to instantiate new Customizable
+// instances of objects. These methods are most typically used by
+// the "CreateFromString" method of a customizable class.
+// If not developing a new Type of customizable class, you probably
+// do not need the methods in this file.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use customizable objects
+
+#pragma once
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The FactoryFunc functions are used to create a new customizable object
+// without going through the ObjectRegistry. This approach is especially
+// useful in LITE mode, where there is no ObjectRegistry. The functions take
+// an ID of the object to create and a pointer in which to store the created
+// object. If the factory recognizes the input ID, it should return true;
+// otherwise it should return false. On success, the object parameter
+// contains the new object.
+template <typename T>
+using SharedFactoryFunc =
+ std::function<bool(const std::string&, std::shared_ptr<T>*)>;
+
+template <typename T>
+using UniqueFactoryFunc =
+ std::function<bool(const std::string&, std::unique_ptr<T>*)>;
+
+template <typename T>
+using StaticFactoryFunc = std::function<bool(const std::string&, T**)>;
+
+// Creates a new shared customizable instance object based on the
+// input parameters using the object registry.
+//
+// The id parameter specifies the instance class of the object to create.
+// The opt_map parameter specifies the configuration of the new instance.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created. This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewSharedObject(
+ const ConfigOptions& config_options, const std::string& id,
+ const std::unordered_map<std::string, std::string>& opt_map,
+ std::shared_ptr<T>* result) {
+ if (!id.empty()) {
+ Status status;
+#ifndef ROCKSDB_LITE
+ status = config_options.registry->NewSharedObject(id, result);
+#else
+ status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif // ROCKSDB_LITE
+ if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+ status = Status::OK();
+ } else if (status.ok()) {
+ status = Customizable::ConfigureNewObject(config_options, result->get(),
+ opt_map);
+ }
+ return status;
+ } else if (opt_map.empty()) {
+ // There was no ID and no map (everything empty), so reset/clear the result
+ result->reset();
+ return Status::OK();
+ } else {
+ return Status::NotSupported("Cannot reset object ");
+ }
+}
+
+// Creates a new managed customizable instance object based on the
+// input parameters using the object registry. Unlike "shared" objects,
+// managed objects are limited to a single instance per ID.
+//
+// The id parameter specifies the instance class of the object to create.
+// If an object with this id exists in the registry, the existing object
+// will be returned. If the object does not exist, a new one will be created.
+//
+// The opt_map parameter specifies the configuration of the new instance.
+// If the object already exists, the existing object is returned "as is" and
+// this parameter is ignored.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the object. This string
+// will be used by the object registry to locate the appropriate object to
+// create or return.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The managed instance.
+template <typename T>
+static Status NewManagedObject(
+ const ConfigOptions& config_options, const std::string& id,
+ const std::unordered_map<std::string, std::string>& opt_map,
+ std::shared_ptr<T>* result) {
+ Status status;
+ if (!id.empty()) {
+#ifndef ROCKSDB_LITE
+ status = config_options.registry->GetOrCreateManagedObject<T>(
+ id, result, [config_options, opt_map](T* object) {
+ return object->ConfigureFromMap(config_options, opt_map);
+ });
+#else
+ (void)result;
+ (void)opt_map;
+ status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif // ROCKSDB_LITE
+ if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+ return Status::OK();
+ }
+ } else {
+ status = Status::NotSupported("Cannot reset object ");
+ }
+ return status;
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+// This method parses the input value to determine the type of instance to
+// create. If there is an existing instance (in result) and it is the same ID
+// as the object being created, the existing configuration is stored and used as
+// the default for the new object.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings. If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+//    name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadSharedObject(const ConfigOptions& config_options,
+ const std::string& value,
+ const SharedFactoryFunc<T>& func,
+ std::shared_ptr<T>* result) {
+ std::string id;
+ std::unordered_map<std::string, std::string> opt_map;
+
+ Status status = Customizable::GetOptionsMap(config_options, result->get(),
+ value, &id, &opt_map);
+ if (!status.ok()) { // GetOptionsMap failed
+ return status;
+ } else if (func == nullptr ||
+ !func(id, result)) { // No factory, or it failed
+ return NewSharedObject(config_options, id, opt_map, result);
+ } else {
+ return Customizable::ConfigureNewObject(config_options, result->get(),
+ opt_map);
+ }
+}
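+
+// Example (an illustrative sketch): how a hypothetical Customizable type
+// "MyCustomizable" might implement CreateFromString by delegating to
+// LoadSharedObject, with a factory that short-circuits one built-in id
+// before falling back to the ObjectRegistry. All names here are
+// assumptions, not part of this header.
+//
+//   static Status CreateFromString(const ConfigOptions& config_options,
+//                                  const std::string& value,
+//                                  std::shared_ptr<MyCustomizable>* result) {
+//     return LoadSharedObject<MyCustomizable>(
+//         config_options, value,
+//         [](const std::string& id, std::shared_ptr<MyCustomizable>* obj) {
+//           if (id == "Default") {
+//             obj->reset(new MyDefaultCustomizable());  // hypothetical
+//             return true;   // Factory recognized the id
+//           }
+//           return false;    // Defer to the ObjectRegistry
+//         },
+//         result);
+//   }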
+
+// Creates a new shared Customizable object based on the input parameters.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings. If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The "id" field from the value (either the whole field or "id=XX") is used
+// to determine the type/id of the object to return. For a given id, the
+// same instance of the object will be returned from this method (as opposed
+// to LoadSharedObject, which would create different objects for the same id).
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+//    name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadManagedObject(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<T>* result) {
+ std::string id;
+ std::unordered_map<std::string, std::string> opt_map;
+ Status status = Customizable::GetOptionsMap(config_options, nullptr, value,
+ &id, &opt_map);
+ if (!status.ok()) { // GetOptionsMap failed
+ return status;
+ } else if (value.empty()) { // No Id and no options. Clear the object
+ *result = nullptr;
+ return Status::OK();
+ } else {
+ return NewManagedObject(config_options, id, opt_map, result);
+ }
+}
+
+// Creates a new unique pointer customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created. This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewUniqueObject(
+ const ConfigOptions& config_options, const std::string& id,
+ const std::unordered_map<std::string, std::string>& opt_map,
+ std::unique_ptr<T>* result) {
+ if (!id.empty()) {
+ Status status;
+#ifndef ROCKSDB_LITE
+ status = config_options.registry->NewUniqueObject(id, result);
+#else
+ status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif // ROCKSDB_LITE
+ if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+ status = Status::OK();
+ } else if (status.ok()) {
+ status = Customizable::ConfigureNewObject(config_options, result->get(),
+ opt_map);
+ }
+ return status;
+ } else if (opt_map.empty()) {
+ // There was no ID and no map (everything empty), so reset/clear the result
+ result->reset();
+ return Status::OK();
+ } else {
+ return Status::NotSupported("Cannot reset object ");
+ }
+}
+
+// Creates a new unique customizable instance object based on the input
+// parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+//    name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadUniqueObject(const ConfigOptions& config_options,
+ const std::string& value,
+ const UniqueFactoryFunc<T>& func,
+ std::unique_ptr<T>* result) {
+ std::string id;
+ std::unordered_map<std::string, std::string> opt_map;
+ Status status = Customizable::GetOptionsMap(config_options, result->get(),
+ value, &id, &opt_map);
+ if (!status.ok()) { // GetOptionsMap failed
+ return status;
+ } else if (func == nullptr ||
+ !func(id, result)) { // No factory, or it failed
+ return NewUniqueObject(config_options, id, opt_map, result);
+ } else {
+ return Customizable::ConfigureNewObject(config_options, result->get(),
+ opt_map);
+ }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created. This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewStaticObject(
+ const ConfigOptions& config_options, const std::string& id,
+ const std::unordered_map<std::string, std::string>& opt_map, T** result) {
+ if (!id.empty()) {
+ Status status;
+#ifndef ROCKSDB_LITE
+ status = config_options.registry->NewStaticObject(id, result);
+#else
+ status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif // ROCKSDB_LITE
+ if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+ status = Status::OK();
+ } else if (status.ok()) {
+ status =
+ Customizable::ConfigureNewObject(config_options, *result, opt_map);
+ }
+ return status;
+ } else if (opt_map.empty()) {
+ // There was no ID and no map (everything empty), so reset/clear the result
+ *result = nullptr;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("Cannot reset object ");
+ }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+//    name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadStaticObject(const ConfigOptions& config_options,
+ const std::string& value,
+ const StaticFactoryFunc<T>& func, T** result) {
+ std::string id;
+ std::unordered_map<std::string, std::string> opt_map;
+ Status status = Customizable::GetOptionsMap(config_options, *result, value,
+ &id, &opt_map);
+ if (!status.ok()) { // GetOptionsMap failed
+ return status;
+ } else if (func == nullptr ||
+ !func(id, result)) { // No factory, or it failed
+ return NewStaticObject(config_options, id, opt_map, result);
+ } else {
+ return Customizable::ConfigureNewObject(config_options, *result, opt_map);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 000000000..d57e7473a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+// meant to be removed from the db in a non-strict 'ttl' amount of time
+// Therefore, this guarantees that key-values inserted will remain in the
+// db for >= ttl amount of time and the db will make efforts to remove the
+// key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values are deleted in compaction only: (Timestamp+ttl < time_now)
+// Get/Iterator may return expired entries (compaction has not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+// triggered (neither manual nor automatic), so no expired entries are removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will read
+// corrupt values (timestamp suffixed) and the ttl will have no effect
+// during the second Open, so use this API consistently to open the db.
+// Be careful when passing a small positive ttl value because the
+// whole database may be deleted in a short amount of time.
+
+class DBWithTTL : public StackableDB {
+ public:
+ virtual Status CreateColumnFamilyWithTtl(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ ColumnFamilyHandle** handle, int ttl) = 0;
+
+ static Status Open(const Options& options, const std::string& dbname,
+ DBWithTTL** dbptr, int32_t ttl = 0,
+ bool read_only = false);
+
+ static Status Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ DBWithTTL** dbptr, const std::vector<int32_t>& ttls,
+ bool read_only = false);
+
+ virtual void SetTtl(int32_t ttl) = 0;
+
+ virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
+
+ protected:
+ explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
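+
+// Example (an illustrative sketch): open a TTL database whose entries become
+// eligible for deletion, at compaction time, 3600 seconds after insertion.
+// The path is hypothetical.
+//
+//   DBWithTTL* db = nullptr;
+//   Options options;
+//   options.create_if_missing = true;
+//   Status s = DBWithTTL::Open(options, "/path/to/ttl_db", &db, 3600);
+//   if (s.ok()) {
+//     s = db->Put(WriteOptions(), "key", "value");
+//     // Entries older than ttl are dropped by subsequent compactions.
+//     delete db;
+//   }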
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h
new file mode 100644
index 000000000..0e0526557
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/debug.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Data associated with a particular version of a key. A database may internally
+// store multiple versions of the same user key due to snapshots, compaction not
+// happening yet, etc.
+struct KeyVersion {
+ KeyVersion() : user_key(""), value(""), sequence(0), type(0) {}
+
+ KeyVersion(const std::string& _user_key, const std::string& _value,
+ SequenceNumber _sequence, int _type)
+ : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
+
+ std::string user_key;
+ std::string value;
+ SequenceNumber sequence;
+ int type;
+ std::string GetTypeName() const;
+};
+
+// Returns a listing of all versions of keys in the provided user key range.
+// The range is inclusive on both ends, i.e., [`begin_key`, `end_key`]. The
+// listing stops early once `max_num_ikeys` versions have been collected.
+// Since all the returned keys are copied to memory, the memory usage may
+// be huge if the range covers too many keys, so `max_num_ikeys` can be used
+// to cap the memory usage. The result is inserted into the provided vector,
+// `key_versions`.
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+ size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions);
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+ Slice end_key, size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions);
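+
+// Example (an illustrative sketch): list up to 1000 internal key versions in
+// the user key range ["a", "z"]; `db` is assumed to be an open DB*.
+//
+//   std::vector<KeyVersion> versions;
+//   Status s = GetAllKeyVersions(db, "a", "z", 1000 /* max_num_ikeys */,
+//                                &versions);
+//   for (const KeyVersion& kv : versions) {
+//     // kv.user_key, kv.sequence, and kv.GetTypeName() describe each
+//     // internal version of the key.
+//   }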
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
new file mode 100644
index 000000000..ffde5effa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
@@ -0,0 +1,181 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// EnvMirror is an Env implementation that mirrors all file-related
+// operations to two backing Env's (provided at construction time).
+// Writes are mirrored. For read operations, we do the read from both
+// backends and assert that the results match.
+//
+// This is useful when implementing a new Env and ensuring that the
+// semantics and behavior are correct (in that they match that of an
+// existing, stable Env, like the default POSIX one).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SequentialFileMirror;
+class RandomAccessFileMirror;
+class WritableFileMirror;
+
+class EnvMirror : public EnvWrapper {
+ Env *a_, *b_;
+ bool free_a_, free_b_;
+
+ public:
+ EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false)
+ : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {}
+ ~EnvMirror() {
+ if (free_a_) delete a_;
+ if (free_b_) delete b_;
+ }
+
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) override;
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) override;
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+ virtual Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ std::unique_ptr<Directory> br;
+ Status as = a_->NewDirectory(name, result);
+ Status bs = b_->NewDirectory(name, &br);
+ assert(as == bs);
+ return as;
+ }
+ Status FileExists(const std::string& f) override {
+ Status as = a_->FileExists(f);
+ Status bs = b_->FileExists(f);
+ assert(as == bs);
+ return as;
+ }
+#if defined(_MSC_VER)
+#pragma warning(push)
+// logical operation on address of string constant
+#pragma warning(disable : 4130)
+#endif
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* r) override {
+ std::vector<std::string> ar, br;
+ Status as = a_->GetChildren(dir, &ar);
+ Status bs = b_->GetChildren(dir, &br);
+ assert(as == bs);
+ std::sort(ar.begin(), ar.end());
+ std::sort(br.begin(), br.end());
+ if (!as.ok() || ar != br) {
+ assert(0 == "getchildren results don't match");
+ }
+ *r = ar;
+ return as;
+ }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+ Status DeleteFile(const std::string& f) override {
+ Status as = a_->DeleteFile(f);
+ Status bs = b_->DeleteFile(f);
+ assert(as == bs);
+ return as;
+ }
+ Status CreateDir(const std::string& d) override {
+ Status as = a_->CreateDir(d);
+ Status bs = b_->CreateDir(d);
+ assert(as == bs);
+ return as;
+ }
+ Status CreateDirIfMissing(const std::string& d) override {
+ Status as = a_->CreateDirIfMissing(d);
+ Status bs = b_->CreateDirIfMissing(d);
+ assert(as == bs);
+ return as;
+ }
+ Status DeleteDir(const std::string& d) override {
+ Status as = a_->DeleteDir(d);
+ Status bs = b_->DeleteDir(d);
+ assert(as == bs);
+ return as;
+ }
+ Status GetFileSize(const std::string& f, uint64_t* s) override {
+ uint64_t asize, bsize;
+ Status as = a_->GetFileSize(f, &asize);
+ Status bs = b_->GetFileSize(f, &bsize);
+ assert(as == bs);
+ assert(!as.ok() || asize == bsize);
+ *s = asize;
+ return as;
+ }
+
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override {
+ uint64_t amtime, bmtime;
+ Status as = a_->GetFileModificationTime(fname, &amtime);
+ Status bs = b_->GetFileModificationTime(fname, &bmtime);
+ assert(as == bs);
+ assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000);
+ *file_mtime = amtime;
+ return as;
+ }
+
+ Status RenameFile(const std::string& s, const std::string& t) override {
+ Status as = a_->RenameFile(s, t);
+ Status bs = b_->RenameFile(s, t);
+ assert(as == bs);
+ return as;
+ }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ Status as = a_->LinkFile(s, t);
+ Status bs = b_->LinkFile(s, t);
+ assert(as == bs);
+ return as;
+ }
+
+ class FileLockMirror : public FileLock {
+ public:
+ FileLock *a_, *b_;
+ FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {}
+ };
+
+ Status LockFile(const std::string& f, FileLock** l) override {
+ FileLock *al, *bl;
+ Status as = a_->LockFile(f, &al);
+ Status bs = b_->LockFile(f, &bl);
+ assert(as == bs);
+ if (as.ok()) *l = new FileLockMirror(al, bl);
+ return as;
+ }
+
+ Status UnlockFile(FileLock* l) override {
+ FileLockMirror* ml = static_cast<FileLockMirror*>(l);
+ Status as = a_->UnlockFile(ml->a_);
+ Status bs = b_->UnlockFile(ml->b_);
+ assert(as == bs);
+ delete ml;
+ return as;
+ }
+};
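+
+// Example (an illustrative sketch): mirror the default POSIX Env against a
+// hypothetical Env under development, so that any behavioral divergence
+// trips the asserts above. MyExperimentalEnv is an assumption, not a
+// provided class.
+//
+//   Env* custom_env = new MyExperimentalEnv();
+//   EnvMirror* mirror = new EnvMirror(Env::Default(), custom_env,
+//                                     false /* free_a */, true /* free_b */);
+//   Options options;
+//   options.env = mirror;
+//   // Open and exercise the DB as usual; mismatched results between the
+//   // two Envs fail an assertion.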
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
new file mode 100644
index 000000000..824f8a3df
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This function can be used to list the information log files,
+// given the db pointer.
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
new file mode 100644
index 000000000..007638192
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
@@ -0,0 +1,318 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/ldb_cmd_execute_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommand {
+ public:
+ // Command-line arguments
+ static const std::string ARG_ENV_URI;
+ static const std::string ARG_FS_URI;
+ static const std::string ARG_DB;
+ static const std::string ARG_PATH;
+ static const std::string ARG_SECONDARY_PATH;
+ static const std::string ARG_HEX;
+ static const std::string ARG_KEY_HEX;
+ static const std::string ARG_VALUE_HEX;
+ static const std::string ARG_CF_NAME;
+ static const std::string ARG_TTL;
+ static const std::string ARG_TTL_START;
+ static const std::string ARG_TTL_END;
+ static const std::string ARG_TIMESTAMP;
+ static const std::string ARG_TRY_LOAD_OPTIONS;
+ static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
+ static const std::string ARG_FROM;
+ static const std::string ARG_TO;
+ static const std::string ARG_MAX_KEYS;
+ static const std::string ARG_BLOOM_BITS;
+ static const std::string ARG_FIX_PREFIX_LEN;
+ static const std::string ARG_COMPRESSION_TYPE;
+ static const std::string ARG_COMPRESSION_MAX_DICT_BYTES;
+ static const std::string ARG_BLOCK_SIZE;
+ static const std::string ARG_AUTO_COMPACTION;
+ static const std::string ARG_DB_WRITE_BUFFER_SIZE;
+ static const std::string ARG_WRITE_BUFFER_SIZE;
+ static const std::string ARG_FILE_SIZE;
+ static const std::string ARG_CREATE_IF_MISSING;
+ static const std::string ARG_NO_VALUE;
+ static const std::string ARG_DISABLE_CONSISTENCY_CHECKS;
+ static const std::string ARG_ENABLE_BLOB_FILES;
+ static const std::string ARG_MIN_BLOB_SIZE;
+ static const std::string ARG_BLOB_FILE_SIZE;
+ static const std::string ARG_BLOB_COMPRESSION_TYPE;
+ static const std::string ARG_ENABLE_BLOB_GARBAGE_COLLECTION;
+ static const std::string ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF;
+ static const std::string ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD;
+ static const std::string ARG_BLOB_COMPACTION_READAHEAD_SIZE;
+ static const std::string ARG_BLOB_FILE_STARTING_LEVEL;
+ static const std::string ARG_PREPOPULATE_BLOB_CACHE;
+ static const std::string ARG_DECODE_BLOB_INDEX;
+ static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS;
+
+ struct ParsedParams {
+ std::string cmd;
+ std::vector<std::string> cmd_params;
+ std::map<std::string, std::string> option_map;
+ std::vector<std::string> flags;
+ };
+
+ static LDBCommand* SelectCommand(const ParsedParams& parsed_parms);
+
+ static LDBCommand* InitFromCmdLineArgs(
+ const std::vector<std::string>& args, const Options& options,
+ const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families,
+ const std::function<LDBCommand*(const ParsedParams&)>& selector =
+ SelectCommand);
+
+ static LDBCommand* InitFromCmdLineArgs(
+ int argc, char const* const* argv, const Options& options,
+ const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families);
+
+ bool ValidateCmdLineOptions();
+
+ virtual void PrepareOptions();
+
+ virtual void OverrideBaseOptions();
+
+ virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts);
+
+ virtual void SetDBOptions(Options options) { options_ = options; }
+
+ virtual void SetColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>* column_families) {
+ if (column_families != nullptr) {
+ column_families_ = *column_families;
+ } else {
+ column_families_.clear();
+ }
+ }
+
+ void SetLDBOptions(const LDBOptions& ldb_options) {
+ ldb_options_ = ldb_options;
+ }
+
+ const std::map<std::string, std::string>& TEST_GetOptionMap() {
+ return option_map_;
+ }
+
+ const std::vector<std::string>& TEST_GetFlags() { return flags_; }
+
+ virtual bool NoDBOpen() { return false; }
+
+ virtual ~LDBCommand() { CloseDB(); }
+
+  /* Run the command; the result can be obtained via GetExecuteState(). */
+ void Run();
+
+ virtual void DoCommand() = 0;
+
+ LDBCommandExecuteResult GetExecuteState() { return exec_state_; }
+
+ void ClearPreviousRunState() { exec_state_.Reset(); }
+
+ // Consider using Slice::DecodeHex directly instead if you don't need the
+ // 0x prefix
+ static std::string HexToString(const std::string& str);
+
+ // Consider using Slice::ToString(true) directly instead if
+ // you don't need the 0x prefix
+ static std::string StringToHex(const std::string& str);
+
+ static const char* DELIM;
+
+ protected:
+ LDBCommandExecuteResult exec_state_;
+ std::string env_uri_;
+ std::string fs_uri_;
+ std::string db_path_;
+ // If empty, open DB as primary. If non-empty, open the DB as secondary
+ // with this secondary path. When running against a database opened by
+  // another process, ldb will leave the source directory completely intact.
+ std::string secondary_path_;
+ std::string column_family_name_;
+ DB* db_;
+ DBWithTTL* db_ttl_;
+ std::map<std::string, ColumnFamilyHandle*> cf_handles_;
+
+ /**
+ * true implies that this command can work if the db is opened in read-only
+ * mode.
+ */
+ bool is_read_only_;
+
+ /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+ bool is_key_hex_;
+
+ /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+ bool is_value_hex_;
+
+ /** If true, the value is treated as timestamp suffixed */
+ bool is_db_ttl_;
+
+ // If true, the kvs are output with their insert/modify timestamp in a ttl db
+ bool timestamp_;
+
+ // If true, try to construct options from DB's option files.
+ bool try_load_options_;
+
+ // The value passed to options.force_consistency_checks.
+ bool force_consistency_checks_;
+
+ bool enable_blob_files_;
+
+ bool enable_blob_garbage_collection_;
+
+ bool create_if_missing_;
+
+ /**
+ * Map of options passed on the command-line.
+ */
+ const std::map<std::string, std::string> option_map_;
+
+ /**
+ * Flags passed on the command-line.
+ */
+ const std::vector<std::string> flags_;
+
+ /** List of command-line options valid for this command */
+ const std::vector<std::string> valid_cmd_line_options_;
+
+ /** Shared pointer to underlying environment if applicable **/
+ std::shared_ptr<Env> env_guard_;
+
+ bool ParseKeyValue(const std::string& line, std::string* key,
+ std::string* value, bool is_key_hex, bool is_value_hex);
+
+ LDBCommand(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags, bool is_read_only,
+ const std::vector<std::string>& valid_cmd_line_options);
+
+ void OpenDB();
+
+ void CloseDB();
+
+ ColumnFamilyHandle* GetCfHandle();
+
+ static std::string PrintKeyValue(const std::string& key,
+ const std::string& value, bool is_key_hex,
+ bool is_value_hex);
+
+ static std::string PrintKeyValue(const std::string& key,
+ const std::string& value, bool is_hex);
+
+ /**
+ * Return true if the specified flag is present in the specified flags vector
+ */
+ static bool IsFlagPresent(const std::vector<std::string>& flags,
+ const std::string& flag) {
+ return (std::find(flags.begin(), flags.end(), flag) != flags.end());
+ }
+
+ static std::string HelpRangeCmdArgs();
+
+ /**
+ * A helper function that returns a list of command line options
+ * used by this command. It includes the common options and the ones
+ * passed in.
+ */
+ static std::vector<std::string> BuildCmdLineOptions(
+ std::vector<std::string> options);
+
+ bool ParseIntOption(const std::map<std::string, std::string>& options,
+ const std::string& option, int& value,
+ LDBCommandExecuteResult& exec_state);
+
+ bool ParseDoubleOption(const std::map<std::string, std::string>& options,
+ const std::string& option, double& value,
+ LDBCommandExecuteResult& exec_state);
+
+ bool ParseStringOption(const std::map<std::string, std::string>& options,
+ const std::string& option, std::string* value);
+
+ bool ParseCompressionTypeOption(
+ const std::map<std::string, std::string>& options,
+ const std::string& option, CompressionType& value,
+ LDBCommandExecuteResult& exec_state);
+
+ /**
+ * Returns the value of the specified option as a boolean.
+ * default_val is used if the option is not found in options.
+ * Throws an exception if the value of the option is not
+ * "true" or "false" (case insensitive).
+ */
+ bool ParseBooleanOption(const std::map<std::string, std::string>& options,
+ const std::string& option, bool default_val);
+
+ Options options_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ ConfigOptions config_options_;
+ LDBOptions ldb_options_;
+
+ private:
+ /**
+ * Interpret command line options and flags to determine if the key
+ * should be input/output in hex.
+ */
+ bool IsKeyHex(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ /**
+ * Interpret command line options and flags to determine if the value
+ * should be input/output in hex.
+ */
+ bool IsValueHex(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ bool IsTryLoadOptions(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ /**
+ * Converts val to a boolean.
+ * val must be either true or false (case insensitive).
+ * Otherwise an exception is thrown.
+ */
+ bool StringToBool(std::string val);
+};
+
+class LDBCommandRunner {
+ public:
+ static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name,
+ bool to_stderr = true);
+
+ // Returns the status code to return. 0 is no error.
+ static int RunCommand(
+ int argc, char const* const* argv, Options options,
+ const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families);
+};
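+
+// Example (an illustrative sketch): a custom ldb-style tool that forwards
+// its command line to LDBCommandRunner:
+//
+//   int main(int argc, char** argv) {
+//     ROCKSDB_NAMESPACE::Options options;
+//     ROCKSDB_NAMESPACE::LDBOptions ldb_options;
+//     return ROCKSDB_NAMESPACE::LDBCommandRunner::RunCommand(
+//         argc, argv, options, ldb_options, nullptr /* column_families */);
+//   }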
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
new file mode 100644
index 000000000..57bac3346
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef FAILED
+#undef FAILED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommandExecuteResult {
+ public:
+ enum State {
+ EXEC_NOT_STARTED = 0,
+ EXEC_SUCCEED = 1,
+ EXEC_FAILED = 2,
+ };
+
+ LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {}
+
+ LDBCommandExecuteResult(State state, std::string& msg)
+ : state_(state), message_(msg) {}
+
+ std::string ToString() {
+ std::string ret;
+ switch (state_) {
+ case EXEC_SUCCEED:
+ break;
+ case EXEC_FAILED:
+ ret.append("Failed: ");
+ break;
+ case EXEC_NOT_STARTED:
+ ret.append("Not started: ");
+ }
+ if (!message_.empty()) {
+ ret.append(message_);
+ }
+ return ret;
+ }
+
+ void Reset() {
+ state_ = EXEC_NOT_STARTED;
+ message_ = "";
+ }
+
+ bool IsSucceed() { return state_ == EXEC_SUCCEED; }
+
+ bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; }
+
+ bool IsFailed() { return state_ == EXEC_FAILED; }
+
+ static LDBCommandExecuteResult Succeed(std::string msg) {
+ return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
+ }
+
+ static LDBCommandExecuteResult Failed(std::string msg) {
+ return LDBCommandExecuteResult(EXEC_FAILED, msg);
+ }
+
+ private:
+ State state_;
+ std::string message_;
+
+ bool operator==(const LDBCommandExecuteResult&);
+ bool operator!=(const LDBCommandExecuteResult&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
new file mode 100644
index 000000000..7e4a6faa4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+struct Options;
+class Snapshot;
+
+// Options to control the behavior of a database (passed to
+// DB::Open). A LevelDBOptions object can be initialized as though
+// it were a LevelDB Options object, and then it can be converted into
+// a RocksDB Options object.
+struct LevelDBOptions {
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator;
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists;
+
+ // If true, the implementation will do aggressive checking of the
+ // data it is processing and will stop early if it detects any
+ // errors. This may have unforeseen ramifications: for example, a
+ // corruption of one DB entry may cause a large number of entries to
+ // become unreadable or for the entire DB to become unopenable.
+ // Default: false
+ bool paranoid_checks;
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc.
+ // Default: Env::Default()
+ Env* env;
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-NULL, or to a file stored
+ // in the same directory as the DB contents if info_log is NULL.
+ // Default: NULL
+ Logger* info_log;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
+ //
+ // Larger values increase performance, especially during bulk loads.
+ // Up to two write buffers may be held in memory at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
+ // Also, a larger write buffer will result in a longer recovery time
+ // the next time the database is opened.
+ //
+ // Default: 4MB
+ size_t write_buffer_size;
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set (budget
+ // one open file per 2MB of working set).
+ //
+ // Default: 1000
+ int max_open_files;
+
+ // Control over blocks (user data is stored in a set of blocks, and
+ // a block is the unit of reading from disk).
+
+ // If non-NULL, use the specified cache for blocks.
+ // If NULL, leveldb will automatically create and use an 8MB internal cache.
+ // Default: NULL
+ Cache* block_cache;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ //
+ // Default: 4K
+ size_t block_size;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone.
+ //
+ // Default: 16
+ int block_restart_interval;
+
+ // Compress blocks using the specified compression algorithm. This
+ // parameter can be changed dynamically.
+ //
+ // Default: kSnappyCompression, which gives lightweight but fast
+ // compression.
+ //
+ // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kSnappyCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ CompressionType compression;
+
+ // If non-NULL, use the specified filter policy to reduce disk reads.
+ // Many applications will benefit from passing the result of
+ // NewBloomFilterPolicy() here.
+ //
+ // Default: NULL
+ const FilterPolicy* filter_policy;
+
+ // Create a LevelDBOptions object with default values for all fields.
+ LevelDBOptions();
+};
+
+// Converts a LevelDBOptions object into a RocksDB Options object.
+Options ConvertOptions(const LevelDBOptions& leveldb_options);
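+
+// Example (an illustrative sketch): carry LevelDB-style settings over to a
+// RocksDB Options object:
+//
+//   LevelDBOptions leveldb_options;
+//   leveldb_options.create_if_missing = true;
+//   leveldb_options.write_buffer_size = 8 * 1024 * 1024;  // 8MB
+//   Options options = ConvertOptions(leveldb_options);
+//   // `options` can now be passed to DB::Open().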
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
new file mode 100644
index 000000000..f617da02b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifdef LUA
+
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+// A class used to define a custom C library that is callable
+// from a Lua script
+class RocksLuaCustomLibrary {
+ public:
+ virtual ~RocksLuaCustomLibrary() {}
+ // The name of the C library. This name will also be used as the table
+ // (namespace) in Lua that contains the C library.
+ virtual const char* Name() const = 0;
+
+ // Returns a "static const struct luaL_Reg[]", which includes a list of
+ // C functions. Note that the last entry of this static array must be
+ // {nullptr, nullptr} as required by Lua.
+ //
+ // More details about how to implement Lua C libraries can be found
+ // in the official Lua document http://www.lua.org/pil/26.2.html
+ virtual const struct luaL_Reg* Lib() const = 0;
+
+ // A function that will be called right after the library has been created
+ // and pushed on the top of the lua_State. This custom setup function
+ // allows developers to put additional table or constant values inside
+ // the same table / namespace.
+ virtual void CustomSetup(lua_State* /*L*/) const {}
+};
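+
+// Example (an illustrative sketch): a hypothetical library exposing one C
+// function to Lua under the "mylib" namespace. The class and function names
+// are assumptions for illustration.
+//
+//   class MyLuaLibrary : public RocksLuaCustomLibrary {
+//    public:
+//     const char* Name() const override { return "mylib"; }
+//     const struct luaL_Reg* Lib() const override {
+//       static const struct luaL_Reg kLib[] = {{"hello", LuaHello},
+//                                              {nullptr, nullptr}};
+//       return kLib;
+//     }
+//
+//    private:
+//     static int LuaHello(lua_State* L) {
+//       lua_pushstring(L, "hello from C");
+//       return 1;  // number of values returned to Lua
+//     }
+//   };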
+} // namespace lua
+} // namespace ROCKSDB_NAMESPACE
+#endif // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
new file mode 100644
index 000000000..3427b65ef
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+#ifdef LUA
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+class LuaStateWrapper {
+ public:
+ explicit LuaStateWrapper(const std::string& lua_script) {
+ lua_state_ = luaL_newstate();
+ Init(lua_script, {});
+ }
+ LuaStateWrapper(
+ const std::string& lua_script,
+ const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+ lua_state_ = luaL_newstate();
+ Init(lua_script, libraries);
+ }
+ lua_State* GetLuaState() const { return lua_state_; }
+ ~LuaStateWrapper() { lua_close(lua_state_); }
+
+ private:
+ void Init(
+ const std::string& lua_script,
+ const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+ if (lua_state_) {
+ luaL_openlibs(lua_state_);
+ for (const auto& library : libraries) {
+ luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
+ library->CustomSetup(lua_state_);
+ }
+ luaL_dostring(lua_state_, lua_script.c_str());
+ }
+ }
+
+ lua_State* lua_state_;
+};
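+
+// Example (an illustrative sketch): run a script with a custom library
+// loaded. MyLuaLibrary is a hypothetical RocksLuaCustomLibrary subclass,
+// such as the one sketched in rocks_lua_custom_library.h.
+//
+//   std::vector<std::shared_ptr<RocksLuaCustomLibrary>> libs = {
+//       std::make_shared<MyLuaLibrary>()};
+//   LuaStateWrapper lua("print(mylib.hello())", libs);
+//   // lua.GetLuaState() can be used for further interaction with the
+//   // script's state.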
+} // namespace lua
+} // namespace ROCKSDB_NAMESPACE
+#endif // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h
new file mode 100644
index 000000000..4f1606b51
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility class that reports the current memory usage of the specified
+// DB instances and caches.
+class MemoryUtil {
+ public:
+ enum UsageType : int {
+ // Memory usage of all the mem-tables.
+ kMemTableTotal = 0,
+ // Memory usage of those un-flushed mem-tables.
+ kMemTableUnFlushed = 1,
+ // Memory usage of all the table readers.
+ kTableReadersTotal = 2,
+ // Memory usage by Cache.
+ kCacheTotal = 3,
+ kNumUsageTypes = 4
+ };
+
+ // Returns the approximate memory usage of different types in the input
+ // list of DBs and Cache set. For instance, in the output map
+ // usage_by_type, usage_by_type[kMemTableTotal] will store the memory
+ // usage of all the mem-tables from all the input rocksdb instances.
+ //
+  // Note that for memory usage inside the Cache class, we will
+  // only report the usage of the input "cache_set", without
+  // including the usage of the Caches owned by the DBs in the
+  // input list "dbs".
+ static Status GetApproximateMemoryUsageByType(
+ const std::vector<DB*>& dbs,
+ const std::unordered_set<const Cache*> cache_set,
+ std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+};
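+
+// Example (an illustrative sketch): query memtable and cache usage for one
+// DB and its block cache; `db` (DB*) and `block_cache` (shared_ptr<Cache>)
+// are assumed to exist in the caller's context.
+//
+//   std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
+//   std::unordered_set<const Cache*> cache_set = {block_cache.get()};
+//   Status s = MemoryUtil::GetApproximateMemoryUsageByType(
+//       {db}, cache_set, &usage_by_type);
+//   uint64_t memtable_bytes = usage_by_type[MemoryUtil::kMemTableTotal];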
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h
new file mode 100644
index 000000000..3bafb837c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h
@@ -0,0 +1,585 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Customizable;
+class Logger;
+class ObjectLibrary;
+
+// Returns a new T when called with a string. Populates the std::unique_ptr
+// argument if granting ownership to the caller.
+template <typename T>
+using FactoryFunc =
+ std::function<T*(const std::string&, std::unique_ptr<T>*, std::string*)>;
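+
+// For example, a factory that creates a hypothetical MyCustomCache (the
+// class name is illustrative, not part of RocksDB):
+//
+//   FactoryFunc<Cache> factory =
+//       [](const std::string& uri, std::unique_ptr<Cache>* guard,
+//          std::string* /*errmsg*/) {
+//         guard->reset(new MyCustomCache(uri));
+//         return guard->get();
+//       };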
+
+// The signature of the function for loading factories
+// into an object library. This method is expected to register
+// factory functions in the supplied ObjectLibrary.
+// The ObjectLibrary is the library in which the factories will be loaded.
+// The std::string is the argument passed to the loader function.
+// The RegistrarFunc should return the number of objects loaded into this
+// library
+using RegistrarFunc = std::function<int(ObjectLibrary&, const std::string&)>;
+
+template <typename T>
+using ConfigureFunc = std::function<Status(T*)>;
+
+class ObjectLibrary {
+ private:
+ // Base class for an Entry in the Registry.
+ class Entry {
+ public:
+ virtual ~Entry() {}
+ virtual bool Matches(const std::string& target) const = 0;
+ virtual const char* Name() const = 0;
+ };
+
+ public:
+ // Class for matching target strings to a pattern.
+  // Entries consist of a name that starts the pattern and attributes.
+  // The following attributes can be added to the entry:
+ // -Suffix: Comparable to name(suffix)
+ // -Separator: Comparable to name(separator).+ or name(separator).*
+ // -Number: Comparable to name(separator).[0-9]+
+ // -AltName: Comparable to (name|alt)
+ // -Optional: Comparable to name(separator)?
+  // Multiple separators can be combined and cause multiple matches.
+  // For example,
+  // PatternEntry("A").AnotherName("B").AddSeparator("@").AddNumber("#")
+  // is roughly equivalent to "(A|B)@.+#.+"
+ //
+ // Note that though this class does provide some regex-style matching,
+ // it is not a full regex parser and has some key differences:
+ // - Separators are matched left-most. For example, an entry
+ // Name("Hello").AddSeparator(" ").AddSuffix("!") would match
+ // "Hello world!", but not "Hello world!!"
+ // - No backtracking is necessary, enabling reliably efficient matching
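+  //
+  // For example, the following entry (a sketch) matches "cached:64" or
+  // "cached:1024", but not "cached" or "cached:abc":
+  //
+  //   auto entry = ObjectLibrary::PatternEntry("cached", false).AddNumber(":");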
+ class PatternEntry : public Entry {
+ private:
+ enum Quantifier {
+ kMatchZeroOrMore, // [suffix].*
+ kMatchAtLeastOne, // [suffix].+
+ kMatchExact, // [suffix]
+ kMatchInteger, // [suffix][0-9]+
+ kMatchDecimal, // [suffix][0-9]+[.][0-9]+
+ };
+
+ public:
+ // Short-cut for creating an entry that matches to a
+ // Customizable::IndividualId
+ static PatternEntry AsIndividualId(const std::string& name) {
+ PatternEntry entry(name, true);
+ entry.AddSeparator("@");
+ entry.AddSeparator("#");
+ return entry;
+ }
+
+ // Creates a new PatternEntry for "name". If optional is true,
+ // Matches will also return true if name==target
+ explicit PatternEntry(const std::string& name, bool optional = true)
+ : name_(name), optional_(optional), slength_(0) {
+ nlength_ = name_.size();
+ }
+
+    // Adds a suffix (exact match of separator with no trailing characters)
+    // to the entry
+ PatternEntry& AddSuffix(const std::string& suffix) {
+ separators_.emplace_back(suffix, kMatchExact);
+ slength_ += suffix.size();
+ return *this;
+ }
+
+ // Adds a separator (exact match of separator with trailing characters) to
+ // the entry
+ // If at_least_one is true, the separator must be followed by at least
+ // one character (e.g. separator.+).
+ // If at_least_one is false, the separator may be followed by zero or
+ // more characters (e.g. separator.*).
+ PatternEntry& AddSeparator(const std::string& separator,
+ bool at_least_one = true) {
+ slength_ += separator.size();
+ if (at_least_one) {
+ separators_.emplace_back(separator, kMatchAtLeastOne);
+ ++slength_;
+ } else {
+ separators_.emplace_back(separator, kMatchZeroOrMore);
+ }
+ return *this;
+ }
+
+ // Adds a separator (exact match of separator with trailing numbers) to the
+ // entry
+ PatternEntry& AddNumber(const std::string& separator, bool is_int = true) {
+ separators_.emplace_back(separator,
+ (is_int) ? kMatchInteger : kMatchDecimal);
+ slength_ += separator.size() + 1;
+ return *this;
+ }
+
+ // Sets another name that this entry will match, similar to (name|alt)
+ PatternEntry& AnotherName(const std::string& alt) {
+ names_.emplace_back(alt);
+ return *this;
+ }
+
+    // Sets whether the separators are required -- similar to name(separator)?
+    // If optional is true, then name(separator)? would match.
+    // If optional is false, then the separators must also match.
+ PatternEntry& SetOptional(bool optional) {
+ optional_ = optional;
+ return *this;
+ }
+
+ // Checks to see if the target matches this entry
+ bool Matches(const std::string& target) const override;
+ const char* Name() const override { return name_.c_str(); }
+
+ private:
+ size_t MatchSeparatorAt(size_t start, Quantifier mode,
+ const std::string& target, size_t tlen,
+ const std::string& pattern) const;
+
+ bool MatchesTarget(const std::string& name, size_t nlen,
+ const std::string& target, size_t ylen) const;
+ std::string name_; // The base name for this entry
+ size_t nlength_; // The length of name_
+ std::vector<std::string> names_; // Alternative names for this entry
+ bool optional_; // Whether matching of separators is required
+ size_t slength_; // The minimum required length to match the separators
+ std::vector<std::pair<std::string, Quantifier>>
+ separators_; // What to match
+  };  // End class PatternEntry
+
+ private:
+ // An Entry containing a FactoryFunc for creating new Objects
+ template <typename T>
+ class FactoryEntry : public Entry {
+ public:
+ FactoryEntry(Entry* e, FactoryFunc<T> f)
+ : entry_(e), factory_(std::move(f)) {}
+ bool Matches(const std::string& target) const override {
+ return entry_->Matches(target);
+ }
+ const char* Name() const override { return entry_->Name(); }
+
+ // Creates a new T object.
+ T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard,
+ std::string* msg) const {
+ return factory_(target, guard, msg);
+ }
+ const FactoryFunc<T>& GetFactory() const { return factory_; }
+
+ private:
+ std::unique_ptr<Entry> entry_; // What to match for this entry
+ FactoryFunc<T> factory_;
+ }; // End class FactoryEntry
+ public:
+ explicit ObjectLibrary(const std::string& id) { id_ = id; }
+
+ const std::string& GetID() const { return id_; }
+
+ // Finds the factory function for the input target.
+ // @see PatternEntry for the matching rules to target
+ // @return If matched, the FactoryFunc for this target, else nullptr
+ template <typename T>
+ FactoryFunc<T> FindFactory(const std::string& target) const {
+ std::unique_lock<std::mutex> lock(mu_);
+ auto factories = factories_.find(T::Type());
+ if (factories != factories_.end()) {
+ for (const auto& e : factories->second) {
+ if (e->Matches(target)) {
+ const auto* fe =
+ static_cast<const ObjectLibrary::FactoryEntry<T>*>(e.get());
+ return fe->GetFactory();
+ }
+ }
+ }
+ return nullptr;
+ }
+
+ // Returns the total number of factories registered for this library.
+ // This method returns the sum of all factories registered for all types.
+ // @param num_types returns how many unique types are registered.
+ size_t GetFactoryCount(size_t* num_types) const;
+
+ // Returns the number of factories registered for this library
+ // for the input type.
+  // @param type The type of factories to count.
+ size_t GetFactoryCount(const std::string& type) const;
+
+ // Returns the registered factory names for the input type
+ // names is updated to include the names for the type
+ void GetFactoryNames(const std::string& type,
+ std::vector<std::string>* names) const;
+
+ void GetFactoryTypes(std::unordered_set<std::string>* types) const;
+
+ void Dump(Logger* logger) const;
+
+ // Registers the factory with the library for the name.
+ // If name==target, the factory may be used to create a new object.
+ template <typename T>
+ const FactoryFunc<T>& AddFactory(const std::string& name,
+ const FactoryFunc<T>& func) {
+ std::unique_ptr<Entry> entry(
+ new FactoryEntry<T>(new PatternEntry(name), func));
+ AddFactoryEntry(T::Type(), std::move(entry));
+ return func;
+ }
+
+ // Registers the factory with the library for the entry.
+ // If the entry matches the target, the factory may be used to create a new
+ // object.
+ // @see PatternEntry for the matching rules.
+ // NOTE: This function replaces the old ObjectLibrary::Register()
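+  //
+  // For example, registering a factory for a hypothetical MyEnv class:
+  //
+  //   library.AddFactory<Env>(
+  //       ObjectLibrary::PatternEntry("MyEnv"),
+  //       [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
+  //          std::string* /*errmsg*/) {
+  //         guard->reset(new MyEnv());
+  //         return guard->get();
+  //       });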
+ template <typename T>
+ const FactoryFunc<T>& AddFactory(const PatternEntry& entry,
+ const FactoryFunc<T>& func) {
+ std::unique_ptr<Entry> factory(
+ new FactoryEntry<T>(new PatternEntry(entry), func));
+ AddFactoryEntry(T::Type(), std::move(factory));
+ return func;
+ }
+
+ // Invokes the registrar function with the supplied arg for this library.
+ int Register(const RegistrarFunc& registrar, const std::string& arg) {
+ return registrar(*this, arg);
+ }
+
+ // Returns the default ObjectLibrary
+ static std::shared_ptr<ObjectLibrary>& Default();
+
+ private:
+ void AddFactoryEntry(const char* type, std::unique_ptr<Entry>&& entry) {
+ std::unique_lock<std::mutex> lock(mu_);
+ auto& factories = factories_[type];
+ factories.emplace_back(std::move(entry));
+ }
+
+ // Protects the entry map
+ mutable std::mutex mu_;
+  // The FactoryFunctions for this library, organized by type
+ std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>>
+ factories_;
+
+ // The name for this library
+ std::string id_;
+};
+
+// The ObjectRegistry is used to register objects that can be created by a
+// name/pattern at run-time where the specific implementation of the object may
+// not be known in advance.
+class ObjectRegistry {
+ public:
+ static std::shared_ptr<ObjectRegistry> NewInstance();
+ static std::shared_ptr<ObjectRegistry> NewInstance(
+ const std::shared_ptr<ObjectRegistry>& parent);
+ static std::shared_ptr<ObjectRegistry> Default();
+ explicit ObjectRegistry(const std::shared_ptr<ObjectRegistry>& parent)
+ : parent_(parent) {}
+ explicit ObjectRegistry(const std::shared_ptr<ObjectLibrary>& library);
+
+ std::shared_ptr<ObjectLibrary> AddLibrary(const std::string& id) {
+ auto library = std::make_shared<ObjectLibrary>(id);
+ AddLibrary(library);
+ return library;
+ }
+
+ void AddLibrary(const std::shared_ptr<ObjectLibrary>& library) {
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ libraries_.push_back(library);
+ }
+
+ void AddLibrary(const std::string& id, const RegistrarFunc& registrar,
+ const std::string& arg) {
+ auto library = AddLibrary(id);
+ library->Register(registrar, arg);
+ }
+
+ // Finds the factory for target and instantiates a new T.
+ // Returns NotSupported if no factory is found
+ // Returns InvalidArgument if a factory is found but the factory failed.
+ template <typename T>
+ Status NewObject(const std::string& target, T** object,
+ std::unique_ptr<T>* guard) {
+ assert(guard != nullptr);
+ guard->reset();
+ auto factory = FindFactory<T>(target);
+ if (factory != nullptr) {
+ std::string errmsg;
+ *object = factory(target, guard, &errmsg);
+ if (*object != nullptr) {
+ return Status::OK();
+ } else if (errmsg.empty()) {
+ return Status::InvalidArgument(
+ std::string("Could not load ") + T::Type(), target);
+ } else {
+ return Status::InvalidArgument(errmsg, target);
+ }
+ } else {
+ return Status::NotSupported(std::string("Could not load ") + T::Type(),
+ target);
+ }
+ }
+ // Creates a new unique T using the input factory functions.
+  // Returns OK if a new unique T was successfully created.
+  // Returns NotSupported if the type/target could not be created.
+  // Returns InvalidArgument if the factory returned an unguarded object
+  // (meaning it cannot be managed by a unique ptr).
+ template <typename T>
+ Status NewUniqueObject(const std::string& target,
+ std::unique_ptr<T>* result) {
+ T* ptr = nullptr;
+ std::unique_ptr<T> guard;
+ Status s = NewObject(target, &ptr, &guard);
+ if (!s.ok()) {
+ return s;
+ } else if (guard) {
+ result->reset(guard.release());
+ return Status::OK();
+ } else {
+ return Status::InvalidArgument(std::string("Cannot make a unique ") +
+ T::Type() + " from unguarded one ",
+ target);
+ }
+ }
+
+ // Creates a new shared T using the input factory functions.
+  // Returns OK if a new shared T was successfully created.
+  // Returns NotSupported if the type/target could not be created.
+  // Returns InvalidArgument if the factory returned an unguarded object
+  // (meaning it cannot be managed by a shared ptr).
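+  //
+  // A minimal usage sketch (T is any type with registered factories and
+  // "my_id" is a hypothetical registered name):
+  //
+  //   std::shared_ptr<T> result;
+  //   Status s = registry->NewSharedObject<T>("my_id", &result);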
+ template <typename T>
+ Status NewSharedObject(const std::string& target,
+ std::shared_ptr<T>* result) {
+ std::unique_ptr<T> guard;
+ T* ptr = nullptr;
+ Status s = NewObject(target, &ptr, &guard);
+ if (!s.ok()) {
+ return s;
+ } else if (guard) {
+ result->reset(guard.release());
+ return Status::OK();
+ } else {
+ return Status::InvalidArgument(std::string("Cannot make a shared ") +
+ T::Type() + " from unguarded one ",
+ target);
+ }
+ }
+
+ // Creates a new static T using the input factory functions.
+  // Returns OK if a new static T was successfully created.
+  // Returns NotSupported if the type/target could not be created.
+  // Returns InvalidArgument if the factory returned a guarded object
+  // (meaning it is managed by a unique ptr).
+ template <typename T>
+ Status NewStaticObject(const std::string& target, T** result) {
+ std::unique_ptr<T> guard;
+ T* ptr = nullptr;
+ Status s = NewObject(target, &ptr, &guard);
+ if (!s.ok()) {
+ return s;
+ } else if (guard.get()) {
+ return Status::InvalidArgument(std::string("Cannot make a static ") +
+ T::Type() + " from a guarded one ",
+ target);
+ } else {
+ *result = ptr;
+ return Status::OK();
+ }
+ }
+
+ // Sets the object for the given id/type to be the input object
+ // If the registry does not contain this id/type, the object is added and OK
+ // is returned. If the registry contains a different object, an error is
+ // returned. If the registry contains the input object, OK is returned.
+ template <typename T>
+ Status SetManagedObject(const std::shared_ptr<T>& object) {
+ assert(object != nullptr);
+ return SetManagedObject(object->GetId(), object);
+ }
+
+ template <typename T>
+ Status SetManagedObject(const std::string& id,
+ const std::shared_ptr<T>& object) {
+ const auto c = std::static_pointer_cast<Customizable>(object);
+ return SetManagedObject(T::Type(), id, c);
+ }
+
+ // Returns the object for the given id, if one exists.
+ // If the object is not found in the registry, a nullptr is returned
+ template <typename T>
+ std::shared_ptr<T> GetManagedObject(const std::string& id) const {
+ auto c = GetManagedObject(T::Type(), id);
+ return std::static_pointer_cast<T>(c);
+ }
+
+ // Returns the set of managed objects found in the registry matching
+ // the input type and ID.
+  // If the input id is not empty, then only objects of that class
+  // (IsInstanceOf(id)) will be returned (for example, only LRUCache
+  // objects). If the input id is empty, then all objects of that type
+  // (for example, all Cache objects) will be returned.
+ template <typename T>
+ Status ListManagedObjects(const std::string& id,
+ std::vector<std::shared_ptr<T>>* results) const {
+ std::vector<std::shared_ptr<Customizable>> customizables;
+ results->clear();
+ Status s = ListManagedObjects(T::Type(), id, &customizables);
+ if (s.ok()) {
+ for (const auto& c : customizables) {
+ results->push_back(std::static_pointer_cast<T>(c));
+ }
+ }
+ return s;
+ }
+
+ template <typename T>
+ Status ListManagedObjects(std::vector<std::shared_ptr<T>>* results) const {
+ return ListManagedObjects("", results);
+ }
+
+ // Creates a new ManagedObject in the registry for the id if one does not
+ // currently exist. If an object with that ID already exists, the current
+ // object is returned.
+ //
+  // The ID is the identifier of the object to be created or returned, and
+  // the object is returned in result.
+ // If a new object is created (using the object factories), the cfunc
+ // parameter will be invoked to configure the new object.
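+  //
+  // A minimal usage sketch (the configuration callback is illustrative):
+  //
+  //   std::shared_ptr<T> object;
+  //   Status s = registry->GetOrCreateManagedObject<T>(
+  //       "my_id", &object,
+  //       [](T* t) { /* configure t */ return Status::OK(); });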
+ template <typename T>
+ Status GetOrCreateManagedObject(const std::string& id,
+ std::shared_ptr<T>* result,
+ const ConfigureFunc<T>& cfunc = nullptr) {
+ if (parent_ != nullptr) {
+ auto object = parent_->GetManagedObject(T::Type(), id);
+ if (object != nullptr) {
+ *result = std::static_pointer_cast<T>(object);
+ return Status::OK();
+ }
+ }
+ {
+ std::unique_lock<std::mutex> lock(objects_mutex_);
+ auto key = ToManagedObjectKey(T::Type(), id);
+ auto iter = managed_objects_.find(key);
+ if (iter != managed_objects_.end()) {
+ auto object = iter->second.lock();
+ if (object != nullptr) {
+ *result = std::static_pointer_cast<T>(object);
+ return Status::OK();
+ }
+ }
+ std::shared_ptr<T> object;
+ Status s = NewSharedObject(id, &object);
+ if (s.ok() && cfunc != nullptr) {
+ s = cfunc(object.get());
+ }
+ if (s.ok()) {
+ auto c = std::static_pointer_cast<Customizable>(object);
+ if (id != c->Name()) {
+ // If the ID is not the base name of the class, add the new
+ // object under the input ID
+ managed_objects_[key] = c;
+ }
+ if (id != c->GetId() && c->GetId() != c->Name()) {
+ // If the input and current ID do not match, and the
+          // current ID is not the base name, add the new object under
+ // its new ID
+ key = ToManagedObjectKey(T::Type(), c->GetId());
+ managed_objects_[key] = c;
+ }
+ *result = object;
+ }
+ return s;
+ }
+ }
+
+ // Returns the number of factories registered for this library
+ // for the input type.
+  // @param type The type of factories to count.
+ size_t GetFactoryCount(const std::string& type) const;
+
+ // Returns the names of registered factories for the input type.
+ // names is updated to include the names for the type
+ void GetFactoryNames(const std::string& type,
+ std::vector<std::string>* names) const;
+
+ void GetFactoryTypes(std::unordered_set<std::string>* types) const;
+
+ // Dump the contents of the registry to the logger
+ void Dump(Logger* logger) const;
+
+ // Invokes the input function to retrieve the properties for this plugin.
+ int RegisterPlugin(const std::string& name, const RegistrarFunc& func);
+
+ private:
+ static std::string ToManagedObjectKey(const std::string& type,
+ const std::string& id) {
+ return type + "://" + id;
+ }
+
+ // Returns the Customizable managed object associated with the key (Type/ID).
+ // If not found, nullptr is returned.
+ std::shared_ptr<Customizable> GetManagedObject(const std::string& type,
+ const std::string& id) const;
+ Status ListManagedObjects(
+ const std::string& type, const std::string& pattern,
+ std::vector<std::shared_ptr<Customizable>>* results) const;
+ // Sets the managed object associated with the key (Type/ID) to c.
+  // If the named managed object does not exist, the object is added and OK
+  // is returned. If the object exists and is the same as c, OK is returned.
+ // Otherwise, an error status is returned.
+ Status SetManagedObject(const std::string& type, const std::string& id,
+ const std::shared_ptr<Customizable>& c);
+
+ // Searches (from back to front) the libraries looking for the
+ // factory that matches this name.
+ // Returns the factory if it is found, and nullptr otherwise
+ template <typename T>
+ const FactoryFunc<T> FindFactory(const std::string& name) const {
+ {
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ for (auto iter = libraries_.crbegin(); iter != libraries_.crend();
+ ++iter) {
+ const auto factory = iter->get()->FindFactory<T>(name);
+ if (factory != nullptr) {
+ return factory;
+ }
+ }
+ }
+ if (parent_ == nullptr) {
+ return nullptr;
+ } else {
+ return parent_->FindFactory<T>(name);
+ }
+ }
+
+ // The set of libraries to search for factories for this registry.
+ // The libraries are searched in reverse order (back to front) when
+ // searching for entries.
+ std::vector<std::shared_ptr<ObjectLibrary>> libraries_;
+ std::vector<std::string> plugins_;
+ static std::unordered_map<std::string, RegistrarFunc> builtins_;
+ std::map<std::string, std::weak_ptr<Customizable>> managed_objects_;
+ std::shared_ptr<ObjectRegistry> parent_;
+ mutable std::mutex objects_mutex_; // Mutex for managed objects
+ mutable std::mutex library_mutex_; // Mutex for managed libraries
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
new file mode 100644
index 000000000..c070e49a3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Transaction;
+
+// Database with Transaction support.
+//
+// See optimistic_transaction.h and examples/transaction_example.cc
+
+// Options to use when starting an Optimistic Transaction
+struct OptimisticTransactionOptions {
+ // Setting set_snapshot=true is the same as calling SetSnapshot().
+ bool set_snapshot = false;
+
+ // Should be set if the DB has a non-default comparator.
+ // See comment in WriteBatchWithIndex constructor.
+ const Comparator* cmp = BytewiseComparator();
+};
+
+enum class OccValidationPolicy {
+ // Validate serially at commit stage, AFTER entering the write-group.
+  // Isolation validation is processed single-threaded (since it happens
+  // within the write-group).
+  // May suffer from high mutex contention, as described in:
+ // https://github.com/facebook/rocksdb/issues/4402
+ kValidateSerial = 0,
+  // Validate in parallel before the commit stage, BEFORE entering the
+  // write-group, to reduce mutex contention. Each txn acquires locks for
+  // its write-set records in some well-defined order.
+ kValidateParallel = 1
+};
+
+struct OptimisticTransactionDBOptions {
+ OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel;
+
+  // Works only if validate_policy == OccValidationPolicy::kValidateParallel.
+ uint32_t occ_lock_buckets = (1 << 20);
+};
+
+// Range deletions (including those in `WriteBatch`es passed to `Write()`) are
+// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status`
+class OptimisticTransactionDB : public StackableDB {
+ public:
+ // Open an OptimisticTransactionDB similar to DB::Open().
+ static Status Open(const Options& options, const std::string& dbname,
+ OptimisticTransactionDB** dbptr);
+
+ static Status Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ OptimisticTransactionDB** dbptr);
+
+ static Status Open(const DBOptions& db_options,
+ const OptimisticTransactionDBOptions& occ_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ OptimisticTransactionDB** dbptr);
+
+ virtual ~OptimisticTransactionDB() {}
+
+ // Starts a new Transaction.
+ //
+ // Caller is responsible for deleting the returned transaction when no
+ // longer needed.
+ //
+ // If old_txn is not null, BeginTransaction will reuse this Transaction
+ // handle instead of allocating a new one. This is an optimization to avoid
+ // extra allocations when repeatedly creating transactions.
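+  //
+  // A minimal usage sketch (error handling elided):
+  //
+  //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  //   txn->Put("key", "value");
+  //   Status s = txn->Commit();  // Fails on conflict with a concurrent write
+  //   delete txn;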
+ virtual Transaction* BeginTransaction(
+ const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options =
+ OptimisticTransactionOptions(),
+ Transaction* old_txn = nullptr) = 0;
+
+ OptimisticTransactionDB(const OptimisticTransactionDB&) = delete;
+ void operator=(const OptimisticTransactionDB&) = delete;
+
+ protected:
+ // To Create an OptimisticTransactionDB, call Open()
+ explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
new file mode 100644
index 000000000..a73324a9e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Try to migrate a DB created with old_opts so that it can use new_opts.
+// Multiple column families are not supported.
+// It is best-effort; there is no guarantee of success.
+// A full compaction may be executed.
+// If the target options use FIFO compaction, the FIFO condition might be
+// sacrificed: for migrated data, data inserted later might be dropped
+// earlier. This is to guarantee that FIFO compaction won't drop all the
+// migrated data in order to fit max_table_files_size.
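+//
+// A minimal usage sketch: migrating from level-style to universal compaction
+// (the path is illustrative):
+//
+//   Options old_opts;
+//   old_opts.compaction_style = kCompactionStyleLevel;
+//   Options new_opts = old_opts;
+//   new_opts.compaction_style = kCompactionStyleUniversal;
+//   Status s = OptionChangeMigration("/path/to/db", old_opts, new_opts);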
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+ const Options& new_opts);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_type.h b/src/rocksdb/include/rocksdb/utilities/options_type.h
new file mode 100644
index 000000000..cd340ed59
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_type.h
@@ -0,0 +1,1221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// The OptionTypeInfo and related classes provide a framework for
+// configuring and validating RocksDB classes via the Options framework.
+// This file is part of the public API to allow developers who wish to
+// write their own extensions and plugins to use the Options
+// framework in their custom implementations.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use custom extensions
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct DBOptions;
+
+// The underlying "class/type" of the option.
+// This enum is used to determine how the option should
+// be converted to/from strings and compared.
+enum class OptionType {
+ kBoolean,
+ kInt,
+ kInt32T,
+ kInt64T,
+ kUInt,
+ kUInt8T,
+ kUInt32T,
+ kUInt64T,
+ kSizeT,
+ kString,
+ kDouble,
+ kCompactionStyle,
+ kCompactionPri,
+ kCompressionType,
+ kCompactionStopStyle,
+ kChecksumType,
+ kEncodingType,
+ kEnv,
+ kEnum,
+ kStruct,
+ kVector,
+ kConfigurable,
+ kCustomizable,
+ kEncodedString,
+ kTemperature,
+ kArray,
+ kUnknown,
+};
+
+enum class OptionVerificationType {
+ kNormal,
+ kByName, // The option is pointer typed so we can only verify
+                         // based on its name.
+ kByNameAllowNull, // Same as kByName, but it also allows the case
+ // where one of them is a nullptr.
+ kByNameAllowFromNull, // Same as kByName, but it also allows the case
+ // where the old option is nullptr.
+ kDeprecated, // The option is no longer used in rocksdb. The RocksDB
+ // OptionsParser will still accept this option if it
+                         // happens to exist in some Options file. However,
+ // the parser will not include it in serialization
+ // and verification processes.
+  kAlias,                // This option is a name/shortcut for
+ // another option and should not be written or verified
+ // independently
+};
+
+// A set of modifier flags used to alter how an option is evaluated or
+// processed. These flags can be combined together (e.g. kMutable | kShared).
+// The kCompare flags can be used to control if/when options are compared.
+// If kCompareNever is set, two related options are never compared (always
+// equal). If kCompareExact is set, the options will only be compared if the
+// sanity mode is exact.
+// kMutable means the option can be changed after it is prepared.
+// kShared means the option is contained in a std::shared_ptr.
+// kUnique means the option is contained in a std::unique_ptr.
+// kRawPointer means the option is a raw pointer value.
+// kAllowNull means that an option is allowed to be null for verification
+// purposes.
+// kDontSerialize means this option should not be serialized and included in
+// the string representation.
+// kDontPrepare means do not call PrepareOptions for this pointer value.
+enum class OptionTypeFlags : uint32_t {
+ kNone = 0x00, // No flags
+ kCompareDefault = 0x0,
+ kCompareNever = ConfigOptions::kSanityLevelNone,
+ kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible,
+ kCompareExact = ConfigOptions::kSanityLevelExactMatch,
+
+ kMutable = 0x0100, // Option is mutable
+ kRawPointer = 0x0200, // The option is stored as a raw pointer
+ kShared = 0x0400, // The option is stored as a shared_ptr
+ kUnique = 0x0800, // The option is stored as a unique_ptr
+ kAllowNull = 0x1000, // The option can be null
+ kDontSerialize = 0x2000, // Don't serialize the option
+ kDontPrepare = 0x4000, // Don't prepare or sanitize this option
+ kStringNameOnly = 0x8000, // The option serializes to a name only
+};
+
+inline OptionTypeFlags operator|(const OptionTypeFlags& a,
+ const OptionTypeFlags& b) {
+ return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) |
+ static_cast<uint32_t>(b));
+}
+
+inline OptionTypeFlags operator&(const OptionTypeFlags& a,
+ const OptionTypeFlags& b) {
+ return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) &
+ static_cast<uint32_t>(b));
+}
+
+// Converts a string into its enumerated value.
+// @param type_map Mapping between strings and enum values
+// @param type The string representation of the enum
+// @param value Returns the enum value represented by the string
+// @return true if the string was found in the enum map, false otherwise.
+template <typename T>
+bool ParseEnum(const std::unordered_map<std::string, T>& type_map,
+ const std::string& type, T* value) {
+ auto iter = type_map.find(type);
+ if (iter != type_map.end()) {
+ *value = iter->second;
+ return true;
+ }
+ return false;
+}
+
+// Converts an enum into its string representation.
+// @param type_map Mapping between strings and enum values
+// @param type The enum
+// @param value Returned as the string representation of the enum
+// @return true if the enum was found in the enum map, false otherwise.
+template <typename T>
+bool SerializeEnum(const std::unordered_map<std::string, T>& type_map,
+ const T& type, std::string* value) {
+ for (const auto& pair : type_map) {
+ if (pair.second == type) {
+ *value = pair.first;
+ return true;
+ }
+ }
+ return false;
+}
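+
+// For example, given a hypothetical enum and its string map:
+//
+//   enum class MyEnum { kA, kB };
+//   static const std::unordered_map<std::string, MyEnum> my_enum_map = {
+//       {"A", MyEnum::kA}, {"B", MyEnum::kB}};
+//
+//   MyEnum e;
+//   bool found = ParseEnum<MyEnum>(my_enum_map, "A", &e);  // e == MyEnum::kA
+//   std::string str;
+//   found = SerializeEnum<MyEnum>(my_enum_map, MyEnum::kB, &str);  // "B"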
+
+template <typename T, size_t kSize>
+Status ParseArray(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::string& value,
+ std::array<T, kSize>* result);
+
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::array<T, kSize>& vec,
+ std::string* value);
+
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, const std::string& name,
+ const std::array<T, kSize>& array1,
+ const std::array<T, kSize>& array2, std::string* mismatch);
+
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::string& value,
+ std::vector<T>* result);
+
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::vector<T>& vec,
+ std::string* value);
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, const std::string& name,
+ const std::vector<T>& vec1, const std::vector<T>& vec2,
+ std::string* mismatch);
+
+// Function for converting an option string value into its underlying
+// representation in "addr".
+// On success, Status::OK is returned and addr is set to the parsed form.
+// On failure, a non-OK status is returned.
+// @param opts The ConfigOptions controlling how the value is parsed
+// @param name The name of the options being parsed
+// @param value The string representation of the option
+// @param addr Pointer to the object
+using ParseFunc = std::function<Status(
+ const ConfigOptions& /*opts*/, const std::string& /*name*/,
+ const std::string& /*value*/, void* /*addr*/)>;
+
+// Function for converting an option "addr" into its string representation.
+// On success, Status::OK is returned and value is the serialized form.
+// On failure, a non-OK status is returned
+// @param opts The ConfigOptions controlling how the values are serialized
+// @param name The name of the options being serialized
+// @param addr Pointer to the value being serialized
+// @param value The result of the serialization.
+using SerializeFunc = std::function<Status(
+ const ConfigOptions& /*opts*/, const std::string& /*name*/,
+ const void* /*addr*/, std::string* /*value*/)>;
+
+// Function for comparing two option values
+// If they are not equal, updates "mismatch" with the name of the bad option
+// @param opts The ConfigOptions controlling how the values are compared
+// @param name The name of the options being compared
+// @param addr1 The first address to compare
+// @param addr2 The address to compare to
+// @param mismatch If the values are not equal, the name of the option that
+// first differs
+using EqualsFunc = std::function<bool(
+ const ConfigOptions& /*opts*/, const std::string& /*name*/,
+ const void* /*addr1*/, const void* /*addr2*/, std::string* mismatch)>;
+
+// Function for preparing/initializing an option.
+using PrepareFunc =
+ std::function<Status(const ConfigOptions& /*opts*/,
+ const std::string& /*name*/, void* /*addr*/)>;
+
+// Function for validating an option.
+using ValidateFunc = std::function<Status(
+ const DBOptions& /*db_opts*/, const ColumnFamilyOptions& /*cf_opts*/,
+ const std::string& /*name*/, const void* /*addr*/)>;
+
+// A struct for storing constant option information such as option name,
+// option type, and offset.
+class OptionTypeInfo {
+ public:
+ // A simple "normal", non-mutable Type "type" at offset
+ OptionTypeInfo(int offset, OptionType type)
+ : offset_(offset),
+ parse_func_(nullptr),
+ serialize_func_(nullptr),
+ equals_func_(nullptr),
+ type_(type),
+ verification_(OptionVerificationType::kNormal),
+ flags_(OptionTypeFlags::kNone) {}
+
+ OptionTypeInfo(int offset, OptionType type,
+ OptionVerificationType verification, OptionTypeFlags flags)
+ : offset_(offset),
+ parse_func_(nullptr),
+ serialize_func_(nullptr),
+ equals_func_(nullptr),
+ type_(type),
+ verification_(verification),
+ flags_(flags) {}
+
+ OptionTypeInfo(int offset, OptionType type,
+ OptionVerificationType verification, OptionTypeFlags flags,
+ const ParseFunc& parse_func)
+ : offset_(offset),
+ parse_func_(parse_func),
+ serialize_func_(nullptr),
+ equals_func_(nullptr),
+ type_(type),
+ verification_(verification),
+ flags_(flags) {}
+
+ OptionTypeInfo(int offset, OptionType type,
+ OptionVerificationType verification, OptionTypeFlags flags,
+ const ParseFunc& parse_func,
+ const SerializeFunc& serialize_func,
+ const EqualsFunc& equals_func)
+ : offset_(offset),
+ parse_func_(parse_func),
+ serialize_func_(serialize_func),
+ equals_func_(equals_func),
+ type_(type),
+ verification_(verification),
+ flags_(flags) {}
+
+ // Creates an OptionTypeInfo for an enum type. Enums use an additional
+ // map to convert the enums to/from their string representation.
+ // To create an OptionTypeInfo that is an Enum, one should:
+ // - Create a static map of string values to the corresponding enum value
+ // - Call this method passing the static map in as a parameter.
+ // Note that it is not necessary to add a new OptionType or make any
+ // other changes -- the returned object handles parsing, serialization, and
+ // comparisons.
+ //
+ // @param offset The offset in the option object for this enum
+ // @param map The string to enum mapping for this enum
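+  //
+  // For example (MyEnum, my_enum_map, and MyOptions are illustrative):
+  //
+  //   static const std::unordered_map<std::string, MyEnum> my_enum_map = {
+  //       {"A", MyEnum::kA}, {"B", MyEnum::kB}};
+  //   auto info = OptionTypeInfo::Enum<MyEnum>(
+  //       offsetof(struct MyOptions, mode), &my_enum_map);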
+ template <typename T>
+ static OptionTypeInfo Enum(
+ int offset, const std::unordered_map<std::string, T>* const map,
+ OptionTypeFlags flags = OptionTypeFlags::kNone) {
+ OptionTypeInfo info(offset, OptionType::kEnum,
+ OptionVerificationType::kNormal, flags);
+ info.SetParseFunc(
+ // Uses the map argument to convert the input string into
+ // its corresponding enum value. If value is found in the map,
+ // addr is updated to the corresponding map entry.
+ // @return OK if the value is found in the map
+ // @return InvalidArgument if the value is not found in the map
+ [map](const ConfigOptions&, const std::string& name,
+ const std::string& value, void* addr) {
+ if (map == nullptr) {
+ return Status::NotSupported("No enum mapping ", name);
+ } else if (ParseEnum<T>(*map, value, static_cast<T*>(addr))) {
+ return Status::OK();
+ } else {
+ return Status::InvalidArgument("No mapping for enum ", name);
+ }
+ });
+ info.SetSerializeFunc(
+ // Uses the map argument to convert the input enum into
+ // its corresponding string value. If enum value is found in the map,
+ // value is updated to the corresponding string value in the map.
+ // @return OK if the enum is found in the map
+ // @return InvalidArgument if the enum is not found in the map
+ [map](const ConfigOptions&, const std::string& name, const void* addr,
+ std::string* value) {
+ if (map == nullptr) {
+ return Status::NotSupported("No enum mapping ", name);
+ } else if (SerializeEnum<T>(*map, (*static_cast<const T*>(addr)),
+ value)) {
+ return Status::OK();
+ } else {
+ return Status::InvalidArgument("No mapping for enum ", name);
+ }
+ });
+ info.SetEqualsFunc(
+ // Casts addr1 and addr2 to the enum type and returns true if
+ // they are equal, false otherwise.
+ [](const ConfigOptions&, const std::string&, const void* addr1,
+ const void* addr2, std::string*) {
+ return (*static_cast<const T*>(addr1) ==
+ *static_cast<const T*>(addr2));
+ });
+ return info;
+ } // End OptionTypeInfo::Enum
+
+ // Creates an OptionTypeInfo for a Struct type. Structs have a
+ // map of string-OptionTypeInfo associated with them that describes how
+ // to process the object for parsing, serializing, and matching.
+ // Structs also have a struct_name, which is the name of the object
+ // as registered in the parent map.
+ // When processing a struct, the option name can be specified as:
+ // - <struct_name> Meaning to process the entire struct.
+ // - <struct_name.field> Meaning to process the single field
+  //      - <field>             Meaning to process the single field
+ // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions
+ // are all examples of Struct options.
+ //
+ // To create an OptionTypeInfo that is a Struct, one should:
+ // - Create a static map of string-OptionTypeInfo corresponding to the
+ // properties of the object that can be set via the options.
+ // - Call this method passing the name and map in as parameters.
+ // Note that it is not necessary to add a new OptionType or make any
+ // other changes -- the returned object handles parsing, serialization, and
+ // comparisons.
+ //
+  // @param struct_name The name of the struct option as registered
+  // @param struct_map The map of field names to OptionTypeInfo for the struct
+  // @param offset The offset in the option object for this struct
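+  //
+  // For example (MyStruct, MyOptions, and the field map are illustrative):
+  //
+  //   static std::unordered_map<std::string, OptionTypeInfo> my_struct_map = {
+  //       {"count", OptionTypeInfo(offsetof(struct MyStruct, count),
+  //                                OptionType::kInt)}};
+  //   auto info = OptionTypeInfo::Struct(
+  //       "my_struct", &my_struct_map, offsetof(struct MyOptions, my_struct),
+  //       OptionVerificationType::kNormal, OptionTypeFlags::kNone);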
+ static OptionTypeInfo Struct(
+ const std::string& struct_name,
+ const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+ int offset, OptionVerificationType verification, OptionTypeFlags flags) {
+ OptionTypeInfo info(offset, OptionType::kStruct, verification, flags);
+ info.SetParseFunc(
+ // Parses the struct and updates the fields at addr
+ [struct_name, struct_map](const ConfigOptions& opts,
+ const std::string& name,
+ const std::string& value, void* addr) {
+ return ParseStruct(opts, struct_name, struct_map, name, value, addr);
+ });
+ info.SetSerializeFunc(
+ // Serializes the struct options into value
+ [struct_name, struct_map](const ConfigOptions& opts,
+ const std::string& name, const void* addr,
+ std::string* value) {
+ return SerializeStruct(opts, struct_name, struct_map, name, addr,
+ value);
+ });
+ info.SetEqualsFunc(
+ // Compares the struct fields of addr1 and addr2 for equality
+ [struct_name, struct_map](const ConfigOptions& opts,
+ const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ return StructsAreEqual(opts, struct_name, struct_map, name, addr1,
+ addr2, mismatch);
+ });
+ return info;
+ }
+ static OptionTypeInfo Struct(
+ const std::string& struct_name,
+ const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+ int offset, OptionVerificationType verification, OptionTypeFlags flags,
+ const ParseFunc& parse_func) {
+ OptionTypeInfo info(
+ Struct(struct_name, struct_map, offset, verification, flags));
+ return info.SetParseFunc(parse_func);
+ }
+
+ template <typename T, size_t kSize>
+ static OptionTypeInfo Array(int _offset, OptionVerificationType _verification,
+ OptionTypeFlags _flags,
+ const OptionTypeInfo& elem_info,
+ char separator = ':') {
+ OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags);
+ info.SetParseFunc([elem_info, separator](
+ const ConfigOptions& opts, const std::string& name,
+ const std::string& value, void* addr) {
+ auto result = static_cast<std::array<T, kSize>*>(addr);
+ return ParseArray<T, kSize>(opts, elem_info, separator, name, value,
+ result);
+ });
+ info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+ const std::string& name,
+ const void* addr,
+ std::string* value) {
+ const auto& array = *(static_cast<const std::array<T, kSize>*>(addr));
+ return SerializeArray<T, kSize>(opts, elem_info, separator, name, array,
+ value);
+ });
+ info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+ const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto& array1 = *(static_cast<const std::array<T, kSize>*>(addr1));
+ const auto& array2 = *(static_cast<const std::array<T, kSize>*>(addr2));
+ return ArraysAreEqual<T, kSize>(opts, elem_info, name, array1, array2,
+ mismatch);
+ });
+ return info;
+ }
+
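+  // Creates an OptionTypeInfo for a std::vector<T> option. Each element is
+  // parsed, serialized, and compared using elem_info, and elements are
+  // joined by the separator character. For example, a vector-of-int field
+  // (MyOptions is illustrative):
+  //
+  //   auto info = OptionTypeInfo::Vector<int>(
+  //       offsetof(struct MyOptions, values),
+  //       OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+  //       OptionTypeInfo(0, OptionType::kInt));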
+ template <typename T>
+ static OptionTypeInfo Vector(int _offset,
+ OptionVerificationType _verification,
+ OptionTypeFlags _flags,
+ const OptionTypeInfo& elem_info,
+ char separator = ':') {
+ OptionTypeInfo info(_offset, OptionType::kVector, _verification, _flags);
+ info.SetParseFunc([elem_info, separator](
+ const ConfigOptions& opts, const std::string& name,
+ const std::string& value, void* addr) {
+ auto result = static_cast<std::vector<T>*>(addr);
+ return ParseVector<T>(opts, elem_info, separator, name, value, result);
+ });
+ info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+ const std::string& name,
+ const void* addr,
+ std::string* value) {
+ const auto& vec = *(static_cast<const std::vector<T>*>(addr));
+ return SerializeVector<T>(opts, elem_info, separator, name, vec, value);
+ });
+ info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+ const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto& vec1 = *(static_cast<const std::vector<T>*>(addr1));
+ const auto& vec2 = *(static_cast<const std::vector<T>*>(addr2));
+ return VectorsAreEqual<T>(opts, elem_info, name, vec1, vec2, mismatch);
+ });
+ return info;
+ }
+
+ // Create a new std::shared_ptr<Customizable> OptionTypeInfo
+ // This function will call the T::CreateFromString method to create a new
+ // std::shared_ptr<T> object.
+ //
+ // @param offset The offset for the Customizable from the base pointer
+ // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+ // @param _sfunc Optional function for serializing this option
+ // @param _efunc Optional function for comparing this option
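+  //
+  // A usage sketch (T and MyOptions are illustrative):
+  //
+  //   auto info = OptionTypeInfo::AsCustomSharedPtr<T>(
+  //       offsetof(struct MyOptions, custom),
+  //       OptionVerificationType::kByName, OptionTypeFlags::kAllowNull);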
+ template <typename T>
+ static OptionTypeInfo AsCustomSharedPtr(int offset,
+ OptionVerificationType ovt,
+ OptionTypeFlags flags) {
+ OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+ flags | OptionTypeFlags::kShared);
+ return info.SetParseFunc([](const ConfigOptions& opts,
+ const std::string& name,
+ const std::string& value, void* addr) {
+ auto* shared = static_cast<std::shared_ptr<T>*>(addr);
+ if (name == kIdPropName() && value.empty()) {
+ shared->reset();
+ return Status::OK();
+ } else {
+ return T::CreateFromString(opts, value, shared);
+ }
+ });
+ }
+
+ template <typename T>
+ static OptionTypeInfo AsCustomSharedPtr(int offset,
+ OptionVerificationType ovt,
+ OptionTypeFlags flags,
+ const SerializeFunc& serialize_func,
+ const EqualsFunc& equals_func) {
+ OptionTypeInfo info(AsCustomSharedPtr<T>(offset, ovt, flags));
+ info.SetSerializeFunc(serialize_func);
+ info.SetEqualsFunc(equals_func);
+ return info;
+ }
+
+ // Create a new std::unique_ptr<Customizable> OptionTypeInfo
+ // This function will call the T::CreateFromString method to create a new
+ // std::unique_ptr<T> object.
+ //
+ // @param offset The offset for the Customizable from the base pointer
+ // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+ // @param _sfunc Optional function for serializing this option
+ // @param _efunc Optional function for comparing this option
+ template <typename T>
+ static OptionTypeInfo AsCustomUniquePtr(int offset,
+ OptionVerificationType ovt,
+ OptionTypeFlags flags) {
+ OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+ flags | OptionTypeFlags::kUnique);
+ return info.SetParseFunc([](const ConfigOptions& opts,
+ const std::string& name,
+ const std::string& value, void* addr) {
+ auto* unique = static_cast<std::unique_ptr<T>*>(addr);
+ if (name == kIdPropName() && value.empty()) {
+ unique->reset();
+ return Status::OK();
+ } else {
+ return T::CreateFromString(opts, value, unique);
+ }
+ });
+ }
+
+ template <typename T>
+ static OptionTypeInfo AsCustomUniquePtr(int offset,
+ OptionVerificationType ovt,
+ OptionTypeFlags flags,
+ const SerializeFunc& serialize_func,
+ const EqualsFunc& equals_func) {
+ OptionTypeInfo info(AsCustomUniquePtr<T>(offset, ovt, flags));
+ info.SetSerializeFunc(serialize_func);
+ info.SetEqualsFunc(equals_func);
+ return info;
+ }
+
+ // Create a new Customizable* OptionTypeInfo
+ // This function will call the T::CreateFromString method to create a new
+ // T object.
+ //
+ // @param _offset The offset for the Customizable from the base pointer
+ // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+ // @param _sfunc Optional function for serializing this option
+ // @param _efunc Optional function for comparing this option
+ template <typename T>
+ static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+ OptionTypeFlags flags) {
+ OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+ flags | OptionTypeFlags::kRawPointer);
+ return info.SetParseFunc([](const ConfigOptions& opts,
+ const std::string& name,
+ const std::string& value, void* addr) {
+ auto** pointer = static_cast<T**>(addr);
+ if (name == kIdPropName() && value.empty()) {
+ *pointer = nullptr;
+ return Status::OK();
+ } else {
+ return T::CreateFromString(opts, value, pointer);
+ }
+ });
+ }
+
+ template <typename T>
+ static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+ OptionTypeFlags flags,
+ const SerializeFunc& serialize_func,
+ const EqualsFunc& equals_func) {
+ OptionTypeInfo info(AsCustomRawPtr<T>(offset, ovt, flags));
+ info.SetSerializeFunc(serialize_func);
+ info.SetEqualsFunc(equals_func);
+ return info;
+ }
+
+ OptionTypeInfo& SetParseFunc(const ParseFunc& f) {
+ parse_func_ = f;
+ return *this;
+ }
+
+ OptionTypeInfo& SetSerializeFunc(const SerializeFunc& f) {
+ serialize_func_ = f;
+ return *this;
+ }
+ OptionTypeInfo& SetEqualsFunc(const EqualsFunc& f) {
+ equals_func_ = f;
+ return *this;
+ }
+
+ OptionTypeInfo& SetPrepareFunc(const PrepareFunc& f) {
+ prepare_func_ = f;
+ return *this;
+ }
+
+ OptionTypeInfo& SetValidateFunc(const ValidateFunc& f) {
+ validate_func_ = f;
+ return *this;
+ }
+
+ bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; }
+
+ bool IsEditable(const ConfigOptions& opts) const {
+ if (opts.mutable_options_only) {
+ return IsMutable();
+ } else {
+ return true;
+ }
+ }
+ bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); }
+
+ bool IsDeprecated() const {
+ return IsEnabled(OptionVerificationType::kDeprecated);
+ }
+
+ // Returns true if the option is marked as an Alias.
+ // Aliases are valid options that are parsed but are not converted to strings
+ // or compared.
+ bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); }
+
+ bool IsEnabled(OptionVerificationType ovf) const {
+ return verification_ == ovf;
+ }
+
+ // Returns the sanity level for comparing the option.
+ // If the options should not be compared, returns None
+ // If the option has a compare flag, returns it.
+ // Otherwise, returns "exact"
+ ConfigOptions::SanityLevel GetSanityLevel() const {
+ if (IsDeprecated() || IsAlias()) {
+ return ConfigOptions::SanityLevel::kSanityLevelNone;
+ } else {
+ auto match = (flags_ & OptionTypeFlags::kCompareExact);
+ if (match == OptionTypeFlags::kCompareDefault) {
+ return ConfigOptions::SanityLevel::kSanityLevelExactMatch;
+ } else {
+ return (ConfigOptions::SanityLevel)match;
+ }
+ }
+ }
+
+ // Returns true if the option should be serialized.
+  // Options should be serialized if they are not deprecated, aliases,
+ // or marked as "Don't Serialize".
+ bool ShouldSerialize() const {
+ if (IsDeprecated() || IsAlias()) {
+ return false;
+ } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ bool ShouldPrepare() const {
+ if (IsDeprecated() || IsAlias()) {
+ return false;
+ } else if (IsEnabled(OptionTypeFlags::kDontPrepare)) {
+ return false;
+ } else {
+ return (prepare_func_ != nullptr || IsConfigurable());
+ }
+ }
+
+ bool ShouldValidate() const {
+ if (IsDeprecated() || IsAlias()) {
+ return false;
+ } else {
+ return (validate_func_ != nullptr || IsConfigurable());
+ }
+ }
+
+ // Returns true if the option is allowed to be null.
+ // Options can be null if the verification type is allow from null
+ // or if the flags specify allow null.
+ bool CanBeNull() const {
+ return (IsEnabled(OptionTypeFlags::kAllowNull) ||
+ IsEnabled(OptionVerificationType::kByNameAllowNull) ||
+ IsEnabled(OptionVerificationType::kByNameAllowFromNull));
+ }
+
+ bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); }
+
+ bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); }
+
+ bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); }
+
+ bool IsByName() const {
+ return (verification_ == OptionVerificationType::kByName ||
+ verification_ == OptionVerificationType::kByNameAllowNull ||
+ verification_ == OptionVerificationType::kByNameAllowFromNull);
+ }
+
+ bool IsStruct() const { return (type_ == OptionType::kStruct); }
+
+ bool IsConfigurable() const {
+ return (type_ == OptionType::kConfigurable ||
+ type_ == OptionType::kCustomizable);
+ }
+
+ bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); }
+
+ inline const void* GetOffset(const void* base) const {
+ return static_cast<const char*>(base) + offset_;
+ }
+
+ inline void* GetOffset(void* base) const {
+ return static_cast<char*>(base) + offset_;
+ }
+
+ template <typename T>
+ const T* GetOffsetAs(const void* base) const {
+ const void* addr = GetOffset(base);
+ return static_cast<const T*>(addr);
+ }
+
+ template <typename T>
+ T* GetOffsetAs(void* base) const {
+ void* addr = GetOffset(base);
+ return static_cast<T*>(addr);
+ }
+
+ // Returns the underlying pointer for the type at base_addr
+ // The value returned is the underlying "raw" pointer, offset from base.
+ template <typename T>
+ const T* AsRawPointer(const void* const base_addr) const {
+ if (base_addr == nullptr) {
+ return nullptr;
+ }
+ if (IsUniquePtr()) {
+ const auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+ return ptr->get();
+ } else if (IsSharedPtr()) {
+ const auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+ return ptr->get();
+ } else if (IsRawPtr()) {
+ const T* const* ptr = GetOffsetAs<T* const>(base_addr);
+ return *ptr;
+ } else {
+ return GetOffsetAs<T>(base_addr);
+ }
+ }
+
+ // Returns the underlying pointer for the type at base_addr
+ // The value returned is the underlying "raw" pointer, offset from base.
+ template <typename T>
+ T* AsRawPointer(void* base_addr) const {
+ if (base_addr == nullptr) {
+ return nullptr;
+ }
+ if (IsUniquePtr()) {
+ auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+ return ptr->get();
+ } else if (IsSharedPtr()) {
+ auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+ return ptr->get();
+ } else if (IsRawPtr()) {
+ auto ptr = GetOffsetAs<T*>(base_addr);
+ return *ptr;
+ } else {
+ return GetOffsetAs<T>(base_addr);
+ }
+ }
+
+ // Parses the option in "opt_value" according to the rules of this class
+ // and updates the value at "opt_ptr".
+ // On success, Status::OK() is returned. On failure:
+ // NotFound means the opt_name is not valid for this option
+ // NotSupported means we do not know how to parse the value for this option
+ // InvalidArgument means the opt_value is not valid for this option.
+ Status Parse(const ConfigOptions& config_options, const std::string& opt_name,
+ const std::string& opt_value, void* const opt_ptr) const;
+
+ // Serializes the option in "opt_addr" according to the rules of this class
+ // into the value at "opt_value".
+ Status Serialize(const ConfigOptions& config_options,
+ const std::string& opt_name, const void* const opt_ptr,
+ std::string* opt_value) const;
+
+ // Compares the "addr1" and "addr2" values according to the rules of this
+ // class and returns true if they match. On a failed match, mismatch is the
+ // name of the option that failed to match.
+ bool AreEqual(const ConfigOptions& config_options,
+ const std::string& opt_name, const void* const addr1,
+ const void* const addr2, std::string* mismatch) const;
+
+ // Used to override the match rules for "ByName" options.
+ bool AreEqualByName(const ConfigOptions& config_options,
+ const std::string& opt_name, const void* const this_ptr,
+ const void* const that_ptr) const;
+ bool AreEqualByName(const ConfigOptions& config_options,
+ const std::string& opt_name, const void* const this_ptr,
+ const std::string& that_value) const;
+
+ Status Prepare(const ConfigOptions& config_options, const std::string& name,
+ void* opt_ptr) const;
+ Status Validate(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts,
+ const std::string& name, const void* opt_ptr) const;
+
+  // Parses the input opts_map according to the type_map for the opt_addr.
+  // For each name-value pair in opts_map, find the corresponding name in
+  // type_map. If the name is found:
+ // - set the corresponding value in opt_addr, returning the status on
+ // failure;
+ // If the name is not found:
+ // - If unused is specified, add the name-value to unused and continue
+  // - If ignore_unknown_options is false, return NotFound
+ // Returns OK if all options were either:
+ // - Successfully set
+ // - options were not found and ignore_unknown_options=true
+ // - options were not found and unused was specified
+ // Note that this method is much less sophisticated than the comparable
+ // Configurable::Configure methods. For example, on error, there is no
+ // attempt to return opt_addr to the initial state. Additionally, there
+ // is no effort to initialize (Configurable::PrepareOptions) the object
+ // on success. This method should typically only be used for simpler,
+ // standalone structures and not those that contain shared and embedded
+ // objects.
+ static Status ParseType(
+ const ConfigOptions& config_options, const std::string& opts_str,
+ const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+ void* opt_addr,
+ std::unordered_map<std::string, std::string>* unused = nullptr);
+ static Status ParseType(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+ void* opt_addr,
+ std::unordered_map<std::string, std::string>* unused = nullptr);
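+  // A hedged usage sketch of ParseType (the struct "MyOpts" and its
+  // "type_map" are hypothetical and assumed to be registered elsewhere):
+  //
+  //   MyOpts opts;
+  //   std::unordered_map<std::string, std::string> unused;
+  //   Status s = OptionTypeInfo::ParseType(
+  //       config_options, "a=1;b=true", type_map, &opts, &unused);
+  //   // On success, the registered fields of opts are set; unrecognized
+  //   // names land in "unused" instead of failing the whole parse.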
+
+  // Parses the input value according to the map for the struct at opt_addr.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated. This may
+  // be the whole struct or a sub-element of it, based on struct_name and
+  // opt_name.
+ static Status ParseStruct(
+ const ConfigOptions& config_options, const std::string& struct_name,
+ const std::unordered_map<std::string, OptionTypeInfo>* map,
+ const std::string& opt_name, const std::string& value, void* opt_addr);
+
+ // Serializes the values from opt_addr using the rules in type_map.
+ // Returns the serialized form in result.
+ // Returns OK on success or non-OK if some option could not be serialized.
+ static Status SerializeType(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+ const void* opt_addr, std::string* value);
+
+  // Serializes the input opt_addr according to the map for the struct into
+  // value.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated. This may
+  // be the whole struct or a sub-element of it.
+ static Status SerializeStruct(
+ const ConfigOptions& config_options, const std::string& struct_name,
+ const std::unordered_map<std::string, OptionTypeInfo>* map,
+ const std::string& opt_name, const void* opt_addr, std::string* value);
+
+ // Compares the values in this_addr and that_addr using the rules in type_map.
+ // If the values are equal, returns true
+ // If the values are not equal, returns false and sets mismatch to the name
+ // of the first value that did not match.
+ static bool TypesAreEqual(
+ const ConfigOptions& config_options,
+ const std::unordered_map<std::string, OptionTypeInfo>& map,
+ const void* this_addr, const void* that_addr, std::string* mismatch);
+
+ // Compares the input offsets according to the map for the struct and returns
+ // true if they are equivalent, false otherwise.
+  // struct_name is the name of the struct option as registered.
+  // opt_name is the name of the option being evaluated. This may
+  // be the whole struct or a sub-element of it.
+ static bool StructsAreEqual(
+ const ConfigOptions& config_options, const std::string& struct_name,
+ const std::unordered_map<std::string, OptionTypeInfo>* map,
+ const std::string& opt_name, const void* this_offset,
+ const void* that_offset, std::string* mismatch);
+
+ // Finds the entry for the opt_name in the opt_map, returning
+ // nullptr if not found.
+  // If found, elem_name will be the name of the option to find.
+ // This may be opt_name, or a substring of opt_name.
+ // For "simple" options, opt_name will be equal to elem_name. Given the
+ // opt_name "opt", elem_name will equal "opt".
+ // For "embedded" options (like structs), elem_name may be opt_name
+ // or a field within the opt_name. For example, given the struct "struct",
+ // and opt_name of "struct.field", elem_name will be "field"
+ static const OptionTypeInfo* Find(
+ const std::string& opt_name,
+ const std::unordered_map<std::string, OptionTypeInfo>& opt_map,
+ std::string* elem_name);
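+  // Illustrative sketch (assumes opt_map registers an entry named "struct"):
+  //
+  //   std::string elem;
+  //   const OptionTypeInfo* info =
+  //       OptionTypeInfo::Find("struct.field", opt_map, &elem);
+  //   // If found, info describes the "struct" entry and elem == "field".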
+
+ // Returns the next token marked by the delimiter from "opts" after start in
+ // token and updates end to point to where that token stops. Delimiters inside
+ // of braces are ignored. Returns OK if a token is found and an error if the
+ // input opts string is mis-formatted.
+ // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points
+ // to "b" Given "{a=A;b=B}", the token would be "a=A;b=B"
+ //
+ // @param opts The string in which to find the next token
+ // @param delimiter The delimiter between tokens
+ // @param start The position in opts to start looking for the token
+  // @param end Returns the end position in opts of the token
+ // @param token Returns the token
+ // @returns OK if a token was found
+ // @return InvalidArgument if the braces mismatch
+ // (e.g. "{a={b=c;}" ) -- missing closing brace
+ // @return InvalidArgument if an expected delimiter is not found
+ // e.g. "{a=b}c=d;" -- missing delimiter before "c"
+ static Status NextToken(const std::string& opts, char delimiter, size_t start,
+ size_t* end, std::string* token);
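+  // A hedged usage sketch walking every token in an options string:
+  //
+  //   std::string opts = "a=AA;b=BB;{c=CC;d=DD}";
+  //   size_t start = 0, end = 0;
+  //   while (start < opts.size() && end != std::string::npos) {
+  //     std::string token;
+  //     Status s = OptionTypeInfo::NextToken(opts, ';', start, &end, &token);
+  //     if (!s.ok()) break;  // mismatched braces or a missing delimiter
+  //     // token is "a=AA", then "b=BB", then "c=CC;d=DD" (braces stripped)
+  //     start = end + 1;
+  //   }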
+
+ constexpr static const char* kIdPropName() { return "id"; }
+ constexpr static const char* kIdPropSuffix() { return ".id"; }
+
+ private:
+ int offset_;
+
+ // The optional function to convert a string to its representation
+ ParseFunc parse_func_;
+
+ // The optional function to convert a value to its string representation
+ SerializeFunc serialize_func_;
+
+ // The optional function to match two option values
+ EqualsFunc equals_func_;
+
+ PrepareFunc prepare_func_;
+ ValidateFunc validate_func_;
+ OptionType type_;
+ OptionVerificationType verification_;
+ OptionTypeFlags flags_;
+};
+
+// Parses the input value into elements of the result array, which has a
+// fixed size. For example, if value=1:2:3 and elem_info parses integers, the
+// result array will be {1,2,3}. The array size is defined in the
+// OptionTypeInfo, and the number of elements in the input value must match it.
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+// example)
+// @param name The name associated with this array option
+// @param value The input string to parse into tokens
+// @param result Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parsed
+// @return InvalidArgument if the value is improperly formed, if the element
+//         count doesn't match the array size defined in the OptionTypeInfo,
+//         or if a token could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T, size_t kSize>
+Status ParseArray(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::string& value,
+ std::array<T, kSize>* result) {
+ Status status;
+
+ ConfigOptions copy = config_options;
+ copy.ignore_unsupported_options = false;
+ size_t i = 0, start = 0, end = 0;
+ for (; status.ok() && i < kSize && start < value.size() &&
+ end != std::string::npos;
+ i++, start = end + 1) {
+ std::string token;
+ status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+ if (status.ok()) {
+ status = elem_info.Parse(copy, name, token, &((*result)[i]));
+ if (config_options.ignore_unsupported_options &&
+ status.IsNotSupported()) {
+ // If we were ignoring unsupported options and this one should be
+ // ignored, ignore it by setting the status to OK
+ status = Status::OK();
+ }
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ // make sure the element number matches the array size
+ if (i < kSize) {
+ return Status::InvalidArgument(
+ "Serialized value has less elements than array size", name);
+ }
+ if (start < value.size() && end != std::string::npos) {
+ return Status::InvalidArgument(
+ "Serialized value has more elements than array size", name);
+ }
+ return status;
+}
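+// A minimal usage sketch, assuming "elem_info" describes an int option:
+//
+//   std::array<int, 3> result;
+//   Status s = ParseArray<int, 3>(config_options, elem_info, ':',
+//                                 "my_array", "1:2:3", &result);
+//   // On success, result == {1, 2, 3}; "1:2" or "1:2:3:4" would instead
+//   // return InvalidArgument because the element count must equal kSize.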
+
+// Serializes the fixed-size input array into its output value. Elements are
+// separated by the separator character. This function converts all of the
+// elements of the array into their serialized form, using elem_info to
+// perform the serialization.
+// For example, if the array contains the integers 1,2,3 and elem_info
+// serializes integers, the output would be "1:2:3" for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name The name associated with this array option
+// @param array The input array to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if the value is improperly formed or if the token
+// could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name,
+ const std::array<T, kSize>& array, std::string* value) {
+ std::string result;
+ ConfigOptions embedded = config_options;
+ embedded.delimiter = ";";
+ int printed = 0;
+ for (const auto& elem : array) {
+ std::string elem_str;
+ Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+ if (!s.ok()) {
+ return s;
+ } else if (!elem_str.empty()) {
+ if (printed++ > 0) {
+ result += separator;
+ }
+ // If the element contains embedded separators, put it inside of brackets
+ if (elem_str.find(separator) != std::string::npos) {
+ result += "{" + elem_str + "}";
+ } else {
+ result += elem_str;
+ }
+ }
+ }
+ if (result.find("=") != std::string::npos) {
+ *value = "{" + result + "}";
+ } else if (printed > 1 && result.at(0) == '{') {
+ *value = "{" + result + "}";
+ } else {
+ *value = result;
+ }
+ return Status::OK();
+}
+
+// Compares the input arrays array1 and array2 for equality
+// Elements of the array are compared one by one using elem_info to perform the
+// comparison.
+//
+// @param config_options Controls how the arrays are compared.
+// @param elem_info Controls how individual elements in the arrays are compared
+// @param name The name associated with this array option
+// @param array1,array2 The arrays to compare.
+// @param mismatch If the arrays are not equivalent, mismatch will point to
+// the first element of the comparison that did not match.
+// @return true if array1 and array2 are "equal", false otherwise
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, const std::string& name,
+ const std::array<T, kSize>& array1,
+ const std::array<T, kSize>& array2, std::string* mismatch) {
+ assert(array1.size() == kSize);
+ assert(array2.size() == kSize);
+ for (size_t i = 0; i < kSize; ++i) {
+ if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i],
+ mismatch)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Parses the input value into elements of the result vector. This method
+// will break the input value into the individual tokens (based on the
+// separator), where each of those tokens will be parsed based on the rules of
+// elem_info. The result vector will be populated with elements based on the
+// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses
+// integers, the result vector will contain the integers 1,2,3,4,5
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+// example)
+// @param name The name associated with this vector option
+// @param value The input string to parse into tokens
+// @param result Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parsed
+// @return InvalidArgument if the value is improperly formed or if the token
+// could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::string& value,
+ std::vector<T>* result) {
+ result->clear();
+ Status status;
+
+  // Turn off ignore_unsupported_options so we can tell whether each returned
+  // element is valid or not.
+ ConfigOptions copy = config_options;
+ copy.ignore_unsupported_options = false;
+ for (size_t start = 0, end = 0;
+ status.ok() && start < value.size() && end != std::string::npos;
+ start = end + 1) {
+ std::string token;
+ status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+ if (status.ok()) {
+ T elem;
+ status = elem_info.Parse(copy, name, token, &elem);
+ if (status.ok()) {
+ result->emplace_back(elem);
+ } else if (config_options.ignore_unsupported_options &&
+ status.IsNotSupported()) {
+ // If we were ignoring unsupported options and this one should be
+ // ignored, ignore it by setting the status to OK
+ status = Status::OK();
+ }
+ }
+ }
+ return status;
+}
+
+// Serializes the input vector into its output value. Elements are
+// separated by the separator character. This function converts all of the
+// elements of vec into their serialized form, using elem_info to perform the
+// serialization.
+// For example, if vec contains the integers 1,2,3,4,5 and elem_info
+// serializes integers, the output would be "1:2:3:4:5" for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name The name associated with this vector option
+// @param vec The input vector to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if the value is improperly formed or if the token
+// could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, char separator,
+ const std::string& name, const std::vector<T>& vec,
+ std::string* value) {
+ std::string result;
+ ConfigOptions embedded = config_options;
+ embedded.delimiter = ";";
+ int printed = 0;
+ for (const auto& elem : vec) {
+ std::string elem_str;
+ Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+ if (!s.ok()) {
+ return s;
+ } else if (!elem_str.empty()) {
+ if (printed++ > 0) {
+ result += separator;
+ }
+ // If the element contains embedded separators, put it inside of brackets
+ if (elem_str.find(separator) != std::string::npos) {
+ result += "{" + elem_str + "}";
+ } else {
+ result += elem_str;
+ }
+ }
+ }
+ if (result.find("=") != std::string::npos) {
+ *value = "{" + result + "}";
+ } else if (printed > 1 && result.at(0) == '{') {
+ *value = "{" + result + "}";
+ } else {
+ *value = result;
+ }
+ return Status::OK();
+}
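+// A minimal usage sketch, assuming "elem_info" serializes int elements:
+//
+//   std::vector<int> vec = {1, 2, 3, 4, 5};
+//   std::string serialized;
+//   Status s = SerializeVector<int>(config_options, elem_info, ':',
+//                                   "my_vector", vec, &serialized);
+//   // On success, serialized == "1:2:3:4:5", which ParseVector can read back.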
+
+// Compares the input vectors vec1 and vec2 for equality
+// If the vectors are the same size, elements of the vectors are compared one by
+// one using elem_info to perform the comparison.
+//
+// @param config_options Controls how the vectors are compared.
+// @param elem_info Controls how individual elements in the vectors are compared
+// @param name The name associated with this vector option
+// @param vec1,vec2 The vectors to compare.
+// @param mismatch If the vectors are not equivalent, mismatch will point to
+//                 the first element of the comparison that did not match.
+// @return true If vec1 and vec2 are "equal", false otherwise
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+ const OptionTypeInfo& elem_info, const std::string& name,
+ const std::vector<T>& vec1, const std::vector<T>& vec2,
+ std::string* mismatch) {
+ if (vec1.size() != vec2.size()) {
+ *mismatch = name;
+ return false;
+ } else {
+ for (size_t i = 0; i < vec1.size(); ++i) {
+ if (!elem_info.AreEqual(
+ config_options, name, reinterpret_cast<const char*>(&vec1[i]),
+ reinterpret_cast<const char*>(&vec2[i]), mismatch)) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
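+// A minimal usage sketch, assuming "elem_info" compares int elements:
+//
+//   std::vector<int> v1 = {1, 2, 3}, v2 = {1, 9, 3};
+//   std::string mismatch;
+//   bool same = VectorsAreEqual<int>(config_options, elem_info, "my_vector",
+//                                    v1, v2, &mismatch);
+//   // same == false and mismatch names the option that differed.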
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h
new file mode 100644
index 000000000..064c087f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_util.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file contains utility functions for RocksDB Options.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+// Constructs the DBOptions and ColumnFamilyDescriptors by loading the
+// latest RocksDB options file stored in the specified rocksdb database.
+//
+// Note that all of the pointer options (except table_factory, which will
+// be described in more detail below) will be initialized with the default
+// values. Developers can further initialize them after this function call.
+// Below is an example list of pointer options which will be initialized
+//
+// * env
+// * memtable_factory
+// * compaction_filter_factory
+// * prefix_extractor
+// * comparator
+// * merge_operator
+// * compaction_filter
+//
+// Users can also choose to load customized comparator, env, and/or
+// merge_operator through object registry:
+// * comparator needs to be registered through Registrar<const Comparator>
+// * env needs to be registered through Registrar<Env>
+// * merge operator needs to be registered through
+// Registrar<std::shared_ptr<MergeOperator>>.
+//
+// For table_factory, this function further supports deserializing
+// BlockBasedTableFactory and its BlockBasedTableOptions except the
+// pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+// block_cache, and block_cache_compressed), which will be initialized with
+// default values. Developers can further specify these three options by
+// casting the return value of TableFactory::GetOptions() to
+// BlockBasedTableOptions and making necessary changes.
+//
+// ignore_unknown_options can be set to true if you want to ignore options
+// that are from a newer version of the db, essentially for forward
+// compatibility.
+//
+// config_options contains a set of options that control the processing
+// of the options. The LoadLatestOptions(ConfigOptions...) overload should be
+// preferred; the alternative signature may be deprecated in a future release.
+// The equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// examples/options_file_example.cc demonstrates how to use this function
+// to open a RocksDB instance.
+//
+// @return OK on success. If the specified "dbpath" does not contain any
+//         options file, then Status::NotFound will be returned. A return
+//         value other than Status::OK or Status::NotFound indicates an
+//         error related to the options file itself.
+//
+// @see LoadOptionsFromFile
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options = false,
+ std::shared_ptr<Cache>* cache = {});
+Status LoadLatestOptions(const ConfigOptions& config_options,
+ const std::string& dbpath, DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ std::shared_ptr<Cache>* cache = {});
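+// A hedged sketch of reopening a DB with its persisted options (the path is
+// a placeholder; error handling elided):
+//
+//   ConfigOptions config;
+//   DBOptions db_opts;
+//   std::vector<ColumnFamilyDescriptor> cf_descs;
+//   Status s = LoadLatestOptions(config, "/tmp/testdb", &db_opts, &cf_descs);
+//   if (s.ok()) {
+//     std::vector<ColumnFamilyHandle*> handles;
+//     DB* db = nullptr;
+//     s = DB::Open(db_opts, "/tmp/testdb", cf_descs, &handles, &db);
+//   }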
+
+// Similar to LoadLatestOptions, this function constructs the DBOptions
+// and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+//
+// The LoadOptionsFromFile(ConfigOptions...) overload should be preferred;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// @see LoadLatestOptions
+Status LoadOptionsFromFile(const std::string& options_file_name, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options = false,
+ std::shared_ptr<Cache>* cache = {});
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+ const std::string& options_file_name,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ std::shared_ptr<Cache>* cache = {});
+
+// Returns the latest options file name under the specified db path.
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+ std::string* options_file_name);
+
+// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors
+// are compatible with the latest options stored in the specified DB path.
+//
+// If the return status is non-ok, it means the specified RocksDB instance
+// might not be correctly opened with the input set of options. Currently,
+// changing one of the following options will fail the compatibility check:
+//
+// * comparator
+// * prefix_extractor
+// * table_factory
+// * merge_operator
+Status CheckOptionsCompatibility(
+ const std::string& dbpath, Env* env, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& cf_descs,
+ bool ignore_unknown_options = false);
+Status CheckOptionsCompatibility(
+ const ConfigOptions& config_options, const std::string& dbpath,
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& cf_descs);
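+// A hedged sketch: verify the in-memory options before opening the DB
+// (db_options and cf_descs assumed to be populated elsewhere):
+//
+//   ConfigOptions config;
+//   Status s = CheckOptionsCompatibility(config, "/tmp/testdb", db_options,
+//                                        cf_descs);
+//   // A non-OK status means e.g. the comparator or merge_operator changed
+//   // in a way that may make the existing data unreadable.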
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/replayer.h b/src/rocksdb/include/rocksdb/utilities/replayer.h
new file mode 100644
index 000000000..4fdd8d73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/replayer.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceRecord;
+class TraceRecordResult;
+
+struct ReplayOptions {
+  // Number of threads used for replaying. If 0 or 1, the replay is done
+  // using a single thread.
+ uint32_t num_threads;
+
+  // Enables fast-forwarding a replay by increasing/reducing the delay between
+  // the ingested traces.
+  // If > 0.0 and < 1.0, slow down the replay by this amount.
+  // If 1.0, replay the operations at the same rate as in the trace stream.
+  // If > 1.0, speed up the replay by this amount.
+ double fast_forward;
+
+ ReplayOptions() : num_threads(1), fast_forward(1.0) {}
+
+ ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio)
+ : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {}
+};
+
+// Replayer helps to replay the captured RocksDB query-level operations.
+// A Replayer can either be created via the DB::NewDefaultReplayer method, or
+// be instantiated via db_bench when using the "replay" benchmark.
+class Replayer {
+ public:
+ virtual ~Replayer() = default;
+
+ // Make some preparation before replaying the trace. This will also reset the
+ // replayer in order to restart replaying.
+ virtual Status Prepare() = 0;
+
+ // Return the timestamp when the trace recording was started.
+ virtual uint64_t GetHeaderTimestamp() const = 0;
+
+ // Atomically read one trace into a TraceRecord (excluding the header and
+ // footer traces).
+  // Return Status::OK() on success;
+  // Status::Incomplete() if Prepare() was not called or no more traces are
+  // available;
+  // Status::NotSupported() if the read trace type is not supported.
+ virtual Status Next(std::unique_ptr<TraceRecord>* record) = 0;
+
+ // Execute one TraceRecord.
+ // Return Status::OK() if the execution was successful. Get/MultiGet traces
+ // will still return Status::OK() even if they got Status::NotFound()
+ // from DB::Get() or DB::MultiGet();
+  // Status::Incomplete() if Prepare() was not called or no more traces are
+  // available;
+ // Status::NotSupported() if the operation is not supported;
+ // Otherwise, return the corresponding error status.
+ //
+ // The actual operation execution status and result(s) will be saved in
+ // result. For example, a GetQueryTraceRecord will have its DB::Get() status
+ // and the returned value saved in a SingleValueTraceExecutionResult.
+ virtual Status Execute(const std::unique_ptr<TraceRecord>& record,
+ std::unique_ptr<TraceRecordResult>* result) = 0;
+
+ // Replay all the traces from the provided trace stream, taking the delay
+ // between the traces into consideration.
+ //
+ // result_callback reports the status of executing a trace record, and the
+ // actual operation execution result (See the description for Execute()).
+ virtual Status Replay(
+ const ReplayOptions& options,
+ const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+ result_callback) = 0;
+};
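+// A hedged sketch of a replay session (trace_reader is assumed to wrap a
+// previously captured trace file; error handling elided):
+//
+//   std::unique_ptr<Replayer> replayer;
+//   db->NewDefaultReplayer(handles, std::move(trace_reader), &replayer);
+//   replayer->Prepare();
+//   replayer->Replay(
+//       ReplayOptions(4 /* num_threads */, 2.0 /* 2x fast_forward */),
+//       [](Status s, std::unique_ptr<TraceRecordResult>&& result) {
+//         // inspect the per-record execution status/result here
+//       });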
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
new file mode 100644
index 000000000..a682c7748
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCache;
+
+// For instrumentation purposes, use NewSimCache instead of the NewLRUCache
+// API. NewSimCache is a wrapper function returning a SimCache instance that,
+// besides the Cache interface, provides the additional interface of the
+// SimCache class to predict the block cache hit rate without actually
+// allocating the memory. It can help users tune their current block cache
+// size, and determine how efficiently they are using the memory.
+//
+// Since GetSimCapacity() returns the capacity for simulation, it differs from
+// the actual memory usage, which can be estimated as:
+// sim_capacity * entry_size / (entry_size + block_size),
+// where 76 <= entry_size <= 104 and
+// BlockBasedTableOptions.block_size = 4096 by default but is configurable.
+// Therefore, the actual memory overhead of SimCache is generally less than
+// sim_capacity * 2%.
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+ size_t sim_capacity,
+ int num_shard_bits);
+
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+ std::shared_ptr<Cache> cache,
+ int num_shard_bits);
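+// A hedged usage sketch: wrap a real 1 GiB block cache and simulate an 8 GiB
+// one to estimate the hit rate of a larger cache (sizes are illustrative):
+//
+//   std::shared_ptr<Cache> real_cache = NewLRUCache(1ULL << 30);
+//   std::shared_ptr<SimCache> sim =
+//       NewSimCache(real_cache, 8ULL << 30, /*num_shard_bits=*/6);
+//   BlockBasedTableOptions table_opts;
+//   table_opts.block_cache = sim;
+//   // ... run the workload, then inspect:
+//   // sim->get_hit_counter(), sim->get_miss_counter(), sim->ToString()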
+
+class SimCache : public Cache {
+ public:
+ SimCache() {}
+
+ ~SimCache() override {}
+
+ const char* Name() const override { return "SimCache"; }
+
+ // returns the maximum configured capacity of the simcache for simulation
+ virtual size_t GetSimCapacity() const = 0;
+
+  // The simcache doesn't expose internal handle references to the user, so
+  // PinnedUsage is always 0, and the behavior will not be exactly consistent
+  // with a real cache.
+  // returns the memory size for the entries residing in the simcache.
+ virtual size_t GetSimUsage() const = 0;
+
+ // sets the maximum configured capacity of the simcache. When the new
+ // capacity is less than the old capacity and the existing usage is
+ // greater than new capacity, the implementation will purge old entries
+ // to fit new capacity.
+ virtual void SetSimCapacity(size_t capacity) = 0;
+
+  // returns the miss count of the simcache
+  virtual uint64_t get_miss_counter() const = 0;
+  // returns the hit count of the simcache
+  virtual uint64_t get_hit_counter() const = 0;
+  // resets the miss and hit counters
+  virtual void reset_counter() = 0;
+ // String representation of the statistics of the simcache
+ virtual std::string ToString() const = 0;
+
+  // Start storing logs of the cache activity (Add/Lookup) into a file located
+  // at activity_log_file. The max_logging_size option can be used to stop
+  // logging to the file automatically after reaching a specific size in
+  // bytes; a value of 0 disables this feature.
+ virtual Status StartActivityLogging(const std::string& activity_log_file,
+ Env* env,
+ uint64_t max_logging_size = 0) = 0;
+
+ // Stop cache activity logging if any
+ virtual void StopActivityLogging() = 0;
+
+ // Status of cache logging happening in background
+ virtual Status GetActivityLoggingStatus() = 0;
+
+ private:
+ SimCache(const SimCache&);
+ SimCache& operator=(const SimCache&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 000000000..9b13c3bdf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,566 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/db.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class contains APIs to stack RocksDB wrappers, e.g. stack TTL over
+// the base DB.
+class StackableDB : public DB {
+ public:
+  // StackableDB takes sole ownership of the underlying db.
+ explicit StackableDB(DB* db) : db_(db) {}
+
+  // StackableDB takes shared ownership of the underlying db.
+ explicit StackableDB(std::shared_ptr<DB> db)
+ : db_(db.get()), shared_db_ptr_(db) {}
+
+ ~StackableDB() {
+ if (shared_db_ptr_ == nullptr) {
+ delete db_;
+ } else {
+ assert(shared_db_ptr_.get() == db_);
+ }
+ db_ = nullptr;
+ }
+
+ virtual Status Close() override { return db_->Close(); }
+
+ virtual DB* GetBaseDB() { return db_; }
+
+ virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) override {
+ return db_->CreateColumnFamily(options, column_family_name, handle);
+ }
+
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override {
+ return db_->CreateColumnFamilies(options, column_family_names, handles);
+ }
+
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override {
+ return db_->CreateColumnFamilies(column_families, handles);
+ }
+
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+ return db_->DropColumnFamily(column_family);
+ }
+
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override {
+ return db_->DropColumnFamilies(column_families);
+ }
+
+ virtual Status DestroyColumnFamilyHandle(
+ ColumnFamilyHandle* column_family) override {
+ return db_->DestroyColumnFamilyHandle(column_family);
+ }
+
+ using DB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& val) override {
+ return db_->Put(options, column_family, key, val);
+ }
+ Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& val) override {
+ return db_->Put(options, column_family, key, ts, val);
+ }
+
+ using DB::PutEntity;
+ Status PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) override {
+ return db_->PutEntity(options, column_family, key, columns);
+ }
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override {
+ return db_->Get(options, column_family, key, value);
+ }
+
+ using DB::GetEntity;
+ Status GetEntity(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableWideColumns* columns) override {
+ return db_->GetEntity(options, column_family, key, columns);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* slice,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ return db_->GetMergeOperands(options, column_family, key, slice,
+ get_merge_operands_options,
+ number_of_operands);
+ }
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override {
+ return db_->MultiGet(options, column_family, keys, values);
+ }
+
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override {
+ return db_->MultiGet(options, column_family, num_keys, keys, values,
+ statuses, sorted_input);
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) override {
+ return db_->IngestExternalFile(column_family, external_files, options);
+ }
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override {
+ return db_->IngestExternalFiles(args);
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override {
+ return db_->CreateColumnFamilyWithImport(options, column_family_name,
+ import_options, metadata, handle);
+ }
+
+ using DB::VerifyFileChecksums;
+ Status VerifyFileChecksums(const ReadOptions& read_opts) override {
+ return db_->VerifyFileChecksums(read_opts);
+ }
+
+ virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
+
+ virtual Status VerifyChecksum(const ReadOptions& options) override {
+ return db_->VerifyChecksum(options);
+ }
+
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override {
+ return db_->KeyMayExist(options, column_family, key, value, value_found);
+ }
+
+ using DB::Delete;
+ virtual Status Delete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override {
+ return db_->Delete(wopts, column_family, key);
+ }
+ Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) override {
+ return db_->Delete(wopts, column_family, key, ts);
+ }
+
+ using DB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override {
+ return db_->SingleDelete(wopts, column_family, key);
+ }
+ Status SingleDelete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override {
+ return db_->SingleDelete(wopts, column_family, key, ts);
+ }
+
+ using DB::DeleteRange;
+ Status DeleteRange(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family, const Slice& start_key,
+ const Slice& end_key) override {
+ return db_->DeleteRange(wopts, column_family, start_key, end_key);
+ }
+
+ using DB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override {
+ return db_->Merge(options, column_family, key, value);
+ }
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) override {
+ return db_->Merge(options, column_family, key, ts, value);
+ }
+
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override {
+ return db_->Write(opts, updates);
+ }
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& opts,
+ ColumnFamilyHandle* column_family) override {
+ return db_->NewIterator(opts, column_family);
+ }
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override {
+ return db_->NewIterators(options, column_families, iterators);
+ }
+
+ virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+ return db_->ReleaseSnapshot(snapshot);
+ }
+
+ using DB::GetMapProperty;
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override {
+ return db_->GetProperty(column_family, property, value);
+ }
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override {
+ return db_->GetMapProperty(column_family, property, value);
+ }
+
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override {
+ return db_->GetIntProperty(column_family, property, value);
+ }
+
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* value) override {
+ return db_->GetAggregatedIntProperty(property, value);
+ }
+
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* r, int n,
+ uint64_t* sizes) override {
+ return db_->GetApproximateSizes(options, column_family, r, n, sizes);
+ }
+
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ return db_->GetApproximateMemTableStats(column_family, range, count, size);
+ }
+
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override {
+ return db_->CompactRange(options, column_family, begin, end);
+ }
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override {
+ return db_->CompactFiles(compact_options, column_family, input_file_names,
+ output_level, output_path_id, output_file_names,
+ compaction_job_info);
+ }
+
+ virtual Status PauseBackgroundWork() override {
+ return db_->PauseBackgroundWork();
+ }
+ virtual Status ContinueBackgroundWork() override {
+ return db_->ContinueBackgroundWork();
+ }
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override {
+ return db_->EnableAutoCompaction(column_family_handles);
+ }
+
+ virtual void EnableManualCompaction() override {
+ return db_->EnableManualCompaction();
+ }
+ virtual void DisableManualCompaction() override {
+ return db_->DisableManualCompaction();
+ }
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+ return db_->NumberLevels(column_family);
+ }
+
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(
+ ColumnFamilyHandle* column_family) override {
+ return db_->MaxMemCompactionLevel(column_family);
+ }
+
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override {
+ return db_->Level0StopWriteTrigger(column_family);
+ }
+
+ virtual const std::string& GetName() const override { return db_->GetName(); }
+
+ virtual Env* GetEnv() const override { return db_->GetEnv(); }
+
+ virtual FileSystem* GetFileSystem() const override {
+ return db_->GetFileSystem();
+ }
+
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override {
+ return db_->GetOptions(column_family);
+ }
+
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override {
+ return db_->GetDBOptions();
+ }
+
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& fopts,
+ ColumnFamilyHandle* column_family) override {
+ return db_->Flush(fopts, column_family);
+ }
+ virtual Status Flush(
+ const FlushOptions& fopts,
+ const std::vector<ColumnFamilyHandle*>& column_families) override {
+ return db_->Flush(fopts, column_families);
+ }
+
+ virtual Status SyncWAL() override { return db_->SyncWAL(); }
+
+ virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); }
+
+ virtual Status LockWAL() override { return db_->LockWAL(); }
+
+ virtual Status UnlockWAL() override { return db_->UnlockWAL(); }
+
+#ifndef ROCKSDB_LITE
+
+ virtual Status DisableFileDeletions() override {
+ return db_->DisableFileDeletions();
+ }
+
+ virtual Status EnableFileDeletions(bool force) override {
+ return db_->EnableFileDeletions(force);
+ }
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override {
+ db_->GetLiveFilesMetaData(metadata);
+ }
+
+ virtual Status GetLiveFilesChecksumInfo(
+ FileChecksumList* checksum_list) override {
+ return db_->GetLiveFilesChecksumInfo(checksum_list);
+ }
+
+ virtual Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) override {
+ return db_->GetLiveFilesStorageInfo(opts, files);
+ }
+
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) override {
+ db_->GetColumnFamilyMetaData(column_family, cf_meta);
+ }
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override {
+ return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer));
+ }
+
+ Status StartBlockCacheTrace(
+ const BlockCacheTraceOptions& options,
+ std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override {
+ return db_->StartBlockCacheTrace(options, std::move(trace_writer));
+ }
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); }
+
+ using DB::StartIOTrace;
+ Status StartIOTrace(const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override {
+ return db_->StartIOTrace(options, std::move(trace_writer));
+ }
+
+ using DB::EndIOTrace;
+ Status EndIOTrace() override { return db_->EndIOTrace(); }
+
+ using DB::StartTrace;
+ Status StartTrace(const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override {
+ return db_->StartTrace(options, std::move(trace_writer));
+ }
+
+ using DB::EndTrace;
+ Status EndTrace() override { return db_->EndTrace(); }
+
+ using DB::NewDefaultReplayer;
+ Status NewDefaultReplayer(const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader,
+ std::unique_ptr<Replayer>* replayer) override {
+ return db_->NewDefaultReplayer(handles, std::move(reader), replayer);
+ }
+
+#endif // ROCKSDB_LITE
+
+ virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+ bool flush_memtable = true) override {
+ return db_->GetLiveFiles(vec, mfs, flush_memtable);
+ }
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override {
+ return db_->GetLatestSequenceNumber();
+ }
+
+ Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) override {
+ return db_->IncreaseFullHistoryTsLow(column_family, ts_low);
+ }
+
+ Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) override {
+ return db_->GetFullHistoryTsLow(column_family, ts_low);
+ }
+
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+ return db_->GetSortedWalFiles(files);
+ }
+
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override {
+ return db_->GetCurrentWalFile(current_log_file);
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override {
+ return db_->GetCreationTimeOfOldestFile(creation_time);
+ }
+
+ // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+ // operate at the proper level of abstraction for a key-value store, and its
+ // contract/restrictions are poorly documented. For example, it returns non-OK
+ // `Status` for non-bottommost files and files undergoing compaction. Since we
+ // do not plan to maintain it, the contract will likely remain underspecified
+ // until its removal. Any user is encouraged to read the implementation
+ // carefully and migrate away from it when possible.
+ virtual Status DeleteFile(std::string name) override {
+ return db_->DeleteFile(name);
+ }
+
+ virtual Status GetDbIdentity(std::string& identity) const override {
+ return db_->GetDbIdentity(identity);
+ }
+
+ virtual Status GetDbSessionId(std::string& session_id) const override {
+ return db_->GetDbSessionId(session_id);
+ }
+
+ using DB::SetOptions;
+ virtual Status SetOptions(ColumnFamilyHandle* column_family_handle,
+ const std::unordered_map<std::string, std::string>&
+ new_options) override {
+ return db_->SetOptions(column_family_handle, new_options);
+ }
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& new_options)
+ override {
+ return db_->SetDBOptions(new_options);
+ }
+
+ using DB::ResetStats;
+ virtual Status ResetStats() override { return db_->ResetStats(); }
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override {
+ return db_->GetPropertiesOfAllTables(column_family, props);
+ }
+
+ using DB::GetPropertiesOfTablesInRange;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override {
+ return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
+ }
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) override {
+ return db_->GetUpdatesSince(seq_number, iter, read_options);
+ }
+
+ virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin,
+ const Slice* end) override {
+ return db_->SuggestCompactRange(column_family, begin, end);
+ }
+
+ virtual Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override {
+ return db_->PromoteL0(column_family, target_level);
+ }
+
+ virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+ return db_->DefaultColumnFamily();
+ }
+
+#ifndef ROCKSDB_LITE
+ Status TryCatchUpWithPrimary() override {
+ return db_->TryCatchUpWithPrimary();
+ }
+#endif // ROCKSDB_LITE
+
+ protected:
+ DB* db_;
+ std::shared_ptr<DB> shared_db_ptr_;
+};
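+// A hedged sketch of the intended use: derive from StackableDB and override
+// only the methods you want to intercept (the class name is hypothetical):
+//
+//   class CountingDB : public StackableDB {
+//    public:
+//     explicit CountingDB(DB* db) : StackableDB(db) {}
+//     using StackableDB::Put;
+//     Status Put(const WriteOptions& opts, ColumnFamilyHandle* cf,
+//                const Slice& key, const Slice& val) override {
+//       put_count_.fetch_add(1);
+//       return StackableDB::Put(opts, cf, key, val);
+//     }
+//
+//    private:
+//     std::atomic<uint64_t> put_count_{0};
+//   };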
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
new file mode 100644
index 000000000..f3a4ba005
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A factory for a table property collector that marks an SST
+// file as needing compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries, or when the ratio of tombstone
+// entries in the whole file is >= the specified deletion ratio.
+class CompactOnDeletionCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+  // A factory for a table property collector that marks an SST
+  // file as needing compaction when it observes at least "D" deletion
+  // entries in any "N" consecutive entries, or when the ratio of tombstone
+  // entries is >= deletion_ratio.
+ //
+ // @param sliding_window_size "N"
+ // @param deletion_trigger "D"
+  // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+  //     based on deletion ratio.
+ CompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger,
+ double deletion_ratio);
+
+ ~CompactOnDeletionCollectorFactory() {}
+
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override;
+
+  // Change the value of sliding_window_size "N".
+  // Setting it to 0 disables delete-triggered compaction.
+ void SetWindowSize(size_t sliding_window_size) {
+ sliding_window_size_.store(sliding_window_size);
+ }
+ size_t GetWindowSize() const { return sliding_window_size_.load(); }
+
+ // Change the value of deletion_trigger "D"
+ void SetDeletionTrigger(size_t deletion_trigger) {
+ deletion_trigger_.store(deletion_trigger);
+ }
+
+ size_t GetDeletionTrigger() const { return deletion_trigger_.load(); }
+ // Change deletion ratio.
+  // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+  //     based on deletion ratio.
+ void SetDeletionRatio(double deletion_ratio) {
+ deletion_ratio_.store(deletion_ratio);
+ }
+
+ double GetDeletionRatio() const { return deletion_ratio_.load(); }
+ static const char* kClassName() { return "CompactOnDeletionCollector"; }
+ const char* Name() const override { return kClassName(); }
+
+ std::string ToString() const override;
+
+ private:
+ std::atomic<size_t> sliding_window_size_;
+ std::atomic<size_t> deletion_trigger_;
+ std::atomic<double> deletion_ratio_;
+};
+
+// Creates a factory for a table property collector that marks an SST
+// file as needing compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries, or when the ratio of tombstone
+// entries is >= deletion_ratio.
+//
+// @param sliding_window_size "N". Note that this number will be
+//        rounded up to the smallest multiple of 128 that is no less
+//        than the specified size.
+// @param deletion_trigger "D". Note that even when "N" is changed,
+// the specified number for "D" will not be changed.
+// @param deletion_ratio, if <= 0 or > 1, disables triggering compaction
+//        based on deletion ratio. Disabled by default.
+extern std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger,
+ double deletion_ratio = 0);
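+// A hedged usage sketch: mark files for compaction when 1024 deletions are
+// seen in any 8192 consecutive entries, or when half the entries are
+// tombstones (the numbers are illustrative):
+//
+//   Options options;
+//   options.table_properties_collector_factories.emplace_back(
+//       NewCompactOnDeletionCollectorFactory(/*sliding_window_size=*/8192,
+//                                            /*deletion_trigger=*/1024,
+//                                            /*deletion_ratio=*/0.5));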
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h
new file mode 100644
index 000000000..1d2822988
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction.h
@@ -0,0 +1,686 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+class TransactionDB;
+class WriteBatchWithIndex;
+
+using TransactionName = std::string;
+
+using TransactionID = uint64_t;
+
+using TxnTimestamp = uint64_t;
+
+constexpr TxnTimestamp kMaxTxnTimestamp =
+ std::numeric_limits<TxnTimestamp>::max();
+
+/*
+  The Endpoint class allows one to define prefix ranges.
+
+ Prefix ranges are introduced below.
+
+ == Basic Ranges ==
+ Let's start from basic ranges. Key Comparator defines ordering of rowkeys.
+ Then, one can specify finite closed ranges by just providing rowkeys of their
+ endpoints:
+
+ lower_endpoint <= X <= upper_endpoint
+
+  However, our goal is to provide a richer set of endpoints. Read on.
+
+ == Lexicographic ordering ==
+  A lexicographic (or dictionary) ordering satisfies these criteria: if there
+  are two keys of the form
+ key_a = {prefix_a, suffix_a}
+ key_b = {prefix_b, suffix_b}
+ and
+ prefix_a < prefix_b
+ then
+ key_a < key_b.
+
+ == Prefix ranges ==
+  With lexicographic ordering, one may want to define ranges of the form
+
+ "prefix is $PREFIX"
+
+  which translates to a range of the form
+
+ {$PREFIX, -infinity} < X < {$PREFIX, +infinity}
+
+ where -infinity will compare less than any possible suffix, and +infinity
+ will compare as greater than any possible suffix.
+
+  The Endpoint class allows one to define this kind of range.
+
+ == Notes ==
+ BytewiseComparator and ReverseBytewiseComparator produce lexicographic
+ ordering.
+
+ The row comparison function is able to compare key prefixes. If the data
+ domain includes keys A and B, then the comparison function is able to compare
+ equal-length prefixes:
+
+ min_len= min(byte_length(A), byte_length(B));
+ cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid
+
+ == Other options ==
+ As far as MyRocks is concerned, the alternative to prefix ranges would be to
+ support both open (non-inclusive) and closed (inclusive) range endpoints.
+*/
+
+class Endpoint {
+ public:
+ Slice slice;
+
+ /*
+    true : the key has a "+infinity" suffix, i.e. a suffix that would compare
+           as greater than any other suffix
+    false : otherwise
+ */
+ bool inf_suffix;
+
+ explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false)
+ : slice(slice_arg), inf_suffix(inf_suffix_arg) {}
+
+ explicit Endpoint(const char* s, bool inf_suffix_arg = false)
+ : slice(s), inf_suffix(inf_suffix_arg) {}
+
+ Endpoint(const char* s, size_t size, bool inf_suffix_arg = false)
+ : slice(s, size), inf_suffix(inf_suffix_arg) {}
+
+ Endpoint() : inf_suffix(false) {}
+};
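+// A minimal sketch of a prefix range over all keys beginning with "user42"
+// (the prefix is illustrative):
+//
+//   Endpoint lower("user42");                       // {prefix, -infinity}
+//   Endpoint upper("user42", /*inf_suffix=*/true);  // {prefix, +infinity}
+//   // Together these bound every key of the form {"user42", suffix} under a
+//   // lexicographic comparator.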
+
+// Provides notification to the caller of SetSnapshotOnNextOperation when
+// the actual snapshot gets created
+class TransactionNotifier {
+ public:
+ virtual ~TransactionNotifier() {}
+
+ // Implement this method to receive notification when a snapshot is
+ // requested via SetSnapshotOnNextOperation.
+ // Do not take exclusive ownership of `newSnapshot` because it is shared with
+ // the underlying transaction.
+ virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
+};
+
+// Provides BEGIN/COMMIT/ROLLBACK transactions.
+//
+// To use transactions, you must first create either an OptimisticTransactionDB
+// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
+// more information.
+//
+// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
+//
+// It is up to the caller to synchronize access to this object.
+//
+// See examples/transaction_example.cc for some simple examples.
+//
+// TODO(agiardullo): Not yet implemented
+// -PerfContext statistics
+// -Support for using Transactions with DBWithTTL
+class Transaction {
+ public:
+ // No copying allowed
+ Transaction(const Transaction&) = delete;
+ void operator=(const Transaction&) = delete;
+
+ virtual ~Transaction() {}
+
+ // If a transaction has a snapshot set, the transaction will ensure that
+  // any keys successfully written (or fetched via GetForUpdate()) have not
+ // been modified outside of this transaction since the time the snapshot was
+ // set.
+ // If a snapshot has not been set, the transaction guarantees that keys have
+ // not been modified since the time each key was first written (or fetched via
+ // GetForUpdate()).
+ //
+ // Using SetSnapshot() will provide stricter isolation guarantees at the
+ // expense of potentially more transaction failures due to conflicts with
+ // other writes.
+ //
+ // Calling SetSnapshot() has no effect on keys written before this function
+ // has been called.
+ //
+ // SetSnapshot() may be called multiple times if you would like to change
+ // the snapshot used for different operations in this transaction.
+ //
+ // Calling SetSnapshot() will not affect the version of data returned by Get()
+ // methods. See Transaction::Get() for more details.
+ virtual void SetSnapshot() = 0;
+
+ // Similar to SetSnapshot(), but will not change the current snapshot
+ // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
+ // By calling this function, the transaction will essentially call
+ // SetSnapshot() for you right before performing the next write/GetForUpdate.
+ //
+ // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
+ // returned by GetSnapshot() until the next write/GetForUpdate is executed.
+ //
+ // When the snapshot is created the notifier's SnapshotCreated method will
+ // be called so that the caller can get access to the snapshot.
+ //
+ // This is an optimization to reduce the likelihood of conflicts that
+ // could occur in between the time SetSnapshot() is called and the first
+ // write/GetForUpdate operation. E.g., this prevents the following
+ // race condition:
+ //
+ // txn1->SetSnapshot();
+ // txn2->Put("A", ...);
+ // txn2->Commit();
+ // txn1->GetForUpdate(opts, "A", ...); // FAIL!
+ //
+ // WriteCommittedTxn only: a new snapshot will be taken upon the next
+ // operation, and that operation can be a Commit.
+ // TODO(yanqin) remove the "write-committed only" limitation.
+ virtual void SetSnapshotOnNextOperation(
+ std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
+
+ // Returns the Snapshot created by the last call to SetSnapshot().
+ //
+ // REQUIRED: The returned Snapshot is only valid up until the next time
+ // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
+ // is called, or the Transaction is deleted.
+ virtual const Snapshot* GetSnapshot() const = 0;
+
+ // Returns the Snapshot created by the last call to SetSnapshot().
+ // The returned snapshot can outlive the transaction.
+ virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const = 0;
+
+ // Clears the current snapshot (i.e. no snapshot will be 'set')
+ //
+ // This removes any snapshot that currently exists or is set to be created
+ // on the next update operation (SetSnapshotOnNextOperation).
+ //
+ // Calling ClearSnapshot() has no effect on keys written before this function
+ // has been called.
+ //
+ // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
+ // longer be valid and should be discarded after a call to ClearSnapshot().
+ virtual void ClearSnapshot() = 0;
+
+ // Prepare the current transaction for 2PC
+ virtual Status Prepare() = 0;
+
+ // Write all batched keys to the db atomically.
+ //
+ // Returns OK on success.
+ //
+ // May return any error status that could be returned by DB::Write().
+ //
+ // If this transaction was created by an OptimisticTransactionDB(),
+ // Status::Busy() may be returned if the transaction could not guarantee
+ // that there are no write conflicts. Status::TryAgain() may be returned
+ // if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain).
+ //
+ // If this transaction was created by a TransactionDB(), Status::Expired()
+ // may be returned if this transaction has lived for longer than
+ // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if
+ // TransactionOptions.skip_prepare is false and Prepare is not called on this
+ // transaction before Commit.
+ virtual Status Commit() = 0;
+
+ // In addition to Commit(), also creates a snapshot of the db after all
+ // writes by this txn are visible to other readers.
+ // Caller is responsible for ensuring that
+ // snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
+ // in which snapshot1 and snapshot2 are created by this API.
+ //
+ // Currently only supported by WriteCommittedTxn. Calling this method on
+ // other types of transactions will return non-ok Status resulting from
+ // Commit() or a `NotSupported` error.
+ // This method returns OK if and only if the transaction successfully
+ // commits. It is possible that transaction commits successfully but fails to
+ // create a timestamped snapshot. Therefore, the caller should check that the
+ // snapshot is created.
+ // `notifier` will be notified upon the next snapshot creation; it may be
+ // null.
+ // `snapshot` is a non-null output argument storing a shared_ptr to the
+ // newly created snapshot.
+ Status CommitAndTryCreateSnapshot(
+ std::shared_ptr<TransactionNotifier> notifier =
+ std::shared_ptr<TransactionNotifier>(),
+ TxnTimestamp ts = kMaxTxnTimestamp,
+ std::shared_ptr<const Snapshot>* snapshot = nullptr);
+
+ // Discard all batched writes in this transaction.
+ virtual Status Rollback() = 0;
+
+ // Records the state of the transaction for future calls to
+ // RollbackToSavePoint(). May be called multiple times to set multiple save
+ // points.
+ virtual void SetSavePoint() = 0;
+
+ // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
+ // since the most recent call to SetSavePoint() and removes the most recent
+ // SetSavePoint().
+ // If there is no previous call to SetSavePoint(), returns Status::NotFound()
+ virtual Status RollbackToSavePoint() = 0;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ virtual Status PopSavePoint() = 0;
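+
+ // A minimal save point sketch (illustrative only; `txn` is an assumed
+ // Transaction* and `s` a Status):
+ //
+ //   s = txn->Put("A", "a");
+ //   txn->SetSavePoint();
+ //   s = txn->Put("B", "b");
+ //   s = txn->RollbackToSavePoint();  // undoes the Put of "B" only
+ //   s = txn->Commit();               // commits "A"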
+
+ // This function is similar to DB::Get() except it will also read pending
+ // changes in this transaction. Currently, this function will return
+ // Status::MergeInProgress if the most recent write to the queried key in
+ // this batch is a Merge.
+ //
+ // If read_options.snapshot is not set, the current version of the key will
+ // be read. Calling SetSnapshot() does not affect the version of the data
+ // returned.
+ //
+ // Note that setting read_options.snapshot will affect what is read from the
+ // DB but will NOT change which keys are read from this transaction (the keys
+ // in this transaction do not yet belong to any snapshot and will be fetched
+ // regardless).
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value) = 0;
+
+ // An overload of the above method that receives a PinnableSlice
+ // For backward compatibility a default implementation is provided
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ auto s = Get(options, column_family, key, pinnable_val->GetSelf());
+ pinnable_val->PinSelf();
+ return s;
+ }
+
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) = 0;
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ auto s = Get(options, key, pinnable_val->GetSelf());
+ pinnable_val->PinSelf();
+ return s;
+ }
+
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+ virtual std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) = 0;
+
+ // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
+ // expected to override this with an implementation that calls
+ // DBImpl::MultiGet()
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool /*sorted_input*/ = false) {
+ for (size_t i = 0; i < num_keys; ++i) {
+ statuses[i] = Get(options, column_family, keys[i], &values[i]);
+ }
+ }
+
+ // Read this key and ensure that this transaction will only
+ // be able to be committed if this key is not written outside this
+ // transaction after it has first been read (or after the snapshot if a
+ // snapshot is set in this transaction and do_validate is true). If
+ // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
+ // that GetForUpdate returns the latest committed value. The transaction
+ // behavior is the same regardless of whether the key exists or not.
+ //
+ // Note: Currently, this function will return Status::MergeInProgress
+ // if the most recent write to the queried key in this batch is a Merge.
+ //
+ // The values returned by this function are similar to Transaction::Get().
+ // If value==nullptr, then this function will not read any data, but will
+ // still ensure that this key cannot be written to by outside of this
+ // transaction.
+ //
+ // If this transaction was created by an OptimisticTransaction, GetForUpdate()
+ // could cause Commit() to fail. Otherwise, it could return any error
+ // that could be returned by DB::Get().
+ //
+ // If this transaction was created by a TransactionDB, it can return
+ // Status::OK() on success,
+ // Status::Busy() if there is a write conflict,
+ // Status::TimedOut() if a lock could not be acquired,
+ // Status::TryAgain() if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain)
+ // Status::MergeInProgress() if merge operations cannot be resolved.
+ // or other errors if this key could not be read.
+ virtual Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value,
+ bool exclusive = true,
+ const bool do_validate = true) = 0;
+
+ // An overload of the above method that receives a PinnableSlice
+ // For backward compatibility a default implementation is provided
+ virtual Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val,
+ bool exclusive = true,
+ const bool do_validate = true) {
+ if (pinnable_val == nullptr) {
+ std::string* null_str = nullptr;
+ return GetForUpdate(options, column_family, key, null_str, exclusive,
+ do_validate);
+ } else {
+ auto s = GetForUpdate(options, column_family, key,
+ pinnable_val->GetSelf(), exclusive, do_validate);
+ pinnable_val->PinSelf();
+ return s;
+ }
+ }
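+
+ // A read-modify-write sketch using GetForUpdate() (illustrative only;
+ // `txn` and `read_options` are assumed to exist):
+ //
+ //   std::string value;
+ //   Status s = txn->GetForUpdate(read_options, "key", &value);
+ //   if (s.ok()) {
+ //     value += "-suffix";  // modify while the key is locked/tracked
+ //     s = txn->Put("key", value);
+ //   }
+ //   if (s.ok()) {
+ //     s = txn->Commit();
+ //   }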
+
+ // Get a range lock on [start_endpoint; end_endpoint].
+ virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&,
+ const Endpoint&) {
+ return Status::NotSupported();
+ }
+
+ virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
+ std::string* value, bool exclusive = true,
+ const bool do_validate = true) = 0;
+
+ virtual std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+ virtual std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options, const std::vector<Slice>& keys,
+ std::vector<std::string>* values) = 0;
+
+ // Returns an iterator that will iterate on all keys in the default
+ // column family including both keys in the DB and uncommitted keys in this
+ // transaction.
+ //
+ // Setting read_options.snapshot will affect what is read from the
+ // DB but will NOT change which keys are read from this transaction (the keys
+ // in this transaction do not yet belong to any snapshot and will be fetched
+ // regardless).
+ //
+ // Caller is responsible for deleting the returned Iterator.
+ //
+ // The returned iterator is only valid until Commit(), Rollback(), or
+ // RollbackToSavePoint() is called.
+ virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
+
+ virtual Iterator* GetIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) = 0;
+
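+ // Iteration sketch (illustrative only): the returned iterator merges the
+ // DB contents with this transaction's uncommitted writes.
+ //
+ //   std::unique_ptr<Iterator> it(txn->GetIterator(read_options));
+ //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //     // it->key()/it->value() may come from the DB or from this txn
+ //   }
+ //   // The iterator becomes invalid once Commit(), Rollback(), or
+ //   // RollbackToSavePoint() is called.
+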
+ // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
+ // functions in WriteBatch, but will also do conflict checking on the
+ // keys being written.
+ //
+ // assume_tracked=true expects the key to be already tracked. More
+ // specifically, it means the key was previously tracked in the same
+ // savepoint, with the same exclusive flag, and at a lower sequence number.
+ // If valid, ValidateSnapshot is skipped; otherwise an error is returned.
+ //
+ // If this Transaction was created on an OptimisticTransactionDB, these
+ // functions should always return Status::OK().
+ //
+ // If this Transaction was created on a TransactionDB, the status returned
+ // can be:
+ // Status::OK() on success,
+ // Status::Busy() if there is a write conflict,
+ // Status::TimedOut() if a lock could not be acquired,
+ // Status::TryAgain() if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain)
+ // or other errors on unexpected failures.
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) = 0;
+ virtual Status Put(const Slice& key, const Slice& value) = 0;
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value,
+ const bool assume_tracked = false) = 0;
+ virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
+
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value,
+ const bool assume_tracked = false) = 0;
+ virtual Status Merge(const Slice& key, const Slice& value) = 0;
+
+ virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status Delete(const Slice& key) = 0;
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status Delete(const SliceParts& key) = 0;
+
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status SingleDelete(const Slice& key) = 0;
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status SingleDelete(const SliceParts& key) = 0;
+
+ // PutUntracked() will write a Put to the batch of operations to be committed
+ // in this transaction. This write will only happen if this transaction
+ // gets committed successfully. But unlike Transaction::Put(),
+ // no conflict checking will be done for this key.
+ //
+ // If this Transaction was created on a PessimisticTransactionDB, this
+ // function will still acquire locks necessary to make sure this write doesn't
+ // cause conflicts in other transactions and may return Status::Busy().
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) = 0;
+ virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const SliceParts& value) = 0;
+ virtual Status PutUntracked(const SliceParts& key,
+ const SliceParts& value) = 0;
+
+ virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) = 0;
+ virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
+
+ virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ virtual Status DeleteUntracked(const Slice& key) = 0;
+ virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) = 0;
+ virtual Status DeleteUntracked(const SliceParts& key) = 0;
+ virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ virtual Status SingleDeleteUntracked(const Slice& key) = 0;
+
+ // Similar to WriteBatch::PutLogData
+ virtual void PutLogData(const Slice& blob) = 0;
+
+ // By default, all Put/Merge/Delete operations will be indexed in the
+ // transaction so that Get/GetForUpdate/GetIterator can search for these
+ // keys.
+ //
+ // If the caller does not want to fetch the keys about to be written,
+ // they may want to avoid indexing as a performance optimization.
+ // Calling DisableIndexing() will turn off indexing for all future
+ // Put/Merge/Delete operations until EnableIndexing() is called.
+ //
+ // If a key is Put/Merge/Deleted after DisableIndexing is called and then
+ // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
+ // undefined.
+ virtual void DisableIndexing() = 0;
+ virtual void EnableIndexing() = 0;
+
+ // Returns the number of distinct Keys being tracked by this transaction.
+ // If this transaction was created by a TransactionDB, this is the number of
+ // keys that are currently locked by this transaction.
+ // If this transaction was created by an OptimisticTransactionDB, this is the
+ // number of keys that need to be checked for conflicts at commit time.
+ virtual uint64_t GetNumKeys() const = 0;
+
+ // Returns the number of Puts/Deletes/Merges that have been applied to this
+ // transaction so far.
+ virtual uint64_t GetNumPuts() const = 0;
+ virtual uint64_t GetNumDeletes() const = 0;
+ virtual uint64_t GetNumMerges() const = 0;
+
+ // Returns the elapsed time in milliseconds since this Transaction began.
+ virtual uint64_t GetElapsedTime() const = 0;
+
+ // Fetch the underlying write batch that contains all pending changes to be
+ // committed.
+ //
+ // Note: You should not write or delete anything from the batch directly and
+ // should only use the functions in the Transaction class to
+ // write to this transaction.
+ virtual WriteBatchWithIndex* GetWriteBatch() = 0;
+
+ // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
+ // this transaction.
+ // Has no effect on OptimisticTransactions.
+ virtual void SetLockTimeout(int64_t timeout) = 0;
+
+ // Return the WriteOptions that will be used during Commit()
+ virtual WriteOptions* GetWriteOptions() = 0;
+
+ // Reset the WriteOptions that will be used during Commit().
+ virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
+
+ // If this key was previously fetched in this transaction using
+ // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate() will tell
+ // the transaction that it no longer needs to do any conflict checking
+ // for this key.
+ //
+ // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
+ // then UndoGetForUpdate will only have an effect if it is also called N
+ // times. If this key has been written to in this transaction,
+ // UndoGetForUpdate() will have no effect.
+ //
+ // If SetSavePoint() has been called after the GetForUpdate(),
+ // UndoGetForUpdate() will not have any effect.
+ //
+ // If this Transaction was created by an OptimisticTransactionDB,
+ // calling UndoGetForUpdate can affect whether this key is conflict checked
+ // at commit time.
+ // If this Transaction was created by a TransactionDB,
+ // calling UndoGetForUpdate may release any held locks for this key.
+ virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual void UndoGetForUpdate(const Slice& key) = 0;
+
+ virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
+
+ // Note: data in the commit-time-write-batch bypasses concurrency control,
+ // thus should be used with great caution.
+ // For write-prepared/write-unprepared transactions,
+ // GetCommitTimeWriteBatch() can be used only if the transaction is started
+ // with
+ // `TransactionOptions::use_only_the_last_commit_time_batch_for_recovery` set
+ // to true. Otherwise, it is possible that two uncommitted versions of the
+ // same key exist in the database due to the current implementation (see the
+ // explanation in WritePreparedTxn::CommitInternal).
+ // During bottommost compaction, RocksDB may
+ // set the sequence numbers of both to zero once they become committed,
+ // causing the output SST file to have two identical internal keys.
+ virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
+
+ virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
+
+ virtual uint64_t GetLogNumber() const { return log_number_; }
+
+ virtual Status SetName(const TransactionName& name) = 0;
+
+ virtual TransactionName GetName() const { return name_; }
+
+ virtual TransactionID GetID() const { return 0; }
+
+ virtual bool IsDeadlockDetect() const { return false; }
+
+ virtual std::vector<TransactionID> GetWaitingTxns(
+ uint32_t* /*column_family_id*/, std::string* /*key*/) const {
+ assert(false);
+ return std::vector<TransactionID>();
+ }
+
+ enum TransactionState {
+ STARTED = 0,
+ AWAITING_PREPARE = 1,
+ PREPARED = 2,
+ AWAITING_COMMIT = 3,
+ COMMITTED = 4,
+ COMMITED = COMMITTED, // old misspelled name
+ AWAITING_ROLLBACK = 5,
+ ROLLEDBACK = 6,
+ LOCKS_STOLEN = 7,
+ };
+
+ TransactionState GetState() const { return txn_state_; }
+ void SetState(TransactionState state) { txn_state_ = state; }
+
+ // NOTE: Experimental feature
+ // The globally unique id with which the transaction is identified. This id
+ // might or might not be set depending on the implementation. Similarly the
+ // implementation decides the point in lifetime of a transaction at which it
+ // assigns the id. Although currently it is the case, the id is not guaranteed
+ // to remain the same across restarts.
+ uint64_t GetId() { return id_; }
+
+ virtual Status SetReadTimestampForValidation(TxnTimestamp /*ts*/) {
+ return Status::NotSupported("timestamp not supported");
+ }
+
+ virtual Status SetCommitTimestamp(TxnTimestamp /*ts*/) {
+ return Status::NotSupported("timestamp not supported");
+ }
+
+ virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; }
+
+ protected:
+ explicit Transaction(const TransactionDB* /*db*/) {}
+ Transaction() : log_number_(0), txn_state_(STARTED) {}
+
+ // the log in which the prepared section for this txn resides
+ // (for two phase commit)
+ uint64_t log_number_;
+ TransactionName name_;
+
+ // Execution status of the transaction.
+ std::atomic<TransactionState> txn_state_;
+
+ uint64_t id_ = 0;
+ virtual void SetId(uint64_t id) {
+ assert(id_ == 0);
+ id_ = id;
+ }
+
+ virtual uint64_t GetLastLogNumber() const { return log_number_; }
+
+ private:
+ friend class PessimisticTransactionDB;
+ friend class WriteUnpreparedTxnDB;
+ friend class TransactionTest_TwoPhaseLogRollingTest_Test;
+ friend class TransactionTest_TwoPhaseLogRollingTest2_Test;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
new file mode 100644
index 000000000..741c59574
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
@@ -0,0 +1,508 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+
+// Database with Transaction support.
+//
+// See transaction.h and examples/transaction_example.cc
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutexFactory;
+
+enum TxnDBWritePolicy {
+ WRITE_COMMITTED = 0, // write only the committed data
+ WRITE_PREPARED, // write data after the prepare phase of 2pc
+ WRITE_UNPREPARED // write data before the prepare phase of 2pc
+};
+
+constexpr uint32_t kInitialMaxDeadlocks = 5;
+
+class LockManager;
+struct RangeLockInfo;
+
+// A lock manager handle
+// The workflow is as follows:
+// * Use a factory method (like NewRangeLockManager()) to create a lock
+// manager and get its handle.
+// * A Handle for a particular kind of lock manager will have extra
+// methods and parameters to control the lock manager
+// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It
+// will be used to perform locking.
+class LockManagerHandle {
+ public:
+ // PessimisticTransactionDB will call this to get the Lock Manager it's going
+ // to use.
+ virtual LockManager* getLockManager() = 0;
+
+ virtual ~LockManagerHandle() {}
+};
+
+// Same as class Endpoint, but uses std::string to manage the buffer allocation
+struct EndpointWithString {
+ std::string slice;
+ bool inf_suffix;
+};
+
+struct RangeDeadlockInfo {
+ TransactionID m_txn_id;
+ uint32_t m_cf_id;
+ bool m_exclusive;
+
+ EndpointWithString m_start;
+ EndpointWithString m_end;
+};
+
+struct RangeDeadlockPath {
+ std::vector<RangeDeadlockInfo> path;
+ bool limit_exceeded;
+ int64_t deadlock_time;
+
+ explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry,
+ const int64_t& dl_time)
+ : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
+
+ // Constructor for an empty path or a limit-exceeded marker; also serves
+ // as the default constructor.
+ explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+ : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+ bool empty() { return path.empty() && !limit_exceeded; }
+};
+
+// A handle to control RangeLockManager (Range-based lock manager) from outside
+// RocksDB
+class RangeLockManagerHandle : public LockManagerHandle {
+ public:
+ // Set total amount of lock memory to use.
+ //
+ // @return 0 Ok
+ // @return EDOM Failed to set because currently using more memory than
+ // specified
+ virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
+ virtual size_t GetMaxLockMemory() = 0;
+
+ using RangeLockStatus =
+ std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
+
+ // Lock Escalation barrier check function.
+ // It is called for a pair of endpoints A and B, such that A < B.
+ // If escalation_barrier_check_func(A, B)==true, then there's a lock
+ // escalation barrier between A and B, and lock escalation is not allowed
+ // to bridge the gap between A and B.
+ //
+ // The function may be called from any thread that acquires or releases
+ // locks. It should not throw exceptions. There is currently no way to return
+ // an error.
+ using EscalationBarrierFunc =
+ std::function<bool(const Endpoint& a, const Endpoint& b)>;
+
+ // Set the user-provided barrier check function
+ virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0;
+
+ virtual RangeLockStatus GetRangeLockStatusData() = 0;
+
+ class Counters {
+ public:
+ // Number of times lock escalation was triggered (for all column families)
+ uint64_t escalation_count;
+
+ // Number of times lock acquisition had to wait for a conflicting lock
+ // to be released. This counts both successful waits (where the desired
+ // lock was acquired) and waits that timed out or failed with another error.
+ uint64_t lock_wait_count;
+
+ // How much memory is currently used for locks (total for all column
+ // families)
+ uint64_t current_lock_memory;
+ };
+
+ // Get the current counter values
+ virtual Counters GetStatus() = 0;
+
+ // Functions for range-based Deadlock reporting.
+ virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0;
+ virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+ virtual ~RangeLockManagerHandle() {}
+};
+
+// A factory function to create a Range Lock Manager. The created object should
+// be:
+// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in
+// range-locking mode
+// 2. Used to control the lock manager when the DB is already open.
+RangeLockManagerHandle* NewRangeLockManager(
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
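+
+// Usage sketch (illustrative only; passing nullptr for the mutex factory is
+// an assumption meaning "use the default"):
+//
+//   std::shared_ptr<RangeLockManagerHandle> range_lock_mgr(
+//       NewRangeLockManager(nullptr));
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.lock_mgr_handle = range_lock_mgr;
+//   // ... then open the DB with txn_db_options (see TransactionDB::Open).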
+
+struct TransactionDBOptions {
+ // Specifies the maximum number of keys that can be locked at the same time
+ // per column family.
+ // If the number of locked keys is greater than max_num_locks, transaction
+ // writes (or GetForUpdate) will return an error.
+ // If this value is not positive, no limit will be enforced.
+ int64_t max_num_locks = -1;
+
+ // Stores the number of latest deadlocks to track
+ uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
+
+ // Increasing this value will increase the concurrency by dividing the lock
+ // table (per column family) into more sub-tables, each with their own
+ // separate mutex.
+ size_t num_stripes = 16;
+
+ // If positive, specifies the default wait timeout in milliseconds when
+ // a transaction attempts to lock a key if not specified by
+ // TransactionOptions::lock_timeout.
+ //
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, there is no timeout. Not using a timeout is not recommended
+ // as it can lead to deadlocks. Currently, there is no deadlock-detection to
+ // recover from a deadlock.
+ int64_t transaction_lock_timeout = 1000; // 1 second
+
+ // If positive, specifies the wait timeout in milliseconds when writing a key
+ // OUTSIDE of a transaction (i.e. by calling DB::Put(), Merge(), Delete(), or
+ // Write() directly).
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, there is no timeout and the write will block indefinitely
+ // when acquiring a lock.
+ //
+ // Not using a timeout can lead to deadlocks. Currently, there
+ // is no deadlock-detection to recover from a deadlock. While DB writes
+ // cannot deadlock with other DB writes, they can deadlock with a transaction.
+ // A negative timeout should only be used if all transactions have a small
+ // expiration set.
+ int64_t default_lock_timeout = 1000; // 1 second
+
+ // If set, the TransactionDB will use this implementation of a mutex and
+ // condition variable for all transaction locking instead of the default
+ // mutex/condvar implementation.
+ std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
+
+ // The policy for when to write the data into the DB. The default policy is to
+ // write only the committed data (WRITE_COMMITTED). Alternatively, the data
+ // could be written before the commit phase, in which case the DB needs to
+ // provide mechanisms to tell committed and uncommitted data apart.
+ TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+
+ // TODO(myabandeh): remove this option
+ // Note: this is a temporary option introduced as a hot fix for the rollback
+ // of write-prepared txns in MyRocks. MyRocks uses merge operands for the
+ // auto-increment column id without obtaining locks, which breaks the
+ // assumption behind the rollback logic. The hack of simply not rolling back
+ // merge operands works for the special way that MyRocks uses these operands.
+ bool rollback_merge_operands = false;
+
+ // nullptr means use default lock manager.
+ // Other value means the user provides a custom lock manager.
+ std::shared_ptr<LockManagerHandle> lock_mgr_handle;
+
+ // If true, the TransactionDB implementation might skip concurrency control
+ // unless it is overridden by TransactionOptions or
+ // TransactionDBWriteOptimizations. This can be used in conjunction with
+ // DBOptions::unordered_write when the TransactionDB is used solely for write
+ // ordering rather than concurrency control.
+ bool skip_concurrency_control = false;
+
+ // This option is only valid for write unprepared. If a write batch exceeds
+ // this threshold, then the transaction will implicitly flush the currently
+ // pending writes into the database. A value of 0 or less means no limit.
+ int64_t default_write_batch_flush_threshold = 0;
+
+ // This option is valid only for write-prepared/write-unprepared. Transaction
+ // will rely on this callback to determine if a key should be rolled back
+ // with Delete or SingleDelete when necessary. If the callback returns true,
+ // then SingleDelete should be used. If the callback is not callable or the
+ // callback returns false, then a Delete is used.
+ // The application should ensure thread-safety of this callback.
+ // The callback should not throw because RocksDB is not exception-safe.
+ // The callback may be removed if we allow mixing Delete and SingleDelete in
+ // the future.
+ std::function<bool(TransactionDB* /*db*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/)>
+ rollback_deletion_type_callback;
+
+ private:
+ // 128 entries
+ // Should the default value change, please also update wp_snapshot_cache_bits
+ // in db_stress_gflags.cc
+ size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
+ // 8M entries, 64MB size
+ // Should the default value change, please also update wp_commit_cache_bits
+ // in db_stress_gflags.cc
+ size_t wp_commit_cache_bits = static_cast<size_t>(23);
+
+ // For testing, whether transaction name should be auto-generated or not. This
+ // is useful for write unprepared which requires named transactions.
+ bool autogenerate_name = false;
+
+ friend class WritePreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+ friend class WritePreparedTransactionTestBase;
+ friend class TransactionTestBase;
+ friend class MySQLStyleTransactionTest;
+ friend class StressTest;
+};
+
+struct TransactionOptions {
+ // Setting set_snapshot=true is the same as calling
+ // Transaction::SetSnapshot().
+ bool set_snapshot = false;
+
+ // Setting to true means that before acquiring locks, this transaction will
+ // check if doing so will cause a deadlock. If so, it will return with
+ // Status::Busy. The user should retry their transaction.
+ bool deadlock_detect = false;
+
+ // If set, it states that the CommitTimeWriteBatch represents the latest state
+ // of the application, has only one sub-batch, i.e., no duplicate keys, and
+ // is meant to be used later during recovery. It enables an optimization to
+ // postpone updating the memtable with CommitTimeWriteBatch to only
+ // SwitchMemtable or recovery.
+ // This option does not affect write-committed. Only
+ // write-prepared/write-unprepared transactions will be affected.
+ bool use_only_the_last_commit_time_batch_for_recovery = false;
+
+ // TODO(agiardullo): TransactionDB does not yet support comparators that allow
+ // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only
+ // return 0 if
+ // a.compare(b) returns 0.
+
+ // If positive, specifies the wait timeout in milliseconds when
+ // a transaction attempts to lock a key.
+ //
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
+ int64_t lock_timeout = -1;
+
+ // Expiration duration in milliseconds. If non-negative, transactions that
+ // last longer than this many milliseconds will fail to commit. If not set,
+ // a forgotten transaction that is never committed, rolled back, or deleted
+ // will never relinquish any locks it holds. This could prevent keys from
+ // being written by other writers.
+ int64_t expiration = -1;
+
+ // The number of traversals to make during deadlock detection.
+ int64_t deadlock_detect_depth = 50;
+
+ // The maximum number of bytes used for the write batch. 0 means no limit.
+ size_t max_write_batch_size = 0;
+
+ // Skip concurrency control. This could be used as an optimization if the
+ // application knows that the transaction would not have any conflict with
+ // concurrent transactions. It could also be used during recovery if (i) the
+ // application guarantees no conflict between prepared transactions in the
+ // WAL, and (ii) the application guarantees that recovered transactions will
+ // be rolled back or committed before new transactions start.
+ // Default: false
+ bool skip_concurrency_control = false;
+
+ // In a pessimistic transaction, if this is true, you can skip Prepare
+ // before Commit; otherwise, you must Prepare before Commit.
+ bool skip_prepare = true;
+
+ // See TransactionDBOptions::default_write_batch_flush_threshold for
+ // description. If a negative value is specified, then the default value from
+ // TransactionDBOptions is used.
+ int64_t write_batch_flush_threshold = -1;
+};
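+
+// Configuration sketch (illustrative only; `txn_db` is an assumed
+// TransactionDB*): a transaction that takes a snapshot at start and checks
+// for deadlocks before waiting on locks.
+//
+//   TransactionOptions txn_options;
+//   txn_options.set_snapshot = true;
+//   txn_options.deadlock_detect = true;
+//   txn_options.lock_timeout = 500;  // milliseconds
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions(), txn_options);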
+
+// The per-write optimizations that do not involve transactions. TransactionDB
+// implementation might or might not make use of the specified optimizations.
+struct TransactionDBWriteOptimizations {
+ // If true, the application guarantees that the key set in the write batch
+ // does not conflict with any concurrent transaction and hence the
+ // concurrency control mechanism can be skipped for this write.
+ bool skip_concurrency_control = false;
+ // If true, the application guarantees that there is no duplicate <column
+ // family, key> in the write batch and any employed mechanism to handle
+ // duplicate keys could be skipped.
+ bool skip_duplicate_key_check = false;
+};
+
+struct KeyLockInfo {
+ std::string key;
+ std::vector<TransactionID> ids;
+ bool exclusive;
+};
+
+struct RangeLockInfo {
+ EndpointWithString start;
+ EndpointWithString end;
+ std::vector<TransactionID> ids;
+ bool exclusive;
+};
+
+struct DeadlockInfo {
+ TransactionID m_txn_id;
+ uint32_t m_cf_id;
+ bool m_exclusive;
+ std::string m_waiting_key;
+};
+
+struct DeadlockPath {
+ std::vector<DeadlockInfo> path;
+ bool limit_exceeded;
+ int64_t deadlock_time;
+
+ explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
+ const int64_t& dl_time)
+ : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
+
+ // Constructor for an empty path or a limit-exceeded marker; also serves
+ // as the default constructor.
+ explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+ : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+ bool empty() { return path.empty() && !limit_exceeded; }
+};
+
+class TransactionDB : public StackableDB {
+ public:
+ // Optimized version of ::Write that receives more optimization requests,
+ // such as skip_concurrency_control.
+ using StackableDB::Write;
+ virtual Status Write(const WriteOptions& opts,
+ const TransactionDBWriteOptimizations&,
+ WriteBatch* updates) {
+ // The default implementation ignores TransactionDBWriteOptimizations and
+ // falls back to the un-optimized version of ::Write
+ return Write(opts, updates);
+ }
+ // Transactional `DeleteRange()` is not yet supported.
+ // However, users who know their deleted range does not conflict with
+ // anything can still use it via the `Write()` API. In all cases, the
+ // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
+ // used and `skip_concurrency_control` must be set. When using either
+ // WRITE_PREPARED or WRITE_UNPREPARED, `skip_duplicate_key_check` must
+ // additionally be set.
+ using StackableDB::DeleteRange;
+ virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
+ const Slice&, const Slice&) override {
+ return Status::NotSupported();
+ }
+ // Open a TransactionDB similar to DB::Open().
+ // Internally call PrepareWrap() and WrapDB()
+ // If the return status is not ok, then dbptr is set to nullptr.
+ static Status Open(const Options& options,
+ const TransactionDBOptions& txn_db_options,
+ const std::string& dbname, TransactionDB** dbptr);
+
+ static Status Open(const DBOptions& db_options,
+ const TransactionDBOptions& txn_db_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ TransactionDB** dbptr);
+ // Note: PrepareWrap() may change its parameters; make copies before the
+ // invocation if needed.
+ static void PrepareWrap(DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* column_families,
+ std::vector<size_t>* compaction_enabled_cf_indices);
+ // If the return status is not ok, then dbptr will be set to nullptr. The
+ // input db parameter might or might not be deleted as a result of the
+ // failure. If it is properly deleted it will be set to nullptr. If the return
+ // status is ok, the ownership of db is transferred to dbptr.
+ static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles,
+ TransactionDB** dbptr);
+ // If the return status is not ok, then dbptr will be set to nullptr. The
+ // input db parameter might or might not be deleted as a result of the
+ // failure. If it is properly deleted it will be set to nullptr. If the return
+ // status is ok, the ownership of db is transferred to dbptr.
+ static Status WrapStackableDB(
+ StackableDB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
+ // Since the destructor in StackableDB is virtual, this destructor is virtual
+ // too. The root db will be deleted by the base's destructor.
+ ~TransactionDB() override {}
+
+ // Starts a new Transaction.
+ //
+ // Caller is responsible for deleting the returned transaction when no
+ // longer needed.
+ //
+ // If old_txn is not null, BeginTransaction will reuse this Transaction
+ // handle instead of allocating a new one. This is an optimization to avoid
+ // extra allocations when repeatedly creating transactions.
+ virtual Transaction* BeginTransaction(
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options = TransactionOptions(),
+ Transaction* old_txn = nullptr) = 0;
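+
+ // End-to-end sketch (illustrative only; `kDBPath`, `options`, and
+ // `txn_db_options` are assumed; error handling omitted):
+ //
+ //   TransactionDB* txn_db;
+ //   Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db);
+ //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+ //   s = txn->Put("key", "value");
+ //   s = txn->Commit();
+ //   delete txn;
+ //   delete txn_db;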
+
+ virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
+ virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
+
+ // Returns set of all locks held.
+ //
+ // The mapping is column family id -> KeyLockInfo
+ virtual std::unordered_multimap<uint32_t, KeyLockInfo>
+ GetLockStatusData() = 0;
+
+ virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+ virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+ // Create a snapshot and assign ts to it. Return the snapshot to the caller.
+ // The snapshot-timestamp mapping is also tracked by the database.
+ // Caller must ensure there are no active writes when this API is called.
+ virtual std::pair<Status, std::shared_ptr<const Snapshot>>
+ CreateTimestampedSnapshot(TxnTimestamp ts) = 0;
+
+ // Return the latest timestamped snapshot if present.
+ std::shared_ptr<const Snapshot> GetLatestTimestampedSnapshot() const {
+ return GetTimestampedSnapshot(kMaxTxnTimestamp);
+ }
+ // Return the snapshot corresponding to the given timestamp. If ts is
+ // kMaxTxnTimestamp, then we return the latest timestamped snapshot if
+ // present. Otherwise, we return the snapshot whose timestamp is equal to
+ // `ts`. If no such snapshot exists, then we return null.
+ virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+ TxnTimestamp ts) const = 0;
+ // Release timestamped snapshots whose timestamps are less than or equal to
+ // ts.
+ virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0;
+
+ // Get all timestamped snapshots which will be stored in
+ // timestamped_snapshots.
+ Status GetAllTimestampedSnapshots(
+ std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+ const {
+ return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp,
+ timestamped_snapshots);
+ }
+
+ // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub).
+ // timestamped_snapshots will be cleared and contain returned snapshots.
+ virtual Status GetTimestampedSnapshots(
+ TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+ const = 0;
+
+ protected:
+ // To create a TransactionDB, call Open().
+ // The ownership of db is transferred to the base StackableDB
+ explicit TransactionDB(DB* db) : StackableDB(db) {}
+ // No copying allowed
+ TransactionDB(const TransactionDB&) = delete;
+ void operator=(const TransactionDB&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 000000000..e352f325a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The TransactionDBMutex and TransactionDBCondVar APIs allow applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutex {
+ public:
+ virtual ~TransactionDBMutex() {}
+
+ // Attempt to acquire lock. Return OK on success, or other Status on failure.
+ // If returned status is OK, TransactionDB will eventually call UnLock().
+ virtual Status Lock() = 0;
+
+ // Attempt to acquire lock. If timeout is non-negative, the operation may
+ // fail after this many microseconds.
+ // Returns OK on success,
+ // TimedOut if timed out,
+ // or other Status on failure.
+ // If returned status is OK, TransactionDB will eventually call UnLock().
+ virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+ // Unlock a mutex that was successfully locked by Lock() or TryLockFor().
+ virtual void UnLock() = 0;
+};
+
+class TransactionDBCondVar {
+ public:
+ virtual ~TransactionDBCondVar() {}
+
+ // Block current thread until condition variable is notified by a call to
+ // Notify() or NotifyAll(). Wait() will be called with mutex locked.
+ // Returns OK if notified.
+ // Returns non-OK if TransactionDB should stop waiting and fail the operation.
+ // May return OK spuriously even if not notified.
+ virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+ // Block current thread until condition variable is notified by a call to
+ // Notify() or NotifyAll(), or if the timeout is reached.
+ // Wait() will be called with mutex locked.
+ //
+ // If timeout is non-negative, the operation should fail after this many
+ // microseconds.
+ // If implementing a custom version of this class, the implementation may
+ // choose to ignore the timeout.
+ //
+ // Returns OK if notified.
+ // Returns TimedOut if timeout is reached.
+ // Returns other status if TransactionDB should otherwise stop waiting and
+ // fail the operation.
+ // May return OK spuriously even if not notified.
+ virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+ int64_t timeout_time) = 0;
+
+ // If any threads are waiting on *this, unblock at least one of the
+ // waiting threads.
+ virtual void Notify() = 0;
+
+ // Unblocks all threads waiting on *this.
+ virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+class TransactionDBMutexFactory {
+ public:
+ // Create a TransactionDBMutex object.
+ virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+ // Create a TransactionDBCondVar object.
+ virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+ virtual ~TransactionDBMutexFactory() {}
+};
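+
+// A minimal sketch of a custom factory built on the C++ standard library
+// (illustrative only; the class names are ours, and <chrono>,
+// <condition_variable>, and <mutex> are assumed to be included):
+//
+//   class StdTxnMutex : public TransactionDBMutex {
+//    public:
+//     Status Lock() override {
+//       mutex_.lock();
+//       return Status::OK();
+//     }
+//     Status TryLockFor(int64_t timeout_us) override {
+//       if (timeout_us < 0) return Lock();  // negative timeout: wait forever
+//       return mutex_.try_lock_for(std::chrono::microseconds(timeout_us))
+//                  ? Status::OK()
+//                  : Status::TimedOut();
+//     }
+//     void UnLock() override { mutex_.unlock(); }
+//
+//    private:
+//     friend class StdTxnCondVar;
+//     std::timed_mutex mutex_;
+//   };
+//
+//   class StdTxnCondVar : public TransactionDBCondVar {
+//    public:
+//     Status Wait(std::shared_ptr<TransactionDBMutex> mutex) override {
+//       // std::condition_variable_any can wait on any BasicLockable
+//       cv_.wait(static_cast<StdTxnMutex*>(mutex.get())->mutex_);
+//       return Status::OK();
+//     }
+//     Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+//                    int64_t timeout_us) override {
+//       auto& m = static_cast<StdTxnMutex*>(mutex.get())->mutex_;
+//       if (timeout_us < 0) {
+//         cv_.wait(m);
+//         return Status::OK();
+//       }
+//       return cv_.wait_for(m, std::chrono::microseconds(timeout_us)) ==
+//                      std::cv_status::timeout
+//                  ? Status::TimedOut()
+//                  : Status::OK();
+//     }
+//     void Notify() override { cv_.notify_one(); }
+//     void NotifyAll() override { cv_.notify_all(); }
+//
+//    private:
+//     std::condition_variable_any cv_;
+//   };
+//
+//   class StdTxnMutexFactory : public TransactionDBMutexFactory {
+//    public:
+//     std::shared_ptr<TransactionDBMutex> AllocateMutex() override {
+//       return std::make_shared<StdTxnMutex>();
+//     }
+//     std::shared_ptr<TransactionDBCondVar> AllocateCondVar() override {
+//       return std::make_shared<StdTxnCondVar>();
+//     }
+//   };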
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 000000000..84dc11a31
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,309 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+class ReadCallback;
+struct ReadOptions;
+struct DBOptions;
+
+enum WriteType {
+ kPutRecord,
+ kMergeRecord,
+ kDeleteRecord,
+ kSingleDeleteRecord,
+ kDeleteRangeRecord,
+ kLogDataRecord,
+ kXIDRecord,
+ kUnknownRecord,
+};
+
+// An entry for a Put, Merge, Delete, or SingleDelete operation in a write
+// batch.
+// Used in WBWIIterator.
+struct WriteEntry {
+ WriteType type = kUnknownRecord;
+ Slice key;
+ Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+ virtual ~WBWIIterator() {}
+
+ virtual bool Valid() const = 0;
+
+ virtual void SeekToFirst() = 0;
+
+ virtual void SeekToLast() = 0;
+
+ virtual void Seek(const Slice& key) = 0;
+
+ virtual void SeekForPrev(const Slice& key) = 0;
+
+ virtual void Next() = 0;
+
+ virtual void Prev() = 0;
+
+ // The returned WriteEntry is only valid until the next mutation of the
+ // WriteBatchWithIndex.
+ virtual WriteEntry Entry() const = 0;
+
+ virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge(), Delete(), or SingleDelete(), the same function of the
+// wrapped WriteBatch will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+ // backup_index_comparator: the backup comparator used to compare keys
+ // within the same column family. If a column family is not given in the
+ // interface, or we can't find a column family from the column family handle
+ // passed in, backup_index_comparator will be used for the column family.
+ // reserved_bytes: reserved bytes in underlying WriteBatch
+ // max_bytes: maximum size of underlying WriteBatch in bytes
+ // overwrite_key: if true, overwrite the key in the index when inserting
+ // the same key as previously, so the iterator will never
+ // show two entries with the same key.
+ explicit WriteBatchWithIndex(
+ const Comparator* backup_index_comparator = BytewiseComparator(),
+ size_t reserved_bytes = 0, bool overwrite_key = false,
+ size_t max_bytes = 0, size_t protection_bytes_per_key = 0);
+
+ ~WriteBatchWithIndex() override;
+ WriteBatchWithIndex(WriteBatchWithIndex&&);
+ WriteBatchWithIndex& operator=(WriteBatchWithIndex&&);
+
+ using WriteBatchBase::Put;
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ Status Put(const Slice& key, const Slice& value) override;
+
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) override;
+
+ Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ if (!column_family) {
+ return Status::InvalidArgument(
+ "Cannot call this method without a column family handle");
+ }
+
+ return Status::NotSupported(
+ "PutEntity not supported by WriteBatchWithIndex");
+ }
+
+ using WriteBatchBase::Merge;
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ Status Merge(const Slice& key, const Slice& value) override;
+ Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*ts*/, const Slice& /*value*/) override {
+ return Status::NotSupported(
+ "Merge does not support user-defined timestamp");
+ }
+
+ using WriteBatchBase::Delete;
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+ Status Delete(const Slice& key) override;
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ using WriteBatchBase::SingleDelete;
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const Slice& key) override;
+ Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ using WriteBatchBase::DeleteRange;
+ Status DeleteRange(ColumnFamilyHandle* /* column_family */,
+ const Slice& /* begin_key */,
+ const Slice& /* end_key */) override {
+ return Status::NotSupported(
+ "DeleteRange unsupported in WriteBatchWithIndex");
+ }
+ Status DeleteRange(const Slice& /* begin_key */,
+ const Slice& /* end_key */) override {
+ return Status::NotSupported(
+ "DeleteRange unsupported in WriteBatchWithIndex");
+ }
+ Status DeleteRange(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*begin_key*/, const Slice& /*end_key*/,
+ const Slice& /*ts*/) override {
+ return Status::NotSupported(
+ "DeleteRange unsupported in WriteBatchWithIndex");
+ }
+
+ using WriteBatchBase::PutLogData;
+ Status PutLogData(const Slice& blob) override;
+
+ using WriteBatchBase::Clear;
+ void Clear() override;
+
+ using WriteBatchBase::GetWriteBatch;
+ WriteBatch* GetWriteBatch() override;
+
+ // Create an iterator of a column family. User can call iterator.Seek() to
+ // search to the next entry of or after a key. Keys will be iterated in the
+ // order given by index_comparator. For multiple updates on the same key,
+ // each update will be returned as a separate entry, in the order of update
+ // time.
+ //
+ // The returned iterator should be deleted by the caller.
+ WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+ // Create an iterator of the default column family.
+ WBWIIterator* NewIterator();
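+
+ // Usage sketch (illustrative only): multiple updates on one key appear as
+ // separate index entries, in update order.
+ //
+ //   WriteBatchWithIndex wbwi;
+ //   wbwi.Put("a", "1");
+ //   wbwi.Merge("a", "2");
+ //   std::unique_ptr<WBWIIterator> it(wbwi.NewIterator());
+ //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ //     WriteEntry e = it->Entry();  // kPutRecord "a"="1", then kMergeRecord
+ //   }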
+
+ // Will create a new Iterator that will use WBWIIterator as a delta and
+ // base_iterator as base.
+ //
+ // This function is only supported if the WriteBatchWithIndex was
+ // constructed with overwrite_key=true.
+ //
+ // The returned iterator should be deleted by the caller.
+ // The base_iterator is now 'owned' by the returned iterator. Deleting the
+ // returned iterator will also delete the base_iterator.
+ //
+ // Updating the write batch with the current key of the iterator is not safe.
+ // We strongly recommend users not to do it. It will invalidate the current
+ // key() and value() of the iterator. This invalidation happens even before
+ // the write batch update finishes. The state may recover after Next() is
+ // called.
+ Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
+ Iterator* base_iterator,
+ const ReadOptions* opts = nullptr);
+ // default column family
+ Iterator* NewIteratorWithBase(Iterator* base_iterator);
+
+ // Similar to DB::Get() but will only read the key from this batch.
+ // If the batch does not have enough data to resolve Merge operations,
+ // MergeInProgress status may be returned.
+ Status GetFromBatch(ColumnFamilyHandle* column_family,
+ const DBOptions& options, const Slice& key,
+ std::string* value);
+
+ // Similar to previous function but does not require a column_family.
+ // Note: An InvalidArgument status will be returned if there are any Merge
+ // operators for this key. Use the previous method instead.
+ Status GetFromBatch(const DBOptions& options, const Slice& key,
+ std::string* value) {
+ return GetFromBatch(nullptr, options, key, value);
+ }
+
+ // Similar to DB::Get() but will also read writes from this batch.
+ //
+ // This function will query both this batch and the DB and then merge
+ // the results using the DB's merge operator (if the batch contains any
+ // merge requests).
+ //
+ // Setting read_options.snapshot will affect what is read from the DB
+ // but will NOT change which keys are read from the batch (the keys in
+ // this batch do not yet belong to any snapshot and will be fetched
+ // regardless).
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ const Slice& key, std::string* value);
+
+ // An overload of the above method that receives a PinnableSlice
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ const Slice& key, PinnableSlice* value);
+
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value);
+
+ // An overload of the above method that receives a PinnableSlice
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value);
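+
+ // Read-through sketch (illustrative only; `db` is an assumed DB* opened
+ // with a merge operator, and `wbwi` a WriteBatchWithIndex):
+ //
+ //   wbwi.Merge("counter", "1");  // operand only in the batch so far
+ //   std::string value;
+ //   Status s = wbwi.GetFromBatchAndDB(db, ReadOptions(), "counter", &value);
+ //   // `value` is db's value merged with the batched operand via the
+ //   // db's merge operator.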
+
+ void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ bool sorted_input);
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ void SetSavePoint() override;
+
+ // Removes all entries in this batch (Put, Merge, Delete, SingleDelete,
+ // PutLogData) since the most recent call to SetSavePoint(), and removes
+ // the most recent save point.
+ // If there is no previous call to SetSavePoint(), this behaves the same
+ // as Clear().
+ //
+ // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+ //
+ // Returns Status::OK() on success,
+ // Status::NotFound() if no previous call to SetSavePoint(),
+ // or other Status on corruption.
+ Status RollbackToSavePoint() override;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status PopSavePoint() override;
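+
+ // For illustration, a minimal save point sketch (assuming the default
+ // column family):
+ //
+ // batch.Put("a", "1");
+ // batch.SetSavePoint();
+ // batch.Put("b", "2");
+ // batch.RollbackToSavePoint(); // removes "b"; "a" remains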
+
+ void SetMaxBytes(size_t max_bytes) override;
+ size_t GetDataSize() const;
+
+ private:
+ friend class PessimisticTransactionDB;
+ friend class WritePreparedTxn;
+ friend class WriteUnpreparedTxn;
+ friend class WriteBatchWithIndex_SubBatchCnt_Test;
+ friend class WriteBatchWithIndexInternal;
+ // Returns the number of sub-batches inside the write batch. A sub-batch
+ // starts right before inserting a key that is a duplicate of a key in the
+ // last sub-batch.
+ size_t SubBatchCnt();
+
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, ReadCallback* callback);
+ void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ bool sorted_input, ReadCallback* callback);
+ struct Rep;
+ std::unique_ptr<Rep> rep;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
new file mode 100644
index 000000000..c54f3a2c3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// NOTE: in 'main' development branch, this should be the *next*
+// minor or major version number planned for release.
+#define ROCKSDB_MAJOR 7
+#define ROCKSDB_MINOR 9
+#define ROCKSDB_PATCH 2
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate these
+// at some point
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a set of properties indicating how/when/where this version of RocksDB
+// was created.
+const std::unordered_map<std::string, std::string>& GetRocksBuildProperties();
+
+// Returns the current version of RocksDB as a string (e.g. "6.16.0").
+// If with_patch is true, the patch version is included ("6.16.x");
+// otherwise, only the major and minor versions are included ("6.16").
+std::string GetRocksVersionAsString(bool with_patch = true);
+
+// Gets the set of build properties (@see GetRocksBuildProperties) into a
+// string. Properties are returned one per line, with the first line being:
+// "<program> from RocksDB <version>".
+// If verbose is true, the full set of properties is printed. If verbose is
+// false, only the version information (@see GetRocksVersionAsString) is
+// printed.
+std::string GetRocksBuildInfoAsString(const std::string& program,
+ bool verbose = false);
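+
+// For illustration (a sketch; the exact strings depend on the build, and
+// "my_tool" is a hypothetical program name):
+//
+// std::string v = GetRocksVersionAsString(); // e.g. "7.9.2"
+// std::string info = GetRocksBuildInfoAsString("my_tool");
+// // first line of `info`: "my_tool from RocksDB 7.9.2"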
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h
new file mode 100644
index 000000000..3e66c39e4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wal_filter.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBatch;
+struct ConfigOptions;
+
+// WalFilter allows an application to inspect write-ahead-log (WAL)
+// records during recovery, or to modify how they are processed.
+// Please see the details below.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class WalFilter : public Customizable {
+ public:
+ static const char* Type() { return "WalFilter"; }
+ static Status CreateFromString(const ConfigOptions& options,
+ const std::string& value, WalFilter** result);
+ enum class WalProcessingOption {
+ // Continue processing as usual
+ kContinueProcessing = 0,
+ // Ignore the current record but continue processing of log(s)
+ kIgnoreCurrentRecord = 1,
+ // Stop replay of logs and discard logs
+ // Logs won't be replayed on subsequent recovery
+ kStopReplay = 2,
+ // Corrupted record detected by filter
+ kCorruptedRecord = 3,
+ // Marker for enum count
+ kWalProcessingOptionMax = 4
+ };
+
+ virtual ~WalFilter() {}
+
+ // Provide a ColumnFamily->LogNumber map to the filter so that the filter
+ // can determine whether a log number applies to a given column family
+ // (i.e., that the log hasn't already been flushed to an SST file for that
+ // column family).
+ // We also pass in a name->id map, since only names are known during
+ // recovery (handles are opened post-recovery), while write batch callbacks
+ // happen in terms of column family ids.
+ //
+ // @params cf_lognumber_map column_family_id to lognumber map
+ // @params cf_name_id_map column_family_name to column_family_id map
+
+ virtual void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/,
+ const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {}
+
+ // LogRecordFound is invoked for each log record encountered in each log
+ // during replay on recovery. This method can be used to:
+ // * inspect the record (using the batch parameter)
+ // * ignore the current record
+ // (by returning WalProcessingOption::kIgnoreCurrentRecord)
+ // * report a corrupted record
+ // (by returning WalProcessingOption::kCorruptedRecord)
+ // * stop log replay
+ // (by returning WalProcessingOption::kStopReplay) - please note that
+ // this implies discarding the logs from the current record onwards.
+ //
+ // @params log_number log_number of the current log.
+ // Filter might use this to determine if the log
+ // record is applicable to a certain column family.
+ // @params log_file_name log file name - only for informational purposes
+ // @params batch batch encountered in the log during recovery
+ // @params new_batch new_batch to populate if filter wants to change
+ // the batch (for example to filter some records out,
+ // or alter some records).
+ // Please note that the new batch MUST NOT contain
+ // more records than the original, or recovery will
+ // fail.
+ // @params batch_changed Whether the batch was changed by the filter.
+ // It must be set to true if new_batch was populated;
+ // otherwise new_batch has no effect.
+ // @returns Processing option for the current record.
+ // Please see WalProcessingOption enum above for
+ // details.
+ virtual WalProcessingOption LogRecordFound(
+ unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
+ const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+ // Default implementation falls back to older function for compatibility
+ return LogRecord(batch, new_batch, batch_changed);
+ }
+
+ // Please see the comments for LogRecord above. This function is for
+ // compatibility only and contains a subset of parameters.
+ // New code should use the function above.
+ virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const {
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ // Returns a name that identifies this WAL filter.
+ // The name will be printed to LOG file on start up for diagnosis.
+ virtual const char* Name() const override = 0;
+};
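+
+// For illustration, a hedged sketch of a filter that skips every WAL record
+// during recovery (the class name is hypothetical; a real filter would
+// typically inspect the batch first):
+//
+// class SkipAllWalFilter : public WalFilter {
+// public:
+// WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+// WriteBatch* /*new_batch*/,
+// bool* /*batch_changed*/) const override {
+// return WalProcessingOption::kIgnoreCurrentRecord;
+// }
+// const char* Name() const override { return "SkipAllWalFilter"; }
+// };
+//
+// The filter is installed via DBOptions::wal_filter before DB::Open().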
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wide_columns.h b/src/rocksdb/include/rocksdb/wide_columns.h
new file mode 100644
index 000000000..7ddc61f03
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wide_columns.h
@@ -0,0 +1,171 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <ostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Class representing a wide column, which is defined as a pair of column name
+// and column value.
+class WideColumn {
+ public:
+ WideColumn() = default;
+
+ // Initializes a WideColumn object by forwarding the name and value
+ // arguments to the corresponding member Slices. This makes it possible to
+ // construct a WideColumn using combinations of const char*, const
+ // std::string&, const Slice& etc., for example:
+ //
+ // constexpr char foo[] = "foo";
+ // const std::string bar("bar");
+ // WideColumn column(foo, bar);
+ template <typename N, typename V>
+ WideColumn(N&& name, V&& value)
+ : name_(std::forward<N>(name)), value_(std::forward<V>(value)) {}
+
+ // Initializes a WideColumn object by forwarding the elements of
+ // name_tuple and value_tuple to the constructors of the corresponding member
+ // Slices. This makes it possible to initialize the Slices using the Slice
+ // constructors that take more than one argument, for example:
+ //
+ // constexpr char foo_name[] = "foo_name";
+ // constexpr char bar_value[] = "bar_value";
+ // WideColumn column(std::piecewise_construct,
+ // std::forward_as_tuple(foo_name, 3),
+ // std::forward_as_tuple(bar_value, 3));
+ template <typename NTuple, typename VTuple>
+ WideColumn(std::piecewise_construct_t, NTuple&& name_tuple,
+ VTuple&& value_tuple)
+ : name_(std::make_from_tuple<Slice>(std::forward<NTuple>(name_tuple))),
+ value_(std::make_from_tuple<Slice>(std::forward<VTuple>(value_tuple))) {
+ }
+
+ const Slice& name() const { return name_; }
+ const Slice& value() const { return value_; }
+
+ Slice& name() { return name_; }
+ Slice& value() { return value_; }
+
+ private:
+ Slice name_;
+ Slice value_;
+};
+
+// Note: column names and values are compared bytewise.
+inline bool operator==(const WideColumn& lhs, const WideColumn& rhs) {
+ return lhs.name() == rhs.name() && lhs.value() == rhs.value();
+}
+
+inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) {
+ return !(lhs == rhs);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) {
+ const bool hex =
+ (os.flags() & std::ios_base::basefield) == std::ios_base::hex;
+ os << column.name().ToString(hex) << ':' << column.value().ToString(hex);
+
+ return os;
+}
+
+// A collection of wide columns.
+using WideColumns = std::vector<WideColumn>;
+
+// The anonymous default wide column (an empty Slice).
+extern const Slice kDefaultWideColumnName;
+
+// An empty set of wide columns.
+extern const WideColumns kNoWideColumns;
+
+// A self-contained collection of wide columns. Used for the results of
+// wide-column queries.
+class PinnableWideColumns {
+ public:
+ const WideColumns& columns() const { return columns_; }
+ size_t serialized_size() const { return value_.size(); }
+
+ void SetPlainValue(const Slice& value);
+ void SetPlainValue(const Slice& value, Cleanable* cleanable);
+
+ Status SetWideColumnValue(const Slice& value);
+ Status SetWideColumnValue(const Slice& value, Cleanable* cleanable);
+
+ void Reset();
+
+ private:
+ void CopyValue(const Slice& value);
+ void PinOrCopyValue(const Slice& value, Cleanable* cleanable);
+ void CreateIndexForPlainValue();
+ Status CreateIndexForWideColumns();
+
+ PinnableSlice value_;
+ WideColumns columns_;
+};
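+
+// For illustration, a minimal sketch (assuming `serialized` holds a value in
+// the wide-column serialization format):
+//
+// PinnableWideColumns result;
+// Status s = result.SetWideColumnValue(serialized);
+// if (s.ok()) {
+// for (const WideColumn& column : result.columns()) {
+// // use column.name() and column.value()
+// }
+// }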
+
+inline void PinnableWideColumns::CopyValue(const Slice& value) {
+ value_.PinSelf(value);
+}
+
+inline void PinnableWideColumns::PinOrCopyValue(const Slice& value,
+ Cleanable* cleanable) {
+ if (!cleanable) {
+ CopyValue(value);
+ return;
+ }
+
+ value_.PinSlice(value, cleanable);
+}
+
+inline void PinnableWideColumns::CreateIndexForPlainValue() {
+ columns_ = WideColumns{{kDefaultWideColumnName, value_}};
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value) {
+ CopyValue(value);
+ CreateIndexForPlainValue();
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value,
+ Cleanable* cleanable) {
+ PinOrCopyValue(value, cleanable);
+ CreateIndexForPlainValue();
+}
+
+inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) {
+ CopyValue(value);
+ return CreateIndexForWideColumns();
+}
+
+inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value,
+ Cleanable* cleanable) {
+ PinOrCopyValue(value, cleanable);
+ return CreateIndexForWideColumns();
+}
+
+inline void PinnableWideColumns::Reset() {
+ value_.Reset();
+ columns_.clear();
+}
+
+inline bool operator==(const PinnableWideColumns& lhs,
+ const PinnableWideColumns& rhs) {
+ return lhs.columns() == rhs.columns();
+}
+
+inline bool operator!=(const PinnableWideColumns& lhs,
+ const PinnableWideColumns& rhs) {
+ return !(lhs == rhs);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h
new file mode 100644
index 000000000..61ba5a739
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch.h
@@ -0,0 +1,494 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch. For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+// batch.Put("key", "v1");
+// batch.Delete("key");
+// batch.Put("key", "v2");
+// batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SavePoints;
+struct SliceParts;
+
+struct SavePoint {
+ size_t size; // size of rep_
+ int count; // count of elements in rep_
+ uint32_t content_flags;
+
+ SavePoint() : size(0), count(0), content_flags(0) {}
+
+ SavePoint(size_t _size, int _count, uint32_t _flags)
+ : size(_size), count(_count), content_flags(_flags) {}
+
+ void clear() {
+ size = 0;
+ count = 0;
+ content_flags = 0;
+ }
+
+ bool is_cleared() const { return (size | count | content_flags) == 0; }
+};
+
+class WriteBatch : public WriteBatchBase {
+ public:
+ explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
+ : WriteBatch(reserved_bytes, max_bytes, 0, 0) {}
+
+ // `protection_bytes_per_key` is the number of bytes used to store
+ // protection information for each key entry. Currently supported values are
+ // zero (disabled) and eight.
+ explicit WriteBatch(size_t reserved_bytes, size_t max_bytes,
+ size_t protection_bytes_per_key, size_t default_cf_ts_sz);
+ ~WriteBatch() override;
+
+ using WriteBatchBase::Put;
+ // Store the mapping "key->value" in the database.
+ // The following Put(..., const Slice& key, ...) API can also be used when
+ // user-defined timestamp is enabled as long as `key` points to a contiguous
+ // buffer with timestamp appended after user key. The caller is responsible
+ // for setting up the memory buffer pointed to by `key`.
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status Put(const Slice& key, const Slice& value) override {
+ return Put(nullptr, key, value);
+ }
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) override;
+
+ // Variant of Put() that gathers output like writev(2). The key and value
+ // that will be written to the database are concatenations of arrays of
+ // slices.
+ // The following Put(..., const SliceParts& key, ...) API can be used when
+ // user-defined timestamp is enabled as long as the timestamp is the last
+ // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
+ // for setting up the `key` SliceParts object.
+ Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+ Status Put(const SliceParts& key, const SliceParts& value) override {
+ return Put(nullptr, key, value);
+ }
+
+ // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
+ // column family specified by "column_family".
+ using WriteBatchBase::PutEntity;
+ Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) override;
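+
+ // For illustration, a hedged sketch (assuming `cf` is a valid column
+ // family handle):
+ //
+ // WriteBatch batch;
+ // WideColumns columns{{kDefaultWideColumnName, "v"}, {"attr", "val"}};
+ // Status s = batch.PutEntity(cf, "key", columns);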
+
+ using WriteBatchBase::Delete;
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ // The following Delete(..., const Slice& key) can be used when user-defined
+ // timestamp is enabled as long as `key` points to a contiguous buffer with
+ // timestamp appended after user key. The caller is responsible for setting
+ // up the memory buffer pointed to by `key`.
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+ Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ // variant that takes SliceParts
+ // These two variants of Delete(..., const SliceParts& key) can be used when
+ // user-defined timestamp is enabled as long as the timestamp is the last
+ // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
+ // for setting up the `key` SliceParts object.
+ Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+ Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+ using WriteBatchBase::SingleDelete;
+ // WriteBatch implementation of DB::SingleDelete(). See db.h.
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const Slice& key) override {
+ return SingleDelete(nullptr, key);
+ }
+ Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ // variant that takes SliceParts
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+ Status SingleDelete(const SliceParts& key) override {
+ return SingleDelete(nullptr, key);
+ }
+
+ using WriteBatchBase::DeleteRange;
+ // WriteBatch implementation of DB::DeleteRange(). See db.h.
+ Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key) override;
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key) override {
+ return DeleteRange(nullptr, begin_key, end_key);
+ }
+ // begin_key and end_key should be user keys without timestamp.
+ Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key, const Slice& ts) override;
+
+ // variant that takes SliceParts
+ Status DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) override;
+ Status DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) override {
+ return DeleteRange(nullptr, begin_key, end_key);
+ }
+
+ using WriteBatchBase::Merge;
+ // Merge "value" with the existing value of "key" in the database.
+ // "key->merge(existing, value)"
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status Merge(const Slice& key, const Slice& value) override {
+ return Merge(nullptr, key, value);
+ }
+ Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*ts*/, const Slice& /*value*/) override;
+
+ // variant that takes SliceParts
+ Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+ Status Merge(const SliceParts& key, const SliceParts& value) override {
+ return Merge(nullptr, key, value);
+ }
+
+ using WriteBatchBase::PutLogData;
+ // Append a blob of arbitrary size to the records in this batch. The blob will
+ // be stored in the transaction log but not in any other file. In particular,
+ // it will not be persisted to the SST files. When iterating over this
+ // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+ // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+ // encountered in the same order in which they were inserted. The blob will
+ // NOT consume sequence number(s) and will NOT increase the count of the
+ // batch.
+ //
+ // Example application: add timestamps to the transaction log for use in
+ // replication.
+ Status PutLogData(const Slice& blob) override;
+
+ using WriteBatchBase::Clear;
+ // Clear all updates buffered in this batch.
+ void Clear() override;
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ void SetSavePoint() override;
+
+ // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+ // the most recent call to SetSavePoint(), and removes the most recent save
+ // point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status RollbackToSavePoint() override;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status PopSavePoint() override;
+
+ // Support for iterating over the contents of a batch.
+ // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
+ class Handler {
+ public:
+ virtual ~Handler();
+ // All handler functions in this class provide default implementations so
+ // we won't break existing clients of Handler on a source code level when
+ // adding a new member function.
+
+ // The default implementation just calls Put without a column family, for
+ // backwards compatibility. If the column family is not the default,
+ // the function is a no-op.
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) {
+ // Put() historically doesn't return a status. We didn't want to be
+ // backwards incompatible, so we didn't change the return status
+ // (this is a public API). We forward to the old-style Put() and
+ // return Status::OK().
+ Put(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and PutCF not implemented");
+ }
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status PutEntityCF(uint32_t /* column_family_id */,
+ const Slice& /* key */,
+ const Slice& /* entity */) {
+ return Status::NotSupported("PutEntityCF not implemented");
+ }
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+ if (column_family_id == 0) {
+ Delete(key);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and DeleteCF not implemented");
+ }
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual void Delete(const Slice& /*key*/) {}
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+ if (column_family_id == 0) {
+ SingleDelete(key);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and SingleDeleteCF not implemented");
+ }
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual void SingleDelete(const Slice& /*key*/) {}
+
+ // If user-defined timestamp is enabled, then `begin_key` and `end_key`
+ // both include timestamp.
+ virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
+ const Slice& /*begin_key*/,
+ const Slice& /*end_key*/) {
+ return Status::InvalidArgument("DeleteRangeCF not implemented");
+ }
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) {
+ Merge(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and MergeCF not implemented");
+ }
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+ virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/,
+ const Slice& /*value*/) {
+ return Status::InvalidArgument("PutBlobIndexCF not implemented");
+ }
+
+ // The default implementation of LogData does nothing.
+ virtual void LogData(const Slice& blob);
+
+ virtual Status MarkBeginPrepare(bool = false) {
+ return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
+ }
+
+ virtual Status MarkEndPrepare(const Slice& /*xid*/) {
+ return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
+ }
+
+ virtual Status MarkNoop(bool /*empty_batch*/) {
+ return Status::InvalidArgument("MarkNoop() handler not defined.");
+ }
+
+ virtual Status MarkRollback(const Slice& /*xid*/) {
+ return Status::InvalidArgument(
+ "MarkRollbackPrepare() handler not defined.");
+ }
+
+ virtual Status MarkCommit(const Slice& /*xid*/) {
+ return Status::InvalidArgument("MarkCommit() handler not defined.");
+ }
+
+ virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/,
+ const Slice& /*commit_ts*/) {
+ return Status::InvalidArgument(
+ "MarkCommitWithTimestamp() handler not defined.");
+ }
+
+ // Continue is called by WriteBatch::Iterate. If it returns false,
+ // iteration is halted. Otherwise, it continues iterating. The default
+ // implementation always returns true.
+ virtual bool Continue();
+
+ protected:
+ friend class WriteBatchInternal;
+ enum class OptionState {
+ kUnknown,
+ kDisabled,
+ kEnabled,
+ };
+ virtual OptionState WriteAfterCommit() const {
+ return OptionState::kUnknown;
+ }
+ virtual OptionState WriteBeforePrepare() const {
+ return OptionState::kUnknown;
+ }
+ };
+ Status Iterate(Handler* handler) const;
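+
+ // For illustration, a minimal sketch of a Handler that counts default
+ // column family puts and deletes while iterating a batch (a hedged
+ // example; real handlers usually override the *CF variants):
+ //
+ // class CountingHandler : public WriteBatch::Handler {
+ // public:
+ // int puts = 0;
+ // int deletes = 0;
+ // void Put(const Slice& /*key*/, const Slice& /*value*/) override {
+ // ++puts;
+ // }
+ // void Delete(const Slice& /*key*/) override { ++deletes; }
+ // };
+ //
+ // CountingHandler handler;
+ // Status s = batch.Iterate(&handler);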
+
+ // Retrieve the serialized version of this batch.
+ const std::string& Data() const { return rep_; }
+
+ // Retrieve the data size of the batch.
+ size_t GetDataSize() const { return rep_.size(); }
+
+ // Returns the number of updates in the batch
+ uint32_t Count() const;
+
+ // Returns true if PutCF will be called during Iterate
+ bool HasPut() const;
+
+ // Returns true if PutEntityCF will be called during Iterate
+ bool HasPutEntity() const;
+
+ // Returns true if DeleteCF will be called during Iterate
+ bool HasDelete() const;
+
+ // Returns true if SingleDeleteCF will be called during Iterate
+ bool HasSingleDelete() const;
+
+ // Returns true if DeleteRangeCF will be called during Iterate
+ bool HasDeleteRange() const;
+
+ // Returns true if MergeCF will be called during Iterate
+ bool HasMerge() const;
+
+ // Returns true if MarkBeginPrepare will be called during Iterate
+ bool HasBeginPrepare() const;
+
+ // Returns true if MarkEndPrepare will be called during Iterate
+ bool HasEndPrepare() const;
+
+ // Returns true if MarkCommit will be called during Iterate
+ bool HasCommit() const;
+
+ // Returns true if MarkRollback will be called during Iterate
+ bool HasRollback() const;
+
+ // Experimental.
+ //
+ // Update the timestamps of existing entries in the write batch if
+ // applicable. If a key is intended for a column family that disables
+ // timestamps, this API won't set a timestamp for that key.
+ // This requires that all timestamp-enabled keys in the write batch
+ // (possibly from multiple column families) have timestamps of the same
+ // format.
+ //
+ // ts_sz_func: callable object to obtain the timestamp sizes of column
+ // families. If ts_sz_func() accesses data structures, then the caller of
+ // this API must guarantee thread-safety. Like other parts of RocksDB, this
+ // API is not exception-safe, so ts_sz_func() must not throw.
+ //
+ // in: cf, the column family id.
+ // ret: timestamp size of the given column family. Returning
+ // std::numeric_limits<size_t>::max() indicates "don't know or column
+ // family info not found", which causes UpdateTimestamps() to fail.
+ // size_t ts_sz_func(uint32_t cf);
+ Status UpdateTimestamps(const Slice& ts,
+ std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
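+
+ // For illustration, a hedged sketch that assumes every column family in
+ // the batch uses 8-byte timestamps:
+ //
+ // std::string ts(8, '\0'); // encoding is defined by the comparator
+ // Status s = batch.UpdateTimestamps(
+ // ts, [](uint32_t /*cf*/) { return size_t{8}; });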
+
+ // Verify the per-key-value checksums of this write batch.
+ // A Corruption status will be returned if verification fails.
+ // If this write batch does not have per-key-value checksums,
+ // an OK status will be returned.
+ Status VerifyChecksum() const;
+
+ using WriteBatchBase::GetWriteBatch;
+ WriteBatch* GetWriteBatch() override { return this; }
+
+ // Constructor with a serialized string object
+ explicit WriteBatch(const std::string& rep);
+ explicit WriteBatch(std::string&& rep);
+
+ WriteBatch(const WriteBatch& src);
+ WriteBatch(WriteBatch&& src) noexcept;
+ WriteBatch& operator=(const WriteBatch& src);
+ WriteBatch& operator=(WriteBatch&& src);
+
+ // Marks this point in the WriteBatch as the last record to be inserted
+ // into the WAL, provided the WAL is enabled.
+ void MarkWalTerminationPoint();
+ const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
+
+ void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
+
+ struct ProtectionInfo;
+ size_t GetProtectionBytesPerKey() const;
+
+ private:
+ friend class WriteBatchInternal;
+ friend class LocalSavePoint;
+ // TODO(myabandeh): this is needed for a hack to collapse the write batch and
+ // remove duplicate keys. Remove it when the hack is replaced with a proper
+ // solution.
+ friend class WriteBatchWithIndex;
+ std::unique_ptr<SavePoints> save_points_;
+
+ // When sending a WriteBatch through WriteImpl we might want to
+ // specify that only the first x records of the batch be written to
+ // the WAL.
+ SavePoint wal_term_point_;
+
+ // Is the content of the batch the application's latest state that is meant
+ // only to be used for recovery? Refer to
+ // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
+ // more details.
+ bool is_latest_persistent_state_ = false;
+
+ // False if all keys are from column families that disable user-defined
+ // timestamps, OR if UpdateTimestamps() has been called at least once.
+ // This flag is set to true if any of the timestamp-less Put(), Delete(),
+ // SingleDelete(), etc. APIs above are called at least once.
+ // Calling the Put(ts), Delete(ts), SingleDelete(ts), etc. variants will not
+ // set this flag to true, because the assumption is that those APIs have
+ // already set the timestamps to the desired values.
+ bool needs_in_place_update_ts_ = false;
+
+ // True if the write batch contains at least one key from a column family
+ // that enables user-defined timestamp.
+ bool has_key_with_ts_ = false;
+
+ // For HasXYZ. Mutable to allow lazy computation of results
+ mutable std::atomic<uint32_t> content_flags_;
+
+ // Performs deferred computation of content_flags if necessary
+ uint32_t ComputeContentFlags() const;
+
+ // Maximum size of rep_.
+ size_t max_bytes_;
+
+ std::unique_ptr<ProtectionInfo> prot_info_;
+
+ size_t default_cf_ts_sz_ = 0;
+
+ protected:
+ std::string rep_; // See comment in write_batch.cc for the format of rep_
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
new file mode 100644
index 000000000..f6f39ef0b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WriteBatchWithIndex for an
+// indexed implementation.
+class WriteBatchBase {
+ public:
+ virtual ~WriteBatchBase() {}
+
+ // Store the mapping "key->value" in the database.
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Put(const Slice& key, const Slice& value) = 0;
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) = 0;
+
+ // Variant of Put() that gathers output like writev(2). The key and value
+ // that will be written to the database are concatenations of arrays of
+ // slices.
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value);
+ virtual Status Put(const SliceParts& key, const SliceParts& value);
+
+ // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
+ // column family specified by "column_family".
+ virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) = 0;
+
+ // Merge "value" with the existing value of "key" in the database.
+ // "key->merge(existing, value)"
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Merge(const Slice& key, const Slice& value) = 0;
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) = 0;
+
+ // variant that takes SliceParts
+ virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value);
+ virtual Status Merge(const SliceParts& key, const SliceParts& value);
+
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status Delete(const Slice& key) = 0;
+ virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) = 0;
+
+ // variant that takes SliceParts
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key);
+ virtual Status Delete(const SliceParts& key);
+
+ // If the database contains a mapping for "key", erase it. Expects that the
+ // key was not overwritten. Else do nothing.
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status SingleDelete(const Slice& key) = 0;
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) = 0;
+
+ // variant that takes SliceParts
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key);
+ virtual Status SingleDelete(const SliceParts& key);
+
+ // If the database contains mappings in the range ["begin_key", "end_key"),
+ // erase them. Else do nothing.
+ virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) = 0;
+ virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0;
+ virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) = 0;
+
+ // variant that takes SliceParts
+ virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+ virtual Status DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ // Append a blob of arbitrary size to the records in this batch. The blob will
+ // be stored in the transaction log but not in any other file. In particular,
+ // it will not be persisted to the SST files. When iterating over this
+ // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+ // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+ // encountered in the same order in which they were inserted. The blob will
+ // NOT consume sequence number(s) and will NOT increase the count of the
+ // batch.
+ //
+ // Example application: add timestamps to the transaction log for use in
+ // replication.
+ virtual Status PutLogData(const Slice& blob) = 0;
+
+ // Clear all updates buffered in this batch.
+ virtual void Clear() = 0;
+
+ // Convert this batch into a WriteBatch. This is an abstracted way of
+ // converting any WriteBatchBase (e.g. WriteBatchWithIndex) into a basic
+ // WriteBatch.
+ virtual WriteBatch* GetWriteBatch() = 0;
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ virtual void SetSavePoint() = 0;
+
+ // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+ // the most recent call to SetSavePoint(), and removes the most recent save
+ // point.
+ // If there is no previous call to SetSavePoint(), behaves the same as
+ // Clear().
+ virtual Status RollbackToSavePoint() = 0;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ virtual Status PopSavePoint() = 0;
+
+ // Sets the maximum size of the write batch in bytes. 0 means no limit.
+ virtual void SetMaxBytes(size_t max_bytes) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h
new file mode 100644
index 000000000..7fb18196d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBufferManager is for managing memory allocation for one or more
+// MemTables.
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <list>
+#include <mutex>
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CacheReservationManager;
+
+// Interface to block and signal DB instances, intended for RocksDB
+// internal use only. Each DB instance contains a pointer to a
+// StallInterface.
+class StallInterface {
+ public:
+ virtual ~StallInterface() {}
+
+ virtual void Block() = 0;
+
+ virtual void Signal() = 0;
+};
+
+class WriteBufferManager final {
+ public:
+ // Parameters:
+ // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be
+ // capped, memory_usage() won't be valid, and ShouldFlush() will always
+ // return false.
+ //
+ // cache_: if `cache` is provided, we'll put dummy entries in the cache and
+ // charge the memory allocated to the cache. It can be used even if
+ // _buffer_size = 0.
+ //
+ // allow_stall: if true, enables stalling of writes when memory_usage()
+ // exceeds buffer_size. Stalled writes wait for flushes to complete and
+ // memory usage to drop.
+ explicit WriteBufferManager(size_t _buffer_size,
+ std::shared_ptr<Cache> cache = {},
+ bool allow_stall = false);
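+
+ // For illustration, a hedged sketch of sharing a 1 GiB memtable budget
+ // across DBs, charged to a block cache:
+ //
+ // std::shared_ptr<Cache> cache = NewLRUCache(1 << 30);
+ // auto wbm = std::make_shared<WriteBufferManager>(1ull << 30, cache);
+ // Options options;
+ // options.write_buffer_manager = wbm; // set on every DB sharing the budget
+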
+ // No copying allowed
+ WriteBufferManager(const WriteBufferManager&) = delete;
+ WriteBufferManager& operator=(const WriteBufferManager&) = delete;
+
+ ~WriteBufferManager();
+
+ // Returns true if a non-zero _buffer_size was passed to limit the total
+ // memory usage.
+ bool enabled() const { return buffer_size() > 0; }
+
+ // Returns true if a cache pointer was passed to the constructor.
+ bool cost_to_cache() const { return cache_res_mgr_ != nullptr; }
+
+ // Returns the total memory used by memtables.
+ // Only valid if enabled().
+ size_t memory_usage() const {
+ return memory_used_.load(std::memory_order_relaxed);
+ }
+
+ // Returns the total memory used by active memtables.
+ size_t mutable_memtable_memory_usage() const {
+ return memory_active_.load(std::memory_order_relaxed);
+ }
+
+ size_t dummy_entries_in_cache_usage() const;
+
+ // Returns the buffer_size.
+ size_t buffer_size() const {
+ return buffer_size_.load(std::memory_order_relaxed);
+ }
+
+ void SetBufferSize(size_t new_size) {
+ buffer_size_.store(new_size, std::memory_order_relaxed);
+ mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed);
+ // Check if stall is active and can be ended.
+ MaybeEndWriteStall();
+ }
+
+ // The functions below should only be called by RocksDB internally.
+
+ // Should only be called from the write thread.
+ bool ShouldFlush() const {
+ if (enabled()) {
+ if (mutable_memtable_memory_usage() >
+ mutable_limit_.load(std::memory_order_relaxed)) {
+ return true;
+ }
+ size_t local_size = buffer_size();
+ if (memory_usage() >= local_size &&
+ mutable_memtable_memory_usage() >= local_size / 2) {
+ // If memory usage exceeds the buffer size, we trigger a more aggressive
+ // flush. But if more than half of the memory is already being flushed,
+ // triggering more flushes may not help; we hold off instead.
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Returns true if total memory usage has exceeded buffer_size.
+ // We stall writes until memory_usage drops below buffer_size. When this
+ // function returns true, all writer threads (including the one checking
+ // this condition) across all DBs will be stalled. Stalling is allowed only
+ // if the user passed allow_stall = true when creating the
+ // WriteBufferManager instance.
+ //
+ // Should only be called by RocksDB internally.
+ bool ShouldStall() const {
+ if (!allow_stall_ || !enabled()) {
+ return false;
+ }
+
+ return IsStallActive() || IsStallThresholdExceeded();
+ }
+
+ // Returns true if stall is active.
+ bool IsStallActive() const {
+ return stall_active_.load(std::memory_order_relaxed);
+ }
+
+ // Returns true if stalling condition is met.
+ bool IsStallThresholdExceeded() const {
+ return memory_usage() >= buffer_size_;
+ }
+
+ void ReserveMem(size_t mem);
+
+ // We are in the process of freeing `mem` bytes, so it is not considered
+ // when checking the soft limit.
+ void ScheduleFreeMem(size_t mem);
+
+ void FreeMem(size_t mem);
+
+ // Add the DB instance to the queue and block the DB.
+ // Should only be called by RocksDB internally.
+ void BeginWriteStall(StallInterface* wbm_stall);
+
+ // If stall conditions have resolved, remove DB instances from queue and
+ // signal them to continue.
+ void MaybeEndWriteStall();
+
+ void RemoveDBFromQueue(StallInterface* wbm_stall);
+
+ private:
+ std::atomic<size_t> buffer_size_;
+ std::atomic<size_t> mutable_limit_;
+ std::atomic<size_t> memory_used_;
+ // Memory that hasn't been scheduled to be freed.
+ std::atomic<size_t> memory_active_;
+ std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+ // Protects cache_res_mgr_
+ std::mutex cache_res_mgr_mu_;
+
+ std::list<StallInterface*> queue_;
+ // Protects the queue_ and stall_active_.
+ std::mutex mu_;
+ bool allow_stall_;
+ // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall()
+ // while holding mu_, but it can be read without a lock.
+ std::atomic<bool> stall_active_;
+
+ void ReserveMemWithCache(size_t mem);
+ void FreeMemWithCache(size_t mem);
+};
+} // namespace ROCKSDB_NAMESPACE