summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/include/rocksdb
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/include/rocksdb')
-rw-r--r--src/rocksdb/include/rocksdb/advanced_options.h731
-rw-r--r--src/rocksdb/include/rocksdb/c.h1801
-rw-r--r--src/rocksdb/include/rocksdb/cache.h278
-rw-r--r--src/rocksdb/include/rocksdb/cleanable.h71
-rw-r--r--src/rocksdb/include/rocksdb/compaction_filter.h212
-rw-r--r--src/rocksdb/include/rocksdb/compaction_job_stats.h96
-rw-r--r--src/rocksdb/include/rocksdb/comparator.h122
-rw-r--r--src/rocksdb/include/rocksdb/concurrent_task_limiter.h46
-rw-r--r--src/rocksdb/include/rocksdb/convenience.h351
-rw-r--r--src/rocksdb/include/rocksdb/db.h1525
-rw-r--r--src/rocksdb/include/rocksdb/db_bench_tool.h11
-rw-r--r--src/rocksdb/include/rocksdb/db_dump_tool.h45
-rw-r--r--src/rocksdb/include/rocksdb/db_stress_tool.h11
-rw-r--r--src/rocksdb/include/rocksdb/env.h1589
-rw-r--r--src/rocksdb/include/rocksdb/env_encryption.h206
-rw-r--r--src/rocksdb/include/rocksdb/experimental.h29
-rw-r--r--src/rocksdb/include/rocksdb/file_checksum.h86
-rw-r--r--src/rocksdb/include/rocksdb/file_system.h1358
-rw-r--r--src/rocksdb/include/rocksdb/filter_policy.h200
-rw-r--r--src/rocksdb/include/rocksdb/flush_block_policy.h61
-rw-r--r--src/rocksdb/include/rocksdb/io_status.h232
-rw-r--r--src/rocksdb/include/rocksdb/iostats_context.h56
-rw-r--r--src/rocksdb/include/rocksdb/iterator.h119
-rw-r--r--src/rocksdb/include/rocksdb/ldb_tool.h43
-rw-r--r--src/rocksdb/include/rocksdb/listener.h491
-rw-r--r--src/rocksdb/include/rocksdb/memory_allocator.h77
-rw-r--r--src/rocksdb/include/rocksdb/memtablerep.h385
-rw-r--r--src/rocksdb/include/rocksdb/merge_operator.h257
-rw-r--r--src/rocksdb/include/rocksdb/metadata.h151
-rw-r--r--src/rocksdb/include/rocksdb/options.h1587
-rw-r--r--src/rocksdb/include/rocksdb/perf_context.h232
-rw-r--r--src/rocksdb/include/rocksdb/perf_level.h35
-rw-r--r--src/rocksdb/include/rocksdb/persistent_cache.h67
-rw-r--r--src/rocksdb/include/rocksdb/rate_limiter.h139
-rw-r--r--src/rocksdb/include/rocksdb/rocksdb_namespace.h10
-rw-r--r--src/rocksdb/include/rocksdb/slice.h269
-rw-r--r--src/rocksdb/include/rocksdb/slice_transform.h103
-rw-r--r--src/rocksdb/include/rocksdb/snapshot.h48
-rw-r--r--src/rocksdb/include/rocksdb/sst_dump_tool.h19
-rw-r--r--src/rocksdb/include/rocksdb/sst_file_manager.h132
-rw-r--r--src/rocksdb/include/rocksdb/sst_file_reader.h47
-rw-r--r--src/rocksdb/include/rocksdb/sst_file_writer.h139
-rw-r--r--src/rocksdb/include/rocksdb/statistics.h548
-rw-r--r--src/rocksdb/include/rocksdb/stats_history.h69
-rw-r--r--src/rocksdb/include/rocksdb/status.h386
-rw-r--r--src/rocksdb/include/rocksdb/table.h607
-rw-r--r--src/rocksdb/include/rocksdb/table_properties.h250
-rw-r--r--src/rocksdb/include/rocksdb/thread_status.h188
-rw-r--r--src/rocksdb/include/rocksdb/threadpool.h58
-rw-r--r--src/rocksdb/include/rocksdb/trace_reader_writer.h48
-rw-r--r--src/rocksdb/include/rocksdb/transaction_log.h121
-rw-r--r--src/rocksdb/include/rocksdb/types.h54
-rw-r--r--src/rocksdb/include/rocksdb/universal_compaction.h86
-rw-r--r--src/rocksdb/include/rocksdb/utilities/backupable_db.h341
-rw-r--r--src/rocksdb/include/rocksdb/utilities/checkpoint.h57
-rw-r--r--src/rocksdb/include/rocksdb/utilities/convenience.h10
-rw-r--r--src/rocksdb/include/rocksdb/utilities/db_ttl.h72
-rw-r--r--src/rocksdb/include/rocksdb/utilities/debug.h49
-rw-r--r--src/rocksdb/include/rocksdb/utilities/env_librados.h175
-rw-r--r--src/rocksdb/include/rocksdb/utilities/env_mirror.h180
-rw-r--r--src/rocksdb/include/rocksdb/utilities/info_log_finder.h19
-rw-r--r--src/rocksdb/include/rocksdb/utilities/ldb_cmd.h277
-rw-r--r--src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h71
-rw-r--r--src/rocksdb/include/rocksdb/utilities/leveldb_options.h146
-rw-r--r--src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h43
-rw-r--r--src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h55
-rw-r--r--src/rocksdb/include/rocksdb/utilities/memory_util.h50
-rw-r--r--src/rocksdb/include/rocksdb/utilities/object_registry.h205
-rw-r--r--src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h98
-rw-r--r--src/rocksdb/include/rocksdb/utilities/option_change_migration.h19
-rw-r--r--src/rocksdb/include/rocksdb/utilities/options_util.h102
-rw-r--r--src/rocksdb/include/rocksdb/utilities/sim_cache.h94
-rw-r--r--src/rocksdb/include/rocksdb/utilities/stackable_db.h465
-rw-r--r--src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h74
-rw-r--r--src/rocksdb/include/rocksdb/utilities/transaction.h540
-rw-r--r--src/rocksdb/include/rocksdb/utilities/transaction_db.h309
-rw-r--r--src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h92
-rw-r--r--src/rocksdb/include/rocksdb/utilities/utility_db.h34
-rw-r--r--src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h278
-rw-r--r--src/rocksdb/include/rocksdb/version.h16
-rw-r--r--src/rocksdb/include/rocksdb/wal_filter.h102
-rw-r--r--src/rocksdb/include/rocksdb/write_batch.h377
-rw-r--r--src/rocksdb/include/rocksdb/write_batch_base.h127
-rw-r--r--src/rocksdb/include/rocksdb/write_buffer_manager.h102
84 files changed, 20461 insertions, 0 deletions
diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h
new file mode 100644
index 000000000..a72edbe05
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/advanced_options.h
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+enum CompressionType : unsigned char;
+class TablePropertiesCollectorFactory;
+class TableFactory;
+struct Options;
+
+enum CompactionStyle : char { // Available compaction styles.
+ // Level-based compaction style.
+ kCompactionStyleLevel = 0x0,
+ // Universal compaction style.
+ // Not supported in ROCKSDB_LITE.
+ kCompactionStyleUniversal = 0x1,
+ // FIFO compaction style.
+ // Not supported in ROCKSDB_LITE.
+ kCompactionStyleFIFO = 0x2,
+ // Disables background compaction. Compaction jobs are submitted
+ // via CompactFiles().
+ // Not supported in ROCKSDB_LITE.
+ kCompactionStyleNone = 0x3,
+};
+
+// In level-based compaction, determines which file from a level is picked
+// to be merged into the next level. We suggest trying
+// kMinOverlappingRatio first when you tune your database.
+enum CompactionPri : char {
+ // Slightly prioritize larger files by size compensated by #deletes.
+ kByCompensatedSize = 0x0,
+ // First compact files whose data's latest update time is oldest.
+ // Try this if you only update some hot keys in small ranges.
+ kOldestLargestSeqFirst = 0x1,
+ // First compact files whose range hasn't been compacted to the next level
+ // for the longest. If your updates are random across the key space,
+ // write amplification is slightly better with this option.
+ kOldestSmallestSeqFirst = 0x2,
+ // First compact files whose ratio between overlapping size in the next
+ // level and their own size is the smallest. In many cases this can
+ // optimize write amplification.
+ kMinOverlappingRatio = 0x3,
+};
+
+struct CompactionOptionsFIFO { // Options for the FIFO compaction style.
+ // Once the total size of table files reaches this value, the oldest
+ // table file is deleted.
+ // Default: 1GB
+ uint64_t max_table_files_size;
+
+ // If true, try to compact smaller files into larger ones.
+ // Minimum files to compact follows options.level0_file_num_compaction_trigger
+ // and compaction won't trigger if average compact bytes per del file is
+ // larger than options.write_buffer_size. This is to protect large files
+ // from being compacted again.
+ // Default: false
+ bool allow_compaction = false;
+
+ CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+ CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
+ : max_table_files_size(_max_table_files_size),
+ allow_compaction(_allow_compaction) {}
+};
+
+// Compression options for different compression algorithms like Zlib.
+struct CompressionOptions {
+ // RocksDB's generic default compression level. Internally it'll be translated
+ // to the default compression level specific to the library being used (see
+ // comment above `ColumnFamilyOptions::compression`).
+ //
+ // The default value is the max 16-bit int as it'll be written out in OPTIONS
+ // file, which should be portable.
+ const static int kDefaultCompressionLevel = 32767;
+
+ int window_bits;
+ int level;
+ int strategy;
+
+ // Maximum size of dictionaries used to prime the compression library.
+ // Enabling a dictionary can improve compression ratios when there are
+ // repetitions across data blocks.
+ //
+ // The dictionary is created by sampling the SST file data. If
+ // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
+ // dictionary generator. Otherwise, the random samples are used directly as
+ // the dictionary.
+ //
+ // When the compression dictionary is disabled, we compress and write each
+ // block before buffering data for the next one. When the dictionary is
+ // enabled, we buffer all SST file data in-memory so we can sample it, as data
+ // can only be compressed and written after the dictionary has been finalized.
+ // So users of this feature may see increased memory usage.
+ //
+ // Default: 0.
+ uint32_t max_dict_bytes;
+
+ // Maximum size of training data passed to zstd's dictionary trainer. Using
+ // zstd's dictionary trainer can achieve even better compression ratio
+ // improvements than using `max_dict_bytes` alone.
+ //
+ // The training data will be used to generate a dictionary of max_dict_bytes.
+ //
+ // Default: 0.
+ uint32_t zstd_max_train_bytes;
+
+ // When the compression options are set by the user, this is set to "true".
+ // For bottommost_compression_opts, to enable it, the user must set
+ // enabled=true. Otherwise, bottommost compression will use compression_opts
+ // as the default compression options.
+ //
+ // For compression_opts, if compression_opts.enabled=false, it is still
+ // used as the compression options for the compression process.
+ //
+ // Default: false.
+ bool enabled;
+
+ CompressionOptions()
+ : window_bits(-14),
+ level(kDefaultCompressionLevel),
+ strategy(0),
+ max_dict_bytes(0),
+ zstd_max_train_bytes(0),
+ enabled(false) {}
+ CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes,
+ int _zstd_max_train_bytes, bool _enabled) // NOTE(review): int sizes convert implicitly to the uint32_t members
+ : window_bits(wbits),
+ level(_lev),
+ strategy(_strategy),
+ max_dict_bytes(_max_dict_bytes),
+ zstd_max_train_bytes(_zstd_max_train_bytes),
+ enabled(_enabled) {}
+};
+
+enum UpdateStatus { // Return status for the inplace update callback
+ UPDATE_FAILED = 0, // Nothing to update
+ UPDATED_INPLACE = 1, // Value updated in place
+ UPDATED = 2, // No in-place update; merged value set
+};
+
+struct AdvancedColumnFamilyOptions {
+ // The maximum number of write buffers that are built up in memory.
+ // The default and the minimum number is 2, so that when 1 write buffer
+ // is being flushed to storage, new writes can continue to the other
+ // write buffer.
+ // If max_write_buffer_number > 3, writing will be slowed down to
+ // options.delayed_write_rate if we are writing to the last write buffer
+ // allowed.
+ //
+ // Default: 2
+ //
+ // Dynamically changeable through SetOptions() API
+ int max_write_buffer_number = 2;
+
+ // The minimum number of write buffers that will be merged together
+ // before writing to storage. If set to 1, then
+ // all write buffers are flushed to L0 as individual files and this increases
+ // read amplification because a get request has to check in all of these
+ // files. Also, an in-memory merge may result in writing less
+ // data to storage if there are duplicate records in each of these
+ // individual write buffers. Default: 1
+ int min_write_buffer_number_to_merge = 1;
+
+ // DEPRECATED
+ // The total maximum number of write buffers to maintain in memory including
+ // copies of buffers that have already been flushed. Unlike
+ // max_write_buffer_number, this parameter does not affect flushing.
+ // This parameter is being replaced by max_write_buffer_size_to_maintain.
+ // If both parameters are set to non-zero values, this parameter will be
+ // ignored.
+ int max_write_buffer_number_to_maintain = 0;
+
+ // The total maximum size(bytes) of write buffers to maintain in memory
+ // including copies of buffers that have already been flushed. This parameter
+ // only affects trimming of flushed buffers and does not affect flushing.
+ // This controls the maximum amount of write history that will be available
+ // in memory for conflict checking when Transactions are used. The actual
+ // size of write history (flushed Memtables) might be higher than this limit
+ // if further trimming will reduce write history total size below this
+ // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB,
+ // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB.
+ // Because trimming the next Memtable of size 20MB will reduce total memory
+ // usage to 52MB which is below the limit, RocksDB will stop trimming.
+ //
+ // When using an OptimisticTransactionDB:
+ // If this value is too low, some transactions may fail at commit time due
+ // to not being able to determine whether there were any write conflicts.
+ //
+ // When using a TransactionDB:
+ // If Transaction::SetSnapshot is used, TransactionDB will read either
+ // in-memory write buffers or SST files to do write-conflict checking.
+ // Increasing this value can reduce the number of reads to SST files
+ // done for conflict detection.
+ //
+ // Setting this value to 0 will cause write buffers to be freed immediately
+ // after they are flushed. If this value is set to -1,
+ // 'max_write_buffer_number * write_buffer_size' will be used.
+ //
+ // Default:
+ // If using a TransactionDB/OptimisticTransactionDB, the default value will
+ // be set to the value of 'max_write_buffer_number * write_buffer_size'
+ // if it is not explicitly set by the user. Otherwise, the default is 0.
+ int64_t max_write_buffer_size_to_maintain = 0;
+
+ // Allows thread-safe inplace updates. If this is true, there is no way to
+ // achieve point-in-time consistency using snapshot or iterator (assuming
+ // concurrent updates). Hence iterator and multi-get will return results
+ // which are not consistent as of any point-in-time.
+ // If inplace_callback function is not set,
+ // Put(key, new_value) will update inplace the existing_value iff
+ // * key exists in current memtable
+ // * new sizeof(new_value) <= sizeof(existing_value)
+ // * existing_value for that key is a put i.e. kTypeValue
+ // If inplace_callback function is set, check doc for inplace_callback.
+ // Default: false.
+ bool inplace_update_support = false;
+
+ // Number of locks used for inplace update
+ // Default: 10000, if inplace_update_support = true, else 0.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t inplace_update_num_locks = 10000;
+
+ // existing_value - pointer to previous value (from both memtable and sst).
+ // nullptr if key doesn't exist
+ // existing_value_size - pointer to size of existing_value.
+ // nullptr if key doesn't exist
+ // delta_value - Delta value to be merged with the existing_value.
+ // Stored in transaction logs.
+ // merged_value - Set when delta is applied on the previous value.
+
+ // Applicable only when inplace_update_support is true,
+ // this callback function is called at the time of updating the memtable
+ // as part of a Put operation, lets say Put(key, delta_value). It allows the
+ // 'delta_value' specified as part of the Put operation to be merged with
+ // an 'existing_value' of the key in the database.
+
+ // If the merged value is smaller in size than the 'existing_value',
+ // then this function can update the 'existing_value' buffer inplace and
+ // the corresponding 'existing_value_size' pointer, if it wishes to.
+ // The callback should return UpdateStatus::UPDATED_INPLACE in this
+ // case. (In this case, the snapshot-semantics of the rocksdb
+ // Iterator is not atomic anymore).
+
+ // If the merged value is larger in size than the 'existing_value' or the
+ // application does not wish to modify the 'existing_value' buffer inplace,
+ // then the merged value should be returned via *merged_value. It is set by
+ // merging the 'existing_value' and the Put 'delta_value'. The callback should
+ // return UpdateStatus::UPDATED in this case. This merged value will be added
+ // to the memtable.
+
+ // If merging fails or the application does not wish to take any action,
+ // then the callback should return UpdateStatus::UPDATE_FAILED.
+
+ // Please remember that the original call from the application is Put(key,
+ // delta_value). So the transaction log (if enabled) will still contain (key,
+ // delta_value). The 'merged_value' is not stored in the transaction log.
+ // Hence the inplace_callback function should be consistent across db reopens.
+
+ // Default: nullptr
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value) = nullptr;
+
+ // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
+ // create prefix bloom for memtable with the size of
+ // write_buffer_size * memtable_prefix_bloom_size_ratio.
+ // If it is larger than 0.25, it is sanitized to 0.25.
+ //
+ // Default: 0 (disable)
+ //
+ // Dynamically changeable through SetOptions() API
+ double memtable_prefix_bloom_size_ratio = 0.0;
+
+ // Enable whole key bloom filter in memtable. Note this will only take effect
+ // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+ // can potentially reduce CPU usage for point-look-ups.
+ //
+ // Default: false (disable)
+ //
+ // Dynamically changeable through SetOptions() API
+ bool memtable_whole_key_filtering = false;
+
+ // Page size for huge page for the arena used by the memtable. If <=0, it
+ // won't allocate from huge page but from malloc.
+ // Users are responsible to reserve huge pages for it to be allocated. For
+ // example:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ // If there isn't enough free huge page available, it will fall back to
+ // malloc.
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t memtable_huge_page_size = 0;
+
+ // If non-nullptr, memtable will use the specified function to extract
+ // prefixes for keys, and for each prefix maintain a hint of insert location
+ // to reduce CPU usage for inserting keys with the prefix. Keys out of
+ // domain of the prefix extractor will be inserted without using hints.
+ //
+ // Currently only the default skiplist based memtable implements the feature.
+ // All other memtable implementation will ignore the option. It incurs ~250
+ // additional bytes of memory overhead to store a hint for each prefix.
+ // Also concurrent writes (when allow_concurrent_memtable_write is true) will
+ // ignore the option.
+ //
+ // The option is best suited for workloads where keys are likely to be
+ // inserted at a location close to the last inserted key with the same prefix.
+ // One example could be inserting keys of the form (prefix + timestamp),
+ // where keys of the same prefix always come in time order. Another
+ // example would be updating the same key over and over again, in which case
+ // the prefix can be the key itself.
+ //
+ // Default: nullptr (disable)
+ std::shared_ptr<const SliceTransform>
+ memtable_insert_with_hint_prefix_extractor = nullptr;
+
+ // Control locality of bloom filter probes to improve CPU cache hit rate.
+ // This option now only applies to plaintable prefix bloom. This
+ // optimization is turned off when set to 0, and positive number to turn
+ // it on.
+ // Default: 0
+ uint32_t bloom_locality = 0;
+
+ // size of one block in arena memory allocation.
+ // If <= 0, a proper value is automatically calculated (usually 1/8 of
+ // write_buffer_size, rounded up to a multiple of 4KB).
+ //
+ // There are two additional restrictions on the specified size:
+ // (1) size should be in the range of [4096, 2 << 30] and
+ // (2) be the multiple of the CPU word (which helps with the memory
+ // alignment).
+ //
+ // We'll automatically check and adjust the size number to make sure it
+ // conforms to the restrictions.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t arena_block_size = 0;
+
+ // Different levels can have different compression policies. There
+ // are cases where most lower levels would like to use quick compression
+ // algorithms while the higher levels (which have more data) use
+ // compression algorithms that have better compression but could
+ // be slower. This array, if non-empty, should have an entry for
+ // each level of the database; these override the value specified in
+ // the previous field 'compression'.
+ //
+ // NOTICE if level_compaction_dynamic_level_bytes=true,
+ // compression_per_level[0] still determines L0, but other elements
+ // of the array are based on base level (the level L0 files are merged
+ // to), and may not match the level users see from info log for metadata.
+ // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
+ // determines compression type for level n+i-1.
+ // For example, if we have 5 levels, and we determine to merge L0
+ // data to L4 (which means L1..L3 will be empty), then the new files going to
+ // L4 use compression type compression_per_level[1].
+ // If now L0 is merged to L2, data going to L2 will be compressed
+ // according to compression_per_level[1], L3 using compression_per_level[2]
+ // and L4 using compression_per_level[3]. Compaction for each level can
+ // change when data grows.
+ std::vector<CompressionType> compression_per_level;
+
+ // Number of levels for this database
+ int num_levels = 7;
+
+ // Soft limit on number of level-0 files. We start slowing down writes at this
+ // point. A value <0 means that no writing slow down will be triggered by
+ // number of files in level-0.
+ //
+ // Default: 20
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_slowdown_writes_trigger = 20;
+
+ // Maximum number of level-0 files. We stop writes at this point.
+ //
+ // Default: 36
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_stop_writes_trigger = 36;
+
+ // Target file size for compaction.
+ // target_file_size_base is per-file size for level-1.
+ // Target file size for level L can be calculated by
+ // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+ // For example, if target_file_size_base is 2MB and
+ // target_file_size_multiplier is 10, then each file on level-1 will
+ // be 2MB, and each file on level 2 will be 20MB,
+ // and each file on level-3 will be 200MB.
+ //
+ // Default: 64MB.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t target_file_size_base = 64 * 1048576;
+
+ // By default target_file_size_multiplier is 1, which means
+ // by default files in different levels will have similar size.
+ //
+ // Dynamically changeable through SetOptions() API
+ int target_file_size_multiplier = 1;
+
+ // If true, RocksDB will pick target size of each level dynamically.
+ // We will pick a base level b >= 1. L0 will be directly merged into level b,
+ // instead of always into level 1. Level 1 to b-1 need to be empty.
+ // We try to pick b and its target size so that
+ // 1. target size is in the range of
+ // (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+ // max_bytes_for_level_base]
+ // 2. target size of the last level (level num_levels-1) equals to extra size
+ // of the level.
+ // At the same time max_bytes_for_level_multiplier and
+ // max_bytes_for_level_multiplier_additional are still satisfied.
+ // (When L0 is too large, we make some adjustment. See below.)
+ //
+ // With this option on, from an empty DB, we make last level the base level,
+ // which means merging L0 data into the last level, until it exceeds
+ // max_bytes_for_level_base. And then we make the second last level to be
+ // base level, to start to merge L0 data to second last level, with its
+ // target size to be 1/max_bytes_for_level_multiplier of the last level's
+ // extra size. After the data accumulates more so that we need to move the
+ // base level to the third last one, and so on.
+ //
+ // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+ // and max_bytes_for_level_base=10MB.
+ // Target sizes of level 1 to 5 starts with:
+ // [- - - - 10MB]
+ // with the base level being level 5. Target sizes of level 1 to 4 are not applicable
+ // because they will not be used.
+ // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
+ // base target to level 4 and now the targets looks like:
+ // [- - - 1.1MB 11MB]
+ // While data are accumulated, size targets are tuned based on actual data
+ // of level 5. When level 5 has 50MB of data, the target is like:
+ // [- - - 5MB 50MB]
+ // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
+ // level 4 to be the base level, its target size needs to be 10.1MB, which
+ // doesn't satisfy the target size range. So now we make level 3 the base
+ // level and the target sizes of the levels look like:
+ // [- - 1.01MB 10.1MB 101MB]
+ // In the same way, while level 5 further grows, all levels' targets grow,
+ // like
+ // [- - 5MB 50MB 500MB]
+ // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+ // base level and make levels' target sizes like this:
+ // [- 1.001MB 10.01MB 100.1MB 1001MB]
+ // and go on...
+ //
+ // By doing it, we give max_bytes_for_level_multiplier a priority against
+ // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+ // useful to limit worst-case space amplification.
+ //
+ //
+ // If the compaction from L0 lags behind, a special mode will be turned
+ // on to prioritize write amplification against max_bytes_for_level_multiplier
+ // or max_bytes_for_level_base. Whether L0 compaction lags behind is decided
+ // by the number of L0 files and total L0 size. If the number of L0 files is
+ // at least double level0_file_num_compaction_trigger, or the total size is
+ // at least max_bytes_for_level_base, this mode is on. The target of L1 grows
+ // to the actual data size in L0, and then the target for each level is
+ // determined so that each level will have the same level multiplier.
+ //
+ // For example, when L0 size is 100MB, the size of last level is 1600MB,
+ // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
+ // Since L0 size is larger than max_bytes_for_level_base, this is a L0
+ // compaction backlogged mode. So that the L1 size is determined to be 100MB.
+ // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will
+ // be needed. The level multiplier will be calculated to be 4 and the three
+ // levels' target to be [100MB, 400MB, 1600MB].
+ //
+ // In this mode, the number of levels will be no more than in the normal mode,
+ // and the level multiplier will be lower. The write amplification will
+ // likely be reduced.
+ //
+ //
+ // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+ //
+ // Turning this feature on or off for an existing DB can cause unexpected
+ // LSM tree structure so it's not recommended.
+ //
+ // Default: false
+ bool level_compaction_dynamic_level_bytes = false;
+
+ // Default: 10.
+ //
+ // Dynamically changeable through SetOptions() API
+ double max_bytes_for_level_multiplier = 10;
+
+ // Different max-size multipliers for different levels.
+ // These are multiplied by max_bytes_for_level_multiplier to arrive
+ // at the max-size of each level.
+ //
+ // Default: 1
+ //
+ // Dynamically changeable through SetOptions() API
+ std::vector<int> max_bytes_for_level_multiplier_additional =
+ std::vector<int>(num_levels, 1);
+
+ // We try to limit number of bytes in one compaction to be lower than this
+ // threshold. But it's not guaranteed.
+ // Value 0 will be sanitized.
+ //
+ // Default: target_file_size_base * 25
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_compaction_bytes = 0;
+
+ // All writes will be slowed down to at least delayed_write_rate if estimated
+ // bytes needed for compaction exceed this threshold.
+ //
+ // Default: 64GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
+
+ // All writes are stopped if estimated bytes needed for compaction exceed
+ // this threshold.
+ //
+ // Default: 256GB
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
+
+ // The compaction style. Default: kCompactionStyleLevel
+ CompactionStyle compaction_style = kCompactionStyleLevel;
+
+ // If compaction_style = kCompactionStyleLevel, for each level,
+ // which files are prioritized to be picked to compact.
+ // Default: kMinOverlappingRatio
+ CompactionPri compaction_pri = kMinOverlappingRatio;
+
+ // The options needed to support Universal Style compactions
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_universal", "{size_ratio=2;}")
+ CompactionOptionsUniversal compaction_options_universal;
+
+ // The options for FIFO compaction style
+ //
+ // Dynamically changeable through SetOptions() API
+ // Dynamic change example:
+ // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
+ CompactionOptionsFIFO compaction_options_fifo;
+
+ // An iteration->Next() sequentially skips over keys with the same
+ // user-key unless this option is set. This number specifies the number
+ // of keys (with the same userkey) that will be sequentially
+ // skipped before a reseek is issued.
+ //
+ // Default: 8
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_sequential_skip_in_iterations = 8;
+
+ // This is a factory that provides MemTableRep objects.
+ // Default: a factory that provides a skip-list-based implementation of
+ // MemTableRep.
+ std::shared_ptr<MemTableRepFactory> memtable_factory =
+ std::shared_ptr<SkipListFactory>(new SkipListFactory);
+
+ // Block-based table related options are moved to BlockBasedTableOptions.
+ // Related options that were originally here but now moved include:
+ // no_block_cache
+ // block_cache
+ // block_cache_compressed
+ // block_size
+ // block_size_deviation
+ // block_restart_interval
+ // filter_policy
+ // whole_key_filtering
+ // If you'd like to customize some of these options, you will need to
+ // use NewBlockBasedTableFactory() to construct a new table factory.
+
+ // This option allows user to collect their own interested statistics of
+ // the tables.
+ // Default: empty vector -- no user-defined statistics collection will be
+ // performed.
+ typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ TablePropertiesCollectorFactories;
+ TablePropertiesCollectorFactories table_properties_collector_factories;
+
+ // Maximum number of successive merge operations on a key in the memtable.
+ //
+ // When a merge operation is added to the memtable and the maximum number of
+ // successive merges is reached, the value of the key will be calculated and
+ // inserted into the memtable instead of the merge operation. This will
+ // ensure that there are never more than max_successive_merges merge
+ // operations in the memtable.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t max_successive_merges = 0;
+
+ // This flag specifies that the implementation should optimize the filters
+ // mainly for cases where keys are found rather than also optimize for keys
+ // missed. This would be used in cases where the application knows that
+ // there are very few misses or the performance in the case of misses is not
+ // important.
+ //
+ // For now, this flag allows us to not store filters for the last level,
+ // i.e. the largest level that contains data of the LSM store. For keys which
+ // are hits, the filters in this level are not useful because we will search
+ // for the data anyway. NOTE: the filters in other levels are still useful
+ // even for key hit because they tell us whether to look in that level or go
+ // to the higher level.
+ //
+ // Default: false
+ bool optimize_filters_for_hits = false;
+
+ // After writing every SST file, reopen it and read all the keys.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool paranoid_file_checks = false;
+
+ // In debug mode, RocksDB runs consistency checks on the LSM every time the
+ // LSM changes (Flush, Compaction, AddFile). These checks are disabled in
+ // release mode; use this option to enable them in release mode as well.
+ // Default: false
+ bool force_consistency_checks = false;
+
+ // Measure IO stats in compactions and flushes, if true.
+ //
+ // Default: false
+ //
+ // Dynamically changeable through SetOptions() API
+ bool report_bg_io_stats = false;
+
+ // Files older than TTL will go through the compaction process.
+ // Pre-req: This needs max_open_files to be set to -1.
+ // In Level: Non-bottom-level files older than TTL will go through the
+ // compaction process.
+ // In FIFO: Files older than TTL will be deleted.
+ // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+ // In FIFO, this option will have the same meaning as
+ // periodic_compaction_seconds. Whichever stricter will be used.
+ // 0 means disabling.
+ // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to
+ // pick default.
+ //
+ // Default: 30 days for leveled compaction + block based table. disable
+ // otherwise.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t ttl = 0xfffffffffffffffe;
+
+ // Files older than this value will be picked up for compaction, and
+ // re-written to the same level as they were before.
+ //
+ // A file's age is computed by looking at file_creation_time or creation_time
+ // table properties in order, if they have valid non-zero values; if not, the
+ // age is based on the file's last modified time (given by the underlying
+ // Env).
+ //
+ // Supported in Level and FIFO compaction.
+ // In FIFO compaction, this option has the same meaning as TTL and whichever
+ // stricter will be used.
+ // Pre-req: max_open_files == -1.
+ // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+ //
+ // Values:
+ // 0: Turn off Periodic compactions.
+ // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature
+ // as needed. For now, RocksDB will change this value to 30 days
+ // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction
+ // process at least once every 30 days if not compacted sooner.
+ // In FIFO compaction, since the option has the same meaning as ttl,
+ // when this value is left default, and ttl is left to 0, 30 days will be
+ // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used.
+ //
+ // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
+
+ // If this option is set then 1 in N blocks are compressed
+ // using a fast (lz4) and slow (zstd) compression algorithm.
+ // The compressibility is reported as stats and the stored
+ // data is left uncompressed (unless compression is also requested).
+ uint64_t sample_for_compression = 0;
+
+ // Create ColumnFamilyOptions with default values for all fields
+ AdvancedColumnFamilyOptions();
+ // Create ColumnFamilyOptions from Options
+ explicit AdvancedColumnFamilyOptions(const Options& options);
+
+ // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
+
+ // NOT SUPPORTED ANYMORE
+ // This does not do anything anymore. It is still given a default value, like
+ // every other member here, so that copying the struct never reads an
+ // indeterminate int.
+ int max_mem_compaction_level = 0;
+
+ // NOT SUPPORTED ANYMORE -- this option is no longer used
+ // Puts are delayed to options.delayed_write_rate when any level has a
+ // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0.
+ //
+ // Default: 0 (disabled)
+ //
+ // Dynamically changeable through SetOptions() API
+ double soft_rate_limit = 0.0;
+
+ // NOT SUPPORTED ANYMORE -- this option is no longer used
+ double hard_rate_limit = 0.0;
+
+ // NOT SUPPORTED ANYMORE -- this option is no longer used
+ unsigned int rate_limit_delay_max_milliseconds = 100;
+
+ // NOT SUPPORTED ANYMORE
+ // Does not have any effect.
+ bool purge_redundant_kvs_while_flush = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
new file mode 100644
index 000000000..dafefceb3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -0,0 +1,1801 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+ C bindings for rocksdb. May be useful as a stable ABI that can be
+ used by programs that keep rocksdb in a shared library, or for
+ a JNI api.
+
+ Does not support:
+ . getters for the option types
+ . custom comparators that implement key shortening
+ . capturing post-write-snapshot
+ . custom iter, db, env, cache implementations using just the C bindings
+
+ Some conventions:
+
+ (1) We expose just opaque struct pointers and functions to clients.
+ This allows us to change internal representations without having to
+ recompile clients.
+
+ (2) For simplicity, there is no equivalent to the Slice type. Instead,
+ the caller has to pass the pointer and length as separate
+ arguments.
+
+ (3) Errors are represented by a null-terminated c string. NULL
+ means no error. All operations that can raise an error are passed
+ a "char** errptr" as the last argument. One of the following must
+ be true on entry:
+ *errptr == NULL
+ *errptr points to a malloc()ed null-terminated error message
+ On success, a rocksdb routine leaves *errptr unchanged.
+ On failure, rocksdb frees the old value of *errptr and
+ sets *errptr to a malloc()ed error message.
+
+ (4) Bools have the type unsigned char (0 == false; rest == true)
+
+ (5) All of the pointer arguments must be non-NULL.
+*/
+
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+ rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+ rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t;
+typedef struct rocksdb_block_based_table_options_t
+ rocksdb_block_based_table_options_t;
+typedef struct rocksdb_cuckoo_table_options_t
+ rocksdb_cuckoo_table_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
+typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
+typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
+typedef struct rocksdb_ingestexternalfileoptions_t rocksdb_ingestexternalfileoptions_t;
+typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t;
+typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t;
+typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t;
+typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t;
+typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t;
+typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t;
+typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t;
+typedef struct rocksdb_optimistictransactiondb_t
+ rocksdb_optimistictransactiondb_t;
+typedef struct rocksdb_optimistictransaction_options_t
+ rocksdb_optimistictransaction_options_t;
+typedef struct rocksdb_transaction_t rocksdb_transaction_t;
+typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t;
+typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t;
+typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t;
+typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t;
+typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t;
+
+/* DB operations */
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl(
+ const rocksdb_options_t* options, const char* name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only(
+ const rocksdb_options_t* options, const char* name,
+ unsigned char error_if_log_file_exist, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary(
+ const rocksdb_options_t* options, const char* name,
+ const char* secondary_path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup(
+ rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush(
+ rocksdb_backup_engine_t* be, rocksdb_t* db, unsigned char flush_before_backup,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups(
+ rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr);
+
+/* Takes no arguments. Spelled (void) because in C an empty parameter list
+   leaves the parameters unspecified instead of declaring that there are
+   none. Pairs with rocksdb_restore_options_destroy(). */
+extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t*
+rocksdb_restore_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy(
+ rocksdb_restore_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files(
+ rocksdb_restore_options_t* opt, int v);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+ uint32_t backup_id, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t*
+rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count(
+ const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_backup_engine_info_timestamp(const rocksdb_backup_engine_info_t* info,
+ int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_backup_engine_info_backup_id(const rocksdb_backup_engine_info_t* info,
+ int index);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_info_size(const rocksdb_backup_engine_info_t* info,
+ int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close(
+ rocksdb_backup_engine_t* be);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create(
+ rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy(
+ rocksdb_checkpoint_t* checkpoint);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_log_file_exist, char** errptr);
+
+/* Column-family variant of rocksdb_open_as_secondary: same parameter list as
+   rocksdb_open_column_families plus secondary_path. The three arrays are
+   presumably num_column_families entries long (confirm in c.cc).
+   errptr follows convention (3) at the top of this file. */
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families(
+ const rocksdb_options_t* options, const char* name, size_t* lencf,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy(
+ char** list, size_t len);
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family(rocksdb_t* db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy(
+ rocksdb_column_family_handle_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* start_key,
+ size_t start_key_len, const char* end_key, size_t end_key_len,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr);
+
+/* Returns NULL if not found. A malloc()ed array otherwise.
+ Stores the length of the array in *vallen. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr);
+
+// if values_list[i] == NULL and errs[i] == NULL,
+// then we got status.IsNotFound(), which we will not return.
+// all errors except status.ok() and status.IsNotFound() are returned.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings as opposed to the conventional one error,
+// where errs[i] is the status for retrieval of keys_list[i].
+// each non-NULL errs entry is a malloc()ed, null terminated string.
+// each non-NULL values_list entry is a malloc()ed array, with
+// the length for each stored in values_list_sizes[i].
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+ const char* const* keys_list, const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db, const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options,
+ char** errptr
+);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators(
+ rocksdb_t *db, rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators, size_t size, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot(
+ rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
+ rocksdb_t* db, const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+ Else returns a pointer to a malloc()-ed null-terminated value. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
+ const char* propname);
+/* Returns 0 on success, -1 otherwise. The property value is presumably
+   written to *out_val on success (confirm in c.cc).
+   Declared with ROCKSDB_LIBRARY_API like every other entry point so the
+   symbol is exported/imported correctly in the Windows DLL build. */
+extern ROCKSDB_LIBRARY_API int rocksdb_property_int(
+ rocksdb_t* db, const char* propname, uint64_t* out_val);
+
+/* Column-family counterpart of rocksdb_property_int.
+   Returns 0 on success, -1 otherwise. The property value is presumably
+   written to *out_val on success (confirm in c.cc).
+   Declared with ROCKSDB_LIBRARY_API like every other entry point so the
+   symbol is exported/imported correctly in the Windows DLL build. */
+extern ROCKSDB_LIBRARY_API int rocksdb_property_int_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* propname);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes(
+ rocksdb_t* db, int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db,
+ const char* start_key,
+ size_t start_key_len,
+ const char* limit_key,
+ size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt(
+ rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key,
+ size_t start_key_len, const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db,
+ const char* name);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles(
+ rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush(
+ rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf(
+ rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db,
+ char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions(
+ rocksdb_t* db, unsigned char force, char** errptr);
+
+/* Management operations */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_repair_db(
+ const rocksdb_options_t* options, const char* name, char** errptr);
+
+/* Iterator */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid(
+ const rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*,
+ const char* k, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*,
+ const char* k,
+ size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key(
+ const rocksdb_iterator_t*, size_t* klen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value(
+ const rocksdb_iterator_t*, size_t* vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
+ const rocksdb_iterator_t*, char** errptr);
+
+/* WAL iterator: operates on the rocksdb_wal_iterator_t* handed out by
+   rocksdb_get_updates_since(). */
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(
+ rocksdb_wal_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid(
+ const rocksdb_wal_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status(
+ const rocksdb_wal_iterator_t* iter, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+ const rocksdb_wal_iterator_t* iter, uint64_t* seq);
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_get_latest_sequence_number(
+ rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy(
+ const rocksdb_wal_iterator_t* iter);
+
+/* Write batch */
+
+/* Takes no arguments. Spelled (void) because in C an empty parameter list
+   leaves the parameters unspecified instead of declaring that there are
+   none. Pairs with rocksdb_writebatch_destroy(). */
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(
+ void);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from(
+ const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy(
+ rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*,
+ const char* key,
+ size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf(
+ rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range(
+ rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev(
+ rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data(
+ rocksdb_writebatch_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
+ rocksdb_writebatch_t*, void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
+ rocksdb_writebatch_t*, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point(
+ rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point(
+ rocksdb_writebatch_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point(
+ rocksdb_writebatch_t*, char** errptr);
+
+/* Write batch with index */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+ size_t reserved_bytes,
+ unsigned char overwrite_keys);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create_from(
+ const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy(
+ rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t*,
+ const char* key,
+ size_t klen,
+ const char* val,
+ size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t*,
+ const char* key,
+ size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+/* DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported */
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range(
+ rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len);
+/* DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported */
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len);
+/* DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported */
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev(
+ rocksdb_writebatch_wi_t* b, int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+/* DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported */
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data(
+ rocksdb_writebatch_wi_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b,
+ void* state, /* opaque caller pointer; presumably handed back as the first argument of put/deleted -- TODO confirm in c.cc */
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data(
+ rocksdb_writebatch_wi_t* b,
+ size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point(
+ rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point(
+ rocksdb_writebatch_wi_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi,
+ rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
+ rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* cf);
+
+/* Block based table options */
+
+/* Takes no arguments and returns a rocksdb_block_based_table_options_t*.
+ Declared with (void) so C compilers see a real prototype; an empty ()
+ would leave the parameter list unspecified and unchecked. */
+extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int index_block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options, unsigned char partition_filters);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char no_block_cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t*, int);
+/* Index types for rocksdb_block_based_options_set_index_type().
+ No trailing comma after the last enumerator: C90 rejects it, and the
+ other enums in this header do not use one. */
+enum {
+ rocksdb_block_based_table_index_type_binary_search = 0,
+ rocksdb_block_based_table_index_type_hash_search = 1,
+ rocksdb_block_based_table_index_type_two_level_index_search = 2
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t*, int); /* uses one of the above enums */
+/* Data block index types for
+ rocksdb_block_based_options_set_data_block_index_type().
+ No trailing comma after the last enumerator: C90 rejects it, and the
+ other enums in this header do not use one. */
+enum {
+ rocksdb_block_based_table_data_block_index_type_binary_search = 0,
+ rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1
+};
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t*,
+ int); /* uses one of the above enums */
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_hash_index_allow_collision(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);
+
+/* Cuckoo table options */
+
+/* Takes no arguments and returns a rocksdb_cuckoo_table_options_t*.
+ Declared with (void) so C compilers see a real prototype; an empty ()
+ would leave the parameter list unspecified and unchecked. */
+extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy(
+ rocksdb_cuckoo_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options);
+
+/* Options */
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options(
+ rocksdb_t* db, int count, const char* const keys[],
+ const char* const values[], char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+ const char* const keys[], const char* const values[], char** errptr);
+
+/* Takes no arguments and returns a rocksdb_options_t*.
+ Declared with (void) so C compilers see a real prototype; an empty ()
+ would leave the parameter list unspecified and unchecked. */
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism(
+ rocksdb_options_t* opt, int total_threads);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup(
+ rocksdb_options_t* opt, uint64_t block_cache_size_mb);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter(
+ rocksdb_options_t*, rocksdb_compactionfilter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator(
+ rocksdb_options_t*, rocksdb_comparator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator(
+ rocksdb_options_t*, rocksdb_mergeoperator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level(
+ rocksdb_options_t* opt, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_create_missing_column_families(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*,
+ const rocksdb_dbpath_t** path_values,
+ size_t num_paths);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*,
+ rocksdb_env_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*,
+ rocksdb_logger_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size(
+ rocksdb_options_t* opt, uint64_t n);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options(
+ rocksdb_options_t*, int, int, int, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_mem_compaction_level(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t*, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val);
+
+/* returns a pointer to a malloc()-ed, null terminated string */
+extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
+ rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
+ int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*,
+ int64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions(
+ rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit(
+ rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit(
+ rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_soft_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_hard_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_rate_limit_delay_max_milliseconds(rocksdb_options_t*,
+ unsigned int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_table_cache_remove_scan_count_limit(rocksdb_options_t*,
+ int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir(
+ rocksdb_options_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*,
+ const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec(
+ rocksdb_options_t*, unsigned int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*,
+ unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*,
+ uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*,
+ uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep(
+ rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes(
+ rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep(
+ rocksdb_options_t*, size_t, int32_t, int32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory(
+ rocksdb_options_t*, uint32_t, int, double, size_t);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress(
+ rocksdb_options_t* opt, int level);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size(
+ rocksdb_options_t*, size_t);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality(
+ rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support(
+ rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks(
+ rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats(
+ rocksdb_options_t*, int);
+
+enum {
+ rocksdb_tolerate_corrupted_tail_records_recovery = 0,
+ rocksdb_absolute_consistency_recovery = 1,
+ rocksdb_point_in_time_recovery = 2,
+ rocksdb_skip_any_corrupted_records_recovery = 3
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode(
+ rocksdb_options_t*, int);
+
+enum {
+ rocksdb_no_compression = 0,
+ rocksdb_snappy_compression = 1,
+ rocksdb_zlib_compression = 2,
+ rocksdb_bz2_compression = 3,
+ rocksdb_lz4_compression = 4,
+ rocksdb_lz4hc_compression = 5,
+ rocksdb_xpress_compression = 6,
+ rocksdb_zstd_compression = 7
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression(
+ rocksdb_options_t*, int);
+
+enum {
+ rocksdb_level_compaction = 0,
+ rocksdb_universal_compaction = 1,
+ rocksdb_fifo_compaction = 2
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style(
+ rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_universal_compaction_options(
+ rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter(
+ rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush(
+ rocksdb_options_t* opt, unsigned char);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache(
+ rocksdb_options_t* opt, rocksdb_cache_t* cache);
+
+/* RateLimiter */
+extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
+ int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness);
+extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(
+ rocksdb_ratelimiter_t*);
+
+/* PerfContext */
+enum { /* NOTE(review): presumably the levels accepted by rocksdb_set_perf_level(int) -- confirm in c.cc */
+ rocksdb_uninitialized = 0,
+ rocksdb_disable = 1,
+ rocksdb_enable_count = 2,
+ rocksdb_enable_time_except_for_mutex = 3,
+ rocksdb_enable_time = 4,
+ rocksdb_out_of_bounds = 5
+};
+
+enum { /* NOTE(review): presumably the ids taken by the `metric` argument of rocksdb_perfcontext_metric() -- confirm in c.cc */
+ rocksdb_user_key_comparison_count = 0, /* only explicit value; the entries below count up implicitly from here */
+ rocksdb_block_cache_hit_count,
+ rocksdb_block_read_count,
+ rocksdb_block_read_byte,
+ rocksdb_block_read_time,
+ rocksdb_block_checksum_time,
+ rocksdb_block_decompress_time,
+ rocksdb_get_read_bytes,
+ rocksdb_multiget_read_bytes,
+ rocksdb_iter_read_bytes,
+ rocksdb_internal_key_skipped_count,
+ rocksdb_internal_delete_skipped_count,
+ rocksdb_internal_recent_skipped_count,
+ rocksdb_internal_merge_count,
+ rocksdb_get_snapshot_time,
+ rocksdb_get_from_memtable_time,
+ rocksdb_get_from_memtable_count,
+ rocksdb_get_post_process_time,
+ rocksdb_get_from_output_files_time,
+ rocksdb_seek_on_memtable_time,
+ rocksdb_seek_on_memtable_count,
+ rocksdb_next_on_memtable_count,
+ rocksdb_prev_on_memtable_count,
+ rocksdb_seek_child_seek_time,
+ rocksdb_seek_child_seek_count,
+ rocksdb_seek_min_heap_time,
+ rocksdb_seek_max_heap_time,
+ rocksdb_seek_internal_seek_time,
+ rocksdb_find_next_user_entry_time,
+ rocksdb_write_wal_time,
+ rocksdb_write_memtable_time,
+ rocksdb_write_delay_time,
+ rocksdb_write_pre_and_post_process_time,
+ rocksdb_db_mutex_lock_nanos,
+ rocksdb_db_condition_wait_nanos,
+ rocksdb_merge_operator_time_nanos,
+ rocksdb_read_index_block_nanos,
+ rocksdb_read_filter_block_nanos,
+ rocksdb_new_table_block_iter_nanos,
+ rocksdb_new_table_iterator_nanos,
+ rocksdb_block_seek_nanos,
+ rocksdb_find_table_nanos,
+ rocksdb_bloom_memtable_hit_count,
+ rocksdb_bloom_memtable_miss_count,
+ rocksdb_bloom_sst_hit_count,
+ rocksdb_bloom_sst_miss_count,
+ rocksdb_key_lock_wait_time,
+ rocksdb_key_lock_wait_count,
+ rocksdb_env_new_sequential_file_nanos,
+ rocksdb_env_new_random_access_file_nanos,
+ rocksdb_env_new_writable_file_nanos,
+ rocksdb_env_reuse_writable_file_nanos,
+ rocksdb_env_new_random_rw_file_nanos,
+ rocksdb_env_new_directory_nanos,
+ rocksdb_env_file_exists_nanos,
+ rocksdb_env_get_children_nanos,
+ rocksdb_env_get_children_file_attributes_nanos,
+ rocksdb_env_delete_file_nanos,
+ rocksdb_env_create_dir_nanos,
+ rocksdb_env_create_dir_if_missing_nanos,
+ rocksdb_env_delete_dir_nanos,
+ rocksdb_env_get_file_size_nanos,
+ rocksdb_env_get_file_modification_time_nanos,
+ rocksdb_env_rename_file_nanos,
+ rocksdb_env_link_file_nanos,
+ rocksdb_env_lock_file_nanos,
+ rocksdb_env_unlock_file_nanos,
+ rocksdb_env_new_logger_nanos,
+ rocksdb_total_metric_count = 68 /* equals the number of metrics above (they run 0..67); update when adding one */
+};
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
+/* Takes no arguments and returns a rocksdb_perfcontext_t*.
+ Declared with (void) so C compilers see a real prototype; an empty ()
+ would leave the parameter list unspecified and unchecked. */
+extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset(
+ rocksdb_perfcontext_t* context);
+extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report(
+ rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters);
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_perfcontext_metric(
+ rocksdb_perfcontext_t* context, int metric);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy(
+ rocksdb_perfcontext_t* context);
+
+/* Compaction Filter */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t*
+rocksdb_compactionfilter_create(
+ void* state, void (*destructor)(void*),
+ unsigned char (*filter)(void*, int level, const char* key,
+ size_t key_length, const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy(
+ rocksdb_compactionfilter_t*);
+
+/* Compaction Filter Context */
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context);
+
+/* Compaction Filter Factory */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t*
+rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t*);
+
+/* Comparator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy(
+ rocksdb_comparator_t*);
+
+/* Filter policy */
+
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+ void* state, void (*destructor)(void*),
+ char* (*create_filter)(void*, const char* const* key_array,
+ const size_t* key_length_array, int num_keys,
+ size_t* filter_length),
+ unsigned char (*key_may_match)(void*, const char* key, size_t length,
+ const char* filter, size_t filter_length),
+ void (*delete_filter)(void*, const char* filter, size_t filter_length),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy(
+ rocksdb_filterpolicy_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom(int bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom_full(int bits_per_key);
+
+/* Merge Operator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t*
+rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy(
+ rocksdb_mergeoperator_t*);
+
+/* Read options */
+
+/* Takes no arguments and returns a rocksdb_readoptions_t*.
+ Declared with (void) so C compilers see a real prototype; an empty ()
+ would leave the parameter list unspecified and unchecked. */
+extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(
+ void);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy(
+ rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot(
+ rocksdb_readoptions_t*, const rocksdb_snapshot_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound(
+ rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound(
+ rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier(
+ rocksdb_readoptions_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
+ rocksdb_readoptions_t*, unsigned char);
+/* The functionality that this option controlled has been removed. */
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size(
+ rocksdb_readoptions_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys(
+ rocksdb_readoptions_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions(
+ rocksdb_readoptions_t*, unsigned char);
+
+/* Write options
+ *
+ * Every setter takes the rocksdb_writeoptions_t to modify as its first
+ * argument. Note that rocksdb_writeoptions_disable_WAL takes an int while
+ * the other setters take unsigned char. The constructor is declared with
+ * (void) so that it is a real prototype when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t*
+rocksdb_writeoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy(
+ rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL(
+ rocksdb_writeoptions_t* opt, int disable);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri(
+ rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*,
+ unsigned char);
+
+/* Compact range options
+ *
+ * Every setter takes the rocksdb_compactoptions_t to modify as its first
+ * argument. The constructor is declared with (void) so that it is a real
+ * prototype when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t*
+rocksdb_compactoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy(
+ rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level(
+ rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level(
+ rocksdb_compactoptions_t*, int);
+
+/* Flush options
+ *
+ * The constructor is declared with (void) so that it is a real prototype
+ * when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t*
+rocksdb_flushoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy(
+ rocksdb_flushoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait(
+ rocksdb_flushoptions_t*, unsigned char);
+
+/* Cache: operations on rocksdb_cache_t; capacity and usage values are size_t */
+
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru(
+ size_t capacity);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity(
+ rocksdb_cache_t* cache, size_t capacity);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_usage(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache);
+
+/* DBPath: a rocksdb_dbpath_t is created from a path string and a uint64_t target size */
+
+extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size);
+extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*);
+
+/* Env
+ *
+ * Operations on rocksdb_env_t and rocksdb_envoptions_t handles. The
+ * no-argument constructors are declared with (void) so that they are real
+ * prototypes when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void);
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads(
+ rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(
+ rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t*
+rocksdb_envoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy(
+ rocksdb_envoptions_t* opt);
+
+/* SstFile: rocksdb_sstfilewriter_t API; most calls take a trailing char** errptr */
+
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env,
+ const rocksdb_options_t* io_options);
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* comparator);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open(
+ rocksdb_sstfilewriter_t* writer, const char* name, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete(
+ rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish(
+ rocksdb_sstfilewriter_t* writer, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size(
+ rocksdb_sstfilewriter_t* writer, uint64_t* file_size); /* result goes to the uint64_t* out-parameter; no errptr */
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy(
+ rocksdb_sstfilewriter_t* writer);
+
+/* External file ingestion options. Every setter takes the
+ * rocksdb_ingestexternalfileoptions_t to modify as its first argument. The
+ * constructor is declared with (void) so that it is a real prototype when
+ * compiled as C. */
+extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char ingest_behind);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt);
+
+/* Ingestion entry points: file_list is an array of list_len C strings; the
+ * _cf variant additionally takes a column family handle. */
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary(
+ rocksdb_t* db, char** errptr);
+
+/* SliceTransform
+ *
+ * rocksdb_slicetransform_create builds a transform from caller-supplied
+ * callbacks; each callback receives the opaque `state` pointer as its first
+ * argument. The no-argument constructor is declared with (void) so that it
+ * is a real prototype when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create(
+ void* state, void (*destructor)(void*),
+ char* (*transform)(void*, const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(void*, const char* key, size_t length),
+ unsigned char (*in_range)(void*, const char* key, size_t length),
+ const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create_noop(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
+ rocksdb_slicetransform_t*);
+
+/* Universal Compaction options
+ *
+ * The enum below names the values accepted by
+ * rocksdb_universal_compaction_options_set_stop_style (NOTE(review): inferred
+ * from the matching names; confirm in c.cc). No-argument constructors are
+ * declared with (void) so that they are real prototypes when compiled as C. */
+
+enum {
+ rocksdb_similar_size_compaction_stop_style = 0,
+ rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t*);
+
+/* FIFO compaction options */
+extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t*
+rocksdb_fifo_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts);
+
+/* Live files: accessors take a rocksdb_livefiles_t and, for per-file data,
+ * an int index. The key accessors also write a length through `size`. */
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
+ const rocksdb_livefiles_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
+ const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
+ const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions(
+ const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
+ const rocksdb_livefiles_t*);
+
+/* Utility Helpers: each function below takes a trailing char** errptr */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
+ const rocksdb_options_t* base_options, const char* opts_str,
+ rocksdb_options_t* new_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range(
+ rocksdb_t* db, const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr);
+
+/* Transactions: rocksdb_transactiondb_t (DB-level) and rocksdb_transaction_t (per-transaction) APIs */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr);
+
+/* Opens a transaction DB with num_column_families column families described
+ * by the parallel arrays column_family_names / column_family_options; the
+ * handles are written to column_family_handles.
+ * Declared with the same `extern ROCKSDB_LIBRARY_API` decoration as the other
+ * functions in this header so that it is exported consistently. */
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t*
+rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn); /* NOTE(review): old_txn presumably lets a transaction object be reused; confirm in c.cc */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint(
+ rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint(
+ rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy(
+ rocksdb_transaction_t* txn);
+
+// The snapshot returned by this function should be freed using rocksdb_free.
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, unsigned char exclusive,
+ char** errptr);
+
+/* Column-family variant of rocksdb_transaction_get_for_update: same
+ * parameters plus a column family handle.
+ * Declared with the same `extern ROCKSDB_LIBRARY_API` decoration as the other
+ * functions in this header so that it is exported consistently. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr); /* NOTE(review): lengths are named keylen/vallen here, klen/vlen in the non-_cf variant */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t *batch, char** errptr); /* takes a whole rocksdb_writebatch_t rather than a single key/value */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete(
+ rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options); /* iterator creators in this group take no errptr */
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close(
+ rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db,
+ char** errptr); /* the only call in this group that takes an errptr */
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options,
+ const char* name, char** errptr); /* unlike rocksdb_transactiondb_open, takes no separate txn-db options argument */
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* options, const char* name, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db(
+ rocksdb_t* base_db); /* NOTE(review): presumably the rocksdb_t* obtained from ..._get_base_db; confirm in c.cc */
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t*
+rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db);
+
+/* Transaction Options
+ *
+ * Three option objects are declared here: rocksdb_transactiondb_options_t,
+ * rocksdb_transaction_options_t and rocksdb_optimistictransaction_options_t.
+ * Every setter takes the option object to modify as its first argument. The
+ * no-argument constructors are declared with (void) so that they are real
+ * prototypes when compiled as C. */
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t*
+rocksdb_transactiondb_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy(
+ rocksdb_transactiondb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t*
+rocksdb_transaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy(
+ rocksdb_transaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v);
+
+// Referring to convention (3): clients should call this function
+// to free memory that was malloc()ed on their behalf.
+extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr); /* NOTE(review): result is presumably released with rocksdb_pinnableslice_destroy; confirm */
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy(
+ rocksdb_pinnableslice_t* v);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value(
+ const rocksdb_pinnableslice_t* t, size_t* vlen); /* NOTE(review): value length is presumably written to *vlen; confirm */
+
+/* Approximate memory usage: a rocksdb_memory_consumers_t collects DBs and
+ * caches, and rocksdb_approximate_memory_usage_create turns it into a
+ * rocksdb_memory_usage_t that the uint64_t getters below read from. The
+ * no-argument constructor is declared with (void) so that it is a real
+ * prototype when compiled as C. */
+extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t*
+rocksdb_memory_consumers_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db(
+ rocksdb_memory_consumers_t* consumers, rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache(
+ rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy(
+ rocksdb_memory_consumers_t* consumers);
+extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t*
+rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers,
+ char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy(
+ rocksdb_memory_usage_t* usage);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h
new file mode 100644
index 000000000..77ddf525d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache.h
@@ -0,0 +1,278 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values. It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads. It may automatically evict entries to make room
+// for new entries. Values have a specified charge against the cache
+// capacity. For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided. Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+
+extern const bool kDefaultToAdaptiveMutex;
+
+enum CacheMetadataChargePolicy {
+ kDontChargeCacheMetadata,  // NOTE(review): presumably metadata is not counted against capacity; confirm
+ kFullChargeCacheMetadata  // NOTE(review): presumably metadata is counted against capacity; confirm
+};
+const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy =
+ kFullChargeCacheMetadata;  // default used by LRUCacheOptions and the New*Cache factories
+
+struct LRUCacheOptions {
+ // Capacity of the cache.
+ size_t capacity = 0;
+
+ // Cache is sharded into 2^num_shard_bits shards,
+ // by hash of key. Refer to NewLRUCache for further
+ // information.
+ int num_shard_bits = -1;
+
+ // If strict_capacity_limit is set,
+ // insert to the cache will fail when cache is full.
+ bool strict_capacity_limit = false;
+
+ // Ratio of the cache reserved for high priority entries.
+ // If greater than zero, the LRU list will be split into a high-pri
+ // list and a low-pri list. High-pri entries will be inserted at the
+ // tail of high-pri list, while low-pri entries will be first inserted to
+ // the low-pri list (the midpoint). This is referred to as the
+ // midpoint insertion strategy; it makes entries that never get hit in
+ // cache age out faster.
+ //
+ // See also
+ // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority.
+ double high_pri_pool_ratio = 0.5;
+
+ // If non-nullptr will use this allocator instead of system allocator when
+ // allocating memory for cache blocks. Set this before you start using
+ // the cache!
+ //
+ // Caveat: when the cache is used as block cache, the memory allocator is
+ // ignored when dealing with compression libraries that allocate memory
+ // internally (currently only XPRESS).
+ std::shared_ptr<MemoryAllocator> memory_allocator;
+
+ // Whether to use adaptive mutexes for cache shards. Note that adaptive
+ // mutexes need to be supported by the platform in order for this to have any
+ // effect. The default value is true if RocksDB is compiled with
+ // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
+
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy;  // i.e. kFullChargeCacheMetadata
+
+ LRUCacheOptions() {}  // every member keeps its in-class default
+ LRUCacheOptions(size_t _capacity, int _num_shard_bits,
+ bool _strict_capacity_limit, double _high_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy)
+ : capacity(_capacity),
+ num_shard_bits(_num_shard_bits),
+ strict_capacity_limit(_strict_capacity_limit),
+ high_pri_pool_ratio(_high_pri_pool_ratio),
+ memory_allocator(std::move(_memory_allocator)),
+ use_adaptive_mutex(_use_adaptive_mutex),
+ metadata_charge_policy(_metadata_charge_policy) {}
+};
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^num_shard_bits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. If strict_capacity_limit
+// is set, insert to the cache will fail when cache is full. User can also
+// set the ratio of the cache reserved for high priority entries via
+// high_pri_pool_ratio.
+// num_shard_bits = -1 means it is automatically determined: every shard
+// will be at least 512KB and number of shard bits will not exceed 6.
+extern std::shared_ptr<Cache> NewLRUCache(
+ size_t capacity, int num_shard_bits = -1,
+ bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+ std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy);
+
+extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
+
+// Similar to NewLRUCache, but creates a cache based on the CLOCK algorithm,
+// with better concurrent performance in some cases. See util/clock_cache.cc
+// for more detail. Takes no high_pri_pool_ratio or memory_allocator argument.
+//
+// Returns nullptr if it is not supported.
+extern std::shared_ptr<Cache> NewClockCache(
+ size_t capacity, int num_shard_bits = -1,
+ bool strict_capacity_limit = false,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy);
+class Cache {  // abstract interface: most operations below are pure virtual
+ public:
+ // Depending on implementation, cache entries with high priority could be less
+ // likely to get evicted than low priority entries.
+ enum class Priority { HIGH, LOW };
+
+ Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+ : memory_allocator_(std::move(allocator)) {}
+ // No copying allowed
+ Cache(const Cache&) = delete;
+ Cache& operator=(const Cache&) = delete;
+
+ // Destroys all existing entries by calling the "deleter"
+ // function that was passed via the Insert() function.
+ //
+ // @See Insert
+ virtual ~Cache() {}
+
+ // Opaque handle to an entry stored in the cache.
+ struct Handle {};
+
+ // The type of the Cache
+ virtual const char* Name() const = 0;
+
+ // Insert a mapping from key->value into the cache and assign it
+ // the specified charge against the total cache capacity.
+ // If strict_capacity_limit is true and cache reaches its full capacity,
+ // return Status::Incomplete.
+ //
+ // If handle is not nullptr, returns a handle that corresponds to the
+ // mapping. The caller must call this->Release(handle) when the returned
+ // mapping is no longer needed. In case of error the caller is responsible
+ // for cleaning up the value (i.e. calling "deleter").
+ //
+ // If handle is nullptr, it is as if Release is called immediately after
+ // insert. In case of error the value will be cleaned up.
+ //
+ // When the inserted entry is no longer needed, the key and
+ // value will be passed to "deleter".
+ virtual Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) = 0;
+
+ // If the cache has no mapping for "key", returns nullptr.
+ //
+ // Else return a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ // If stats is not nullptr, relative tickers could be used inside the
+ // function.
+ virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0;
+
+ // Increments the reference count for the handle if it refers to an entry in
+ // the cache. Returns true if refcount was incremented; otherwise, returns
+ // false.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual bool Ref(Handle* handle) = 0;
+
+ /**
+ * Release a mapping returned by a previous Lookup(). A released entry
+ * might still remain in cache in case it is later looked up by others.
+ * If force_erase is set then the entry is also erased from the cache,
+ * provided there is no other reference to it. Erasing it should call
+ * the deleter function that was provided when the entry was
+ * inserted.
+ *
+ * Returns true if the entry was also erased.
+ */
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual bool Release(Handle* handle, bool force_erase = false) = 0;
+
+ // Return the value encapsulated in a handle returned by a
+ // successful Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void* Value(Handle* handle) = 0;
+
+ // If the cache contains an entry for key, erase it. Note that the
+ // underlying entry will be kept around until all existing handles
+ // to it have been released.
+ virtual void Erase(const Slice& key) = 0;
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharding the same cache to partition the key space. Typically the
+ // client will allocate a new id at startup and prepend the id to
+ // its cache keys.
+ virtual uint64_t NewId() = 0;
+
+ // Sets the maximum configured capacity of the cache. When the new
+ // capacity is less than the old capacity and the existing usage is
+ // greater than the new capacity, the implementation will do its best to
+ // purge the released entries from the cache in order to lower the usage.
+ virtual void SetCapacity(size_t capacity) = 0;
+
+ // Set whether to return error on insertion when cache reaches its full
+ // capacity.
+ virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+
+ // Get the flag whether to return error on insertion when cache reaches its
+ // full capacity.
+ virtual bool HasStrictCapacityLimit() const = 0;
+
+ // Returns the maximum configured capacity of the cache.
+ virtual size_t GetCapacity() const = 0;
+
+ // Returns the memory size for the entries residing in the cache.
+ virtual size_t GetUsage() const = 0;
+
+ // Returns the memory size for a specific entry in the cache.
+ virtual size_t GetUsage(Handle* handle) const = 0;
+
+ // Returns the memory size for the entries in use by the system.
+ virtual size_t GetPinnedUsage() const = 0;
+
+ // Returns the charge for the specific entry in the cache.
+ virtual size_t GetCharge(Handle* handle) const = 0;
+
+ // Call this on shutdown if you want to speed it up. Cache will disown
+ // any underlying data and will not free it on delete. This call will leak
+ // memory - call this only if you're shutting down the process.
+ // Any attempts of using cache after this call will fail terribly.
+ // Always delete the DB object before calling this method!
+ virtual void DisownData(){
+ // default implementation is a no-op
+ }
+
+ // Apply callback to all entries in the cache.
+ // If thread_safe is true, it will also lock the accesses. Otherwise, it will
+ // access the cache without the lock held.
+ virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) = 0;
+
+ // Remove all entries.
+ // Prerequisite: no entry is referenced.
+ virtual void EraseUnRefEntries() = 0;
+
+ virtual std::string GetPrintableOptions() const { return ""; }  // default: empty string
+
+ MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }  // non-owning; nullptr if none was set
+
+ private:
+ std::shared_ptr<MemoryAllocator> memory_allocator_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h
new file mode 100644
index 000000000..b6a70ea64
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cleanable.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cleanable keeps a chain of cleanup callbacks (function/arg1/arg2 triples)
+// and runs them through Reset(). The constructors, destructor, move
+// operations, RegisterCleanup() and DelegateCleanupsTo() are only declared
+// here; their definitions live outside this header.
+class Cleanable {
+ public:
+ Cleanable();
+ // No copy constructor and copy assignment allowed. The deleted operations
+ // use the canonical const-reference signatures so that copying from both
+ // const and non-const objects resolves to the explicitly deleted overload.
+ Cleanable(const Cleanable&) = delete;
+ Cleanable& operator=(const Cleanable&) = delete;
+
+ ~Cleanable();
+
+ // Move constructor and move assignment is allowed.
+ Cleanable(Cleanable&&);
+ Cleanable& operator=(Cleanable&&);
+
+ // Clients are allowed to register function/arg1/arg2 triples that
+ // will be invoked when Reset() is called and, per the documented contract,
+ // when this object is destroyed (the destructor is defined out of line).
+ //
+ // RegisterCleanup() is not virtual; clients should not try to override it.
+ typedef void (*CleanupFunction)(void* arg1, void* arg2);
+ void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+ // NOTE(review): presumably hands the registered cleanups over to `other`;
+ // defined out of line - confirm against the implementation.
+ void DelegateCleanupsTo(Cleanable* other);
+ // Runs DoCleanup() and then clears the head entry's function and next
+ // pointers so that this object can be reused. arg1/arg2 of the head entry
+ // are left as they were; they are not read while function is nullptr.
+ inline void Reset() {
+ DoCleanup();
+ cleanup_.function = nullptr;
+ cleanup_.next = nullptr;
+ }
+
+ protected:
+ // One registered cleanup: the callback, its two arguments, and a link to
+ // the next entry in the chain.
+ struct Cleanup {
+ CleanupFunction function;
+ void* arg1;
+ void* arg2;
+ Cleanup* next;
+ };
+ // Head of the chain, stored inline. A nullptr function means "no cleanups
+ // registered" as far as DoCleanup() is concerned.
+ Cleanup cleanup_;
+ // It also becomes the owner of c
+ void RegisterCleanup(Cleanup* c);
+
+ private:
+ // Performs all the cleanups. It does not reset the pointers. Making it
+ // private to prevent misuse.
+ //
+ // The head entry is invoked first, then every chained entry in list order.
+ // Chained entries are deleted after they run; the head entry is a member
+ // and is never deleted. Nothing runs if the head's function is nullptr.
+ inline void DoCleanup() {
+ if (cleanup_.function != nullptr) {
+ (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+ for (Cleanup* c = cleanup_.next; c != nullptr;) {
+ (*c->function)(c->arg1, c->arg2);
+ // Read the link before the node is freed.
+ Cleanup* next = c->next;
+ delete c;
+ c = next;
+ }
+ }
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
new file mode 100644
index 000000000..976507831
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -0,0 +1,212 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+
+// Context information describing a single compaction run.
+struct CompactionFilterContext {
+ // True if this compaction run includes all data files.
+ bool is_full_compaction;
+ // True if this compaction was requested by the client; false if it is
+ // occurring as an automatic compaction process.
+ bool is_manual_compaction;
+};
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction. Name() is the only pure virtual member.
+
+class CompactionFilter {
+ public:
+ enum ValueType { // kind of entry handed to FilterV2()
+ kValue, // a normal value (e.g. written with Put())
+ kMergeOperand, // a merge operand (written with Merge())
+ kBlobIndex, // used internally by BlobDB.
+ };
+
+ enum class Decision { // result of FilterV2(); see the comment on FilterV2()
+ kKeep,
+ kRemove,
+ kChangeValue,
+ kRemoveAndSkipUntil,
+ };
+
+ enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; // result of PrepareBlobOutput()
+
+ // Context information describing a single compaction run.
+ struct Context {
+ // True if this compaction run includes all data files.
+ bool is_full_compaction;
+ // True if this compaction was requested by the client; false if it is
+ // occurring as an automatic compaction process.
+ bool is_manual_compaction;
+ // Which column family this compaction is for. NOTE(review): uint32_t is used but <cstdint> is not included directly by this header - confirm.
+ uint32_t column_family_id;
+ };
+
+ virtual ~CompactionFilter() {}
+
+ // The compaction process invokes this method for each kv that is being
+ // compacted. A return value of false indicates that the kv should be
+ // preserved in the output of this compaction run; a return value of true
+ // indicates that the kv should be removed from the output of the
+ // compaction. The application can inspect the existing value of the key
+ // and make its decision based on it. The default implementation returns
+ // false, i.e. it keeps every kv.
+ //
+ // Key-Values that are results of merge operation during compaction are not
+ // passed into this function. Currently, when you have a mix of Put()s and
+ // Merge()s on a same key, we only guarantee to process the merge operands
+ // through the compaction filters. Put()s might be processed, or might not.
+ //
+ // When the value is to be preserved, the application has the option
+ // to modify the existing_value and pass it back through new_value.
+ // value_changed needs to be set to true in this case.
+ //
+ // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a
+ // DB* object) will not guarantee to preserve the state of the DB with
+ // CompactionFilter. Data seen from a snapshot might disappear after a
+ // compaction finishes. If you use snapshots, think twice about whether you
+ // want to use compaction filter and whether you are using it in a safe way.
+ //
+ // If multithreaded compaction is being used *and* a single CompactionFilter
+ // instance was supplied via Options::compaction_filter, this method may be
+ // called from different threads concurrently. The application must ensure
+ // that the call is thread-safe.
+ //
+ // If the CompactionFilter was created by a factory, then it will only ever
+ // be used by a single thread that is doing the compaction run, and this
+ // call does not need to be thread-safe. However, multiple filters may be
+ // in existence and operating concurrently.
+ virtual bool Filter(int /*level*/, const Slice& /*key*/,
+ const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const {
+ return false;
+ }
+
+ // The compaction process invokes this method on every merge operand. If this
+ // method returns true, the merge operand will be ignored and not written out
+ // in the compaction output. The default implementation returns false.
+ //
+ // Note: If you are using a TransactionDB, it is not recommended to implement
+ // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB
+ // may not realize there is a write conflict and may allow a Transaction to
+ // Commit that should have failed. Instead, it is better to implement any
+ // Merge filtering inside the MergeOperator.
+ virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+ const Slice& /*operand*/) const {
+ return false;
+ }
+
+ // An extended API. Called for both values and merge operands.
+ // Allows changing value and skipping ranges of keys.
+ // The default implementation uses Filter() and FilterMergeOperand(); it ignores skip_until and never returns kRemoveAndSkipUntil.
+ // If you're overriding this method, no need to override the other two.
+ // `value_type` indicates whether this key-value corresponds to a normal
+ // value (e.g. written with Put()) or a merge operand (written with Merge()).
+ //
+ // Possible return values:
+ // * kKeep - keep the key-value pair.
+ // * kRemove - remove the key-value pair or merge operand.
+ // * kChangeValue - keep the key and change the value/operand to *new_value.
+ // * kRemoveAndSkipUntil - remove this key-value pair, and also remove
+ // all key-value pairs with key in [key, *skip_until). This range
+ // of keys will be skipped without reading, potentially saving some
+ // IO operations compared to removing the keys one by one.
+ //
+ // *skip_until <= key is treated the same as Decision::kKeep
+ // (since the range [key, *skip_until) is empty).
+ //
+ // Caveats:
+ // - The keys are skipped even if there are snapshots containing them,
+ // i.e. values removed by kRemoveAndSkipUntil can disappear from a
+ // snapshot - beware if you're using TransactionDB or
+ // DB::GetSnapshot().
+ // - If value for a key was overwritten or merged into (multiple Put()s
+ // or Merge()s), and compaction filter skips this key with
+ // kRemoveAndSkipUntil, it's possible that it will remove only
+ // the new value, exposing the old value that was supposed to be
+ // overwritten.
+ // - Doesn't work with PlainTableFactory in prefix mode.
+ // - If you use kRemoveAndSkipUntil, consider also reducing
+ // compaction_readahead_size option.
+ //
+ // Note: If you are using a TransactionDB, it is not recommended to filter
+ // out or modify merge operands (ValueType::kMergeOperand).
+ // If a merge operation is filtered out, TransactionDB may not realize there
+ // is a write conflict and may allow a Transaction to Commit that should have
+ // failed. Instead, it is better to implement any Merge filtering inside the
+ // MergeOperator.
+ virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ switch (value_type) {
+ case ValueType::kValue: { // plain values are delegated to Filter()
+ bool value_changed = false;
+ bool rv = Filter(level, key, existing_value, new_value, &value_changed);
+ if (rv) {
+ return Decision::kRemove;
+ }
+ return value_changed ? Decision::kChangeValue : Decision::kKeep;
+ }
+ case ValueType::kMergeOperand: { // operands are delegated to FilterMergeOperand()
+ bool rv = FilterMergeOperand(level, key, existing_value);
+ return rv ? Decision::kRemove : Decision::kKeep;
+ }
+ case ValueType::kBlobIndex: // blob indexes are kept unchanged by default
+ return Decision::kKeep;
+ }
+ assert(false); // every enumerator above returns; reached only for an invalid value_type
+ return Decision::kKeep;
+ }
+
+ // Internal (BlobDB) use only. Do not override in application code. The default implementation returns BlobDecision::kKeep.
+ virtual BlobDecision PrepareBlobOutput(const Slice& /* key */,
+ const Slice& /* existing_value */,
+ std::string* /* new_value */) const {
+ return BlobDecision::kKeep;
+ }
+
+ // This function is deprecated. Snapshots will always be ignored for
+ // compaction filters, because we realized that not ignoring snapshots doesn't
+ // provide the guarantee we initially thought it would provide. Repeatable
+ // reads will not be guaranteed anyway. If you override the function and
+ // return false, we will fail the compaction.
+ virtual bool IgnoreSnapshots() const { return true; }
+
+ // Returns a name that identifies this compaction filter.
+ // The name will be printed to LOG file on start up for diagnosis.
+ virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter through this factory,
+// allowing the application to know about different compactions.
+class CompactionFilterFactory {
+ public:
+ virtual ~CompactionFilterFactory() {}
+
+ virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) = 0; // caller owns the returned filter (unique_ptr)
+
+ // Returns a name that identifies this compaction filter factory.
+ virtual const char* Name() const = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h
new file mode 100644
index 000000000..8949b43e5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct CompactionJobStats { // statistics collected for one compaction job
+ CompactionJobStats() { Reset(); } // fields get their initial values from Reset()
+ void Reset(); // declared here, defined out of line
+ // Aggregate the CompactionJobStats from another instance with this one
+ void Add(const CompactionJobStats& stats);
+
+ // the elapsed time of this compaction in microseconds.
+ uint64_t elapsed_micros;
+
+ // the elapsed CPU time of this compaction in microseconds.
+ uint64_t cpu_micros;
+
+ // the number of compaction input records.
+ uint64_t num_input_records;
+ // the number of compaction input files.
+ size_t num_input_files;
+ // the number of compaction input files at the output level.
+ size_t num_input_files_at_output_level;
+
+ // the number of compaction output records.
+ uint64_t num_output_records;
+ // the number of compaction output files.
+ size_t num_output_files;
+
+ // true if the compaction is a manual compaction
+ bool is_manual_compaction;
+
+ // the size of the compaction input in bytes.
+ uint64_t total_input_bytes;
+ // the size of the compaction output in bytes.
+ uint64_t total_output_bytes;
+
+ // number of records being replaced by a newer record associated with the
+ // same key. This could be a new value or a deletion entry for that key, so
+ // this field sums up all updated and deleted keys.
+ uint64_t num_records_replaced;
+
+ // the sum of the uncompressed input keys in bytes.
+ uint64_t total_input_raw_key_bytes;
+ // the sum of the uncompressed input values in bytes.
+ uint64_t total_input_raw_value_bytes;
+
+ // the number of deletion entries before compaction. Deletion entries
+ // can disappear after compaction because they expired
+ uint64_t num_input_deletion_records;
+ // number of deletion records that were found obsolete and discarded
+ // because it is not possible to delete any more keys with this entry
+ // (i.e. all possible deletions resulting from it have been completed)
+ uint64_t num_expired_deletion_records;
+
+ // number of corrupt keys (ParseInternalKey returned false when applied to
+ // the key) encountered and written out.
+ uint64_t num_corrupt_keys;
+
+ // The following timing counters are only populated if
+ // options.report_bg_io_stats = true. Presumably nanoseconds (_nanos suffix).
+
+ // Time spent on file's Append() call.
+ uint64_t file_write_nanos;
+
+ // Time spent on sync file range.
+ uint64_t file_range_sync_nanos;
+
+ // Time spent on file fsync.
+ uint64_t file_fsync_nanos;
+
+ // Time spent on preparing file write (fallocate, etc)
+ uint64_t file_prepare_write_nanos;
+
+ // Prefixes of the smallest and largest key in the output, documented as
+ // holding the first 8 bytes (kMaxPrefixLength). Stored as std::string.
+ static const size_t kMaxPrefixLength = 8;
+
+ std::string smallest_output_key_prefix;
+ std::string largest_output_key_prefix;
+
+ // number of single-deletes which do not meet a put
+ uint64_t num_single_del_fallthru;
+
+ // number of single-deletes which meet something other than a put
+ uint64_t num_single_del_mismatch;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
new file mode 100644
index 000000000..76981d108
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database. A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+ Comparator() : timestamp_size_(0) {} // timestamp size defaults to 0
+
+ Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} // NOTE(review): not explicit, so a size_t converts implicitly
+
+ Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {}
+
+ Comparator& operator=(const Comparator& rhs) {
+ if (this != &rhs) {
+ timestamp_size_ = rhs.timestamp_size_;
+ }
+ return *this;
+ }
+
+ virtual ~Comparator() {}
+
+ static const char* Type() { return "Comparator"; } // fixed type label for this class
+ // Three-way comparison. Returns value:
+ // < 0 iff "a" < "b",
+ // == 0 iff "a" == "b",
+ // > 0 iff "a" > "b"
+ virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+ // Compares two slices for equality. The following invariant should always
+ // hold (and is the default implementation):
+ // Equal(a, b) iff Compare(a, b) == 0
+ // Override only if equality comparisons can be done more efficiently than
+ // three-way comparisons.
+ virtual bool Equal(const Slice& a, const Slice& b) const {
+ return Compare(a, b) == 0;
+ }
+
+ // The name of the comparator. Used to check for comparator
+ // mismatches (i.e., a DB created with one comparator is
+ // accessed using a different comparator).
+ //
+ // The client of this package should switch to a new name whenever
+ // the comparator implementation changes in a way that will cause
+ // the relative ordering of any two keys to change.
+ //
+ // Names starting with "rocksdb." are reserved and should not be used
+ // by any clients of this package.
+ virtual const char* Name() const = 0;
+
+ // Advanced functions: these are used to reduce the space requirements
+ // for internal data structures like index blocks.
+
+ // If *start < limit, changes *start to a short string in [start,limit).
+ // Simple comparator implementations may return with *start unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortestSeparator(std::string* start,
+ const Slice& limit) const = 0;
+
+ // Changes *key to a short string >= *key.
+ // Simple comparator implementations may return with *key unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortSuccessor(std::string* key) const = 0;
+
+ // If it is a wrapped comparator, may return the root one.
+ // Returns itself if it is not wrapped (the default implementation).
+ virtual const Comparator* GetRootComparator() const { return this; }
+
+ // Given two keys, determine if t is the successor of s. The default implementation returns false.
+ virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
+ const Slice& /*t*/) const {
+ return false;
+ }
+
+ // Return true if two keys with different byte sequences can be regarded
+ // as equal by this comparator. The default implementation returns true.
+ // The major use case is to determine if DataBlockHashIndex is compatible
+ // with the customized comparator.
+ virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+
+ inline size_t timestamp_size() const { return timestamp_size_; } // value supplied at construction
+
+ virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+ return Compare(a, b); // default: identical to Compare()
+ }
+
+ virtual int CompareTimestamp(const Slice& /*ts1*/,
+ const Slice& /*ts2*/) const {
+ return 0; // default: all timestamps compare equal
+ }
+
+ private:
+ size_t timestamp_size_; // returned by timestamp_size(); presumably a byte count - confirm
+};
+
+// Returns a builtin comparator that orders keys by lexicographic byte-wise
+// comparison. The returned object remains the property of this module:
+// callers must not delete it.
+extern const Comparator* BytewiseComparator();
+
+// Returns a builtin comparator that uses reverse lexicographic byte-wise
+// ordering. NOTE(review): ownership is not stated here; presumably must not be deleted either - confirm.
+extern const Comparator* ReverseBytewiseComparator();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
new file mode 100644
index 000000000..4fc6b7940
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ConcurrentTaskLimiter { // abstract interface: every operation below is pure virtual
+ public:
+ virtual ~ConcurrentTaskLimiter() {}
+
+ // Returns a name that identifies this concurrent task limiter.
+ virtual const std::string& GetName() const = 0;
+
+ // Sets the maximum number of concurrent (outstanding) tasks.
+ // limit = 0 means no new task is allowed.
+ // limit < 0 means no limitation.
+ virtual void SetMaxOutstandingTask(int32_t limit) = 0;
+
+ // Resets the maximum number of concurrent tasks to unlimited.
+ virtual void ResetMaxOutstandingTask() = 0;
+
+ // Returns the current outstanding task count.
+ virtual int32_t GetOutstandingTask() const = 0;
+};
+
+// Create a ConcurrentTaskLimiter that can be shared with multiple CFs
+// across RocksDB instances to control concurrent tasks.
+// NOTE(review): returns a raw pointer; ownership is not stated here - confirm.
+// @param name: Name of the limiter.
+// @param limit: max concurrent tasks.
+// limit = 0 means no new task allowed.
+// limit < 0 means no limitation.
+extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+ int32_t limit);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h
new file mode 100644
index 000000000..442303d94
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/convenience.h
@@ -0,0 +1,351 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+// The following set of functions provide a way to construct RocksDB Options
+// from a string or a string-to-string map. Here are the general rules for
+// setting option values from strings, by type. Some RocksDB types are also
+// supported in these APIs. Please refer to the comment of the function itself
+// to find more information about how to config those RocksDB types.
+//
+// * Strings:
+// Strings will be used as values directly without any truncating or
+// trimming.
+//
+// * Booleans:
+// - "true" or "1" => true
+// - "false" or "0" => false.
+// [Example]:
+// - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or
+// - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString.
+//
+// * Integers:
+// Integers are converted directly from string, in addition to the following
+// units that we support:
+// - 'k' or 'K' => 2^10
+// - 'm' or 'M' => 2^20
+// - 'g' or 'G' => 2^30
+// - 't' or 'T' => 2^40 // only for unsigned int with sufficient bits.
+// [Example]:
+// - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or
+// - "arena_block_size=19G" in GetColumnFamilyOptionsFromString.
+//
+// * Doubles / Floating Points:
+// Doubles / Floating Points are converted directly from string. Note that
+// currently we do not support units.
+// [Example]:
+// - {"hard_rate_limit", "2.1"} in GetColumnFamilyOptionsFromMap, or
+// - "hard_rate_limit=2.1" in GetColumnFamilyOptionsFromString.
+// * Array / Vectors:
+// An array is specified by a list of values, where ':' is used as
+// the delimiter to separate each value.
+// [Example]:
+// - {"compression_per_level", "kNoCompression:kSnappyCompression"}
+// in GetColumnFamilyOptionsFromMap, or
+// - "compression_per_level=kNoCompression:kSnappyCompression" in
+// GetColumnFamilyOptionsFromString.
+// * Enums:
+// The valid values of each enum are identical to the names of its constants.
+// [Example]:
+// - CompressionType: valid values are "kNoCompression",
+// "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ...
+// - CompactionStyle: valid values are "kCompactionStyleLevel",
+// "kCompactionStyleUniversal", "kCompactionStyleFIFO", and
+// "kCompactionStyleNone".
+//
+
+// Take a default ColumnFamilyOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// ColumnFamilyOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in ColumnFamilyOptions:
+//
+// * table_factory:
+// table_factory can be configured using our custom nested-option syntax.
+//
+// {option_a=value_a; option_b=value_b; option_c=value_c; ... }
+//
+// A nested option is enclosed by two curly braces, within which there are
+// multiple option assignments. Each assignment is of the form
+// "variable_name=value;".
+//
+// Currently we support the following types of TableFactory:
+// - BlockBasedTableFactory:
+// Use name "block_based_table_factory" to initialize table_factory with
+// BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be
+// configured using the nested-option syntax.
+// [Example]:
+// * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"}
+// is equivalent to assigning table_factory with a BlockBasedTableFactory
+// that has 1M LRU block-cache with block size equals to 4k:
+// ColumnFamilyOptions cf_opt;
+// BlockBasedTableOptions blk_opt;
+// blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024);
+// blk_opt.block_size = 4 * 1024;
+// cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt));
+// - PlainTableFactory:
+// Use name "plain_table_factory" to initialize table_factory with
+// PlainTableFactory. Its PlainTableFactoryOptions can be configured using
+// the nested-option syntax.
+// [Example]:
+// * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"}
+//
+// * memtable_factory:
+// Use "memtable" to config memtable_factory. Here are the supported
+// memtable factories:
+// - SkipList:
+// Pass "skip_list:<lookahead>" to config memtable to use SkipList,
+// or simply "skip_list" to use the default SkipList.
+// [Example]:
+// * {"memtable", "skip_list:5"} is equivalent to setting
+// memtable to SkipListFactory(5).
+// - PrefixHash:
+// Pass "prefix_hash:<hash_bucket_count>" to config memtable
+// to use PrefixHash, or simply "prefix_hash" to use the default
+// PrefixHash.
+// [Example]:
+// * {"memtable", "prefix_hash:1000"} is equivalent to setting
+// memtable to NewHashSkipListRepFactory(1000).
+// - HashLinkedList:
+// Pass "hash_linkedlist:<hash_bucket_count>" to config memtable
+// to use HashLinkedList, or simply "hash_linkedlist" to use the default
+// HashLinkedList.
+// [Example]:
+// * {"memtable", "hash_linkedlist:1000"} is equivalent to
+// setting memtable to NewHashLinkListRepFactory(1000).
+// - VectorRepFactory:
+// Pass "vector:<count>" to config memtable to use VectorRepFactory,
+// or simply "vector" to use the default Vector memtable.
+// [Example]:
+// * {"memtable", "vector:1024"} is equivalent to setting memtable
+// to VectorRepFactory(1024).
+// - HashCuckooRepFactory:
+// Pass "cuckoo:<write_buffer_size>" to use HashCuckooRepFactory with the
+// specified write buffer size, or simply "cuckoo" to use the default
+// HashCuckooRepFactory.
+// [Example]:
+// * {"memtable", "cuckoo:1024"} is equivalent to setting memtable
+// to NewHashCuckooRepFactory(1024).
+//
+// * compression_opts:
+// Use "compression_opts" to config compression_opts. The value format
+// is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>".
+// [Example]:
+// * {"compression_opts", "4:5:6:7"} is equivalent to setting:
+// ColumnFamilyOptions cf_opt;
+// cf_opt.compression_opts.window_bits = 4;
+// cf_opt.compression_opts.level = 5;
+// cf_opt.compression_opts.strategy = 6;
+// cf_opt.compression_opts.max_dict_bytes = 7;
+//
+// @param base_options the options that "new_options" starts from.
+// @param opts_map a map from option name to option value describing how
+// "new_options" should differ from "base_options".
+// @param new_options output: "base_options" with the changes specified in
+// "opts_map" applied.
+// @param input_strings_escaped when true, each escaped character (prefixed
+// by '\') in the values of opts_map is converted back to its raw form
+// before the value is assigned to the associated option. Defaults to false.
+// @param ignore_unknown_options when true, unknown options are ignored
+// instead of resulting in an unknown-option error. Defaults to false.
+// @return Status::OK() on success. Otherwise a non-ok status describing the
+// error is returned, and "new_options" is set to "base_options".
+Status GetColumnFamilyOptionsFromMap(
+ const ColumnFamilyOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ ColumnFamilyOptions* new_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
+
+// Take a default DBOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// DBOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in DBOptions:
+//
+// * rate_limiter_bytes_per_sec:
+// RateLimiter can be configured directly by specifying its bytes_per_sec.
+// [Example]:
+// - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to
+// passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec.
+//
+// @param base_options the options that "new_options" starts from.
+// @param opts_map a map from option name to option value describing how
+// "new_options" should differ from "base_options".
+// @param new_options output: "base_options" with the changes specified in
+// "opts_map" applied.
+// @param input_strings_escaped when true, each escaped character (prefixed
+// by '\') in the values of opts_map is converted back to its raw form
+// before the value is assigned to the associated option. Defaults to false.
+// @param ignore_unknown_options when true, unknown options are ignored
+// instead of resulting in an unknown-option error. Defaults to false.
+// @return Status::OK() on success. Otherwise a non-ok status describing the
+// error is returned, and "new_options" is set to "base_options".
+Status GetDBOptionsFromMap(
+ const DBOptions& base_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ DBOptions* new_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
+
+// Take a default BlockBasedTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// BlockBasedTableOptions "new_table_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+// We currently only support the following FilterPolicy in the convenience
+// functions:
+// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+// to specify BloomFilter. The above string is equivalent to calling
+// NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+// [Example]:
+// - Pass {"filter_policy", "bloomfilter:4:true"} in
+// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+// per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+// We currently only support LRU cache in the GetOptions API. The LRU
+// cache can be set by directly specifying its size.
+// [Example]:
+// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+// equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// @param table_options the options that "new_table_options" starts from.
+// @param opts_map a map from option name to option value describing how
+// "new_table_options" should differ from "table_options".
+// @param new_table_options output: "table_options" with the changes
+// specified in "opts_map" applied.
+// @param input_strings_escaped when true, each escaped character (prefixed
+// by '\') in the values of opts_map is converted back to its raw form
+// before the value is assigned to the associated option. Defaults to false.
+// @param ignore_unknown_options when true, unknown options are ignored
+// instead of resulting in an unknown-option error. Defaults to false.
+// @return Status::OK() on success. Otherwise a non-ok status describing the
+// error is returned, and "new_table_options" is set to
+// "table_options".
+Status GetBlockBasedTableOptionsFromMap(
+ const BlockBasedTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ BlockBasedTableOptions* new_table_options,
+ bool input_strings_escaped = false, bool ignore_unknown_options = false);
+
+// Take a default PlainTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// PlainTableOptions "new_table_options".
+//
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+// "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+// with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be further converted
+// back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_table_options" will be set to
+// "table_options".
+Status GetPlainTableOptionsFromMap(
+ const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options, bool input_strings_escaped = false,
+ bool ignore_unknown_options = false);
+
+// Take a string representation of option names and values, apply them into the
+// base_options, and return the new options as a result. The string has the
+// following format:
+// "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+// "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+// "max_write_buffer_number=2"
+Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options,
+ const std::string& opts_str,
+ ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(const DBOptions& base_options,
+ const std::string& opts_str,
+ DBOptions* new_options);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+ const DBOptions& db_options,
+ const std::string& delimiter = "; ");
+
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+ const ColumnFamilyOptions& cf_options,
+ const std::string& delimiter = "; ");
+
+Status GetStringFromCompressionType(std::string* compression_str,
+ CompressionType compression_type);
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+Status GetBlockBasedTableOptionsFromString(
+ const BlockBasedTableOptions& table_options, const std::string& opts_str,
+ BlockBasedTableOptions* new_table_options);
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options);
+
+Status GetMemTableRepFactoryFromString(
+ const std::string& opts_str,
+ std::unique_ptr<MemTableRepFactory>* new_mem_factory);
+
+Status GetOptionsFromString(const Options& base_options,
+ const std::string& opts_str, Options* new_options);
+
+Status StringToMap(const std::string& opts_str,
+ std::unordered_map<std::string, std::string>* opts_map);
+
+// Request stopping background work, if wait is true wait until it's done
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+
+// Delete files which are entirely in the given range
+// Could leave some keys in the range which are in files which are not
+// entirely in the range. Also leaves L0 files regardless of whether they're
+// in the range.
+// Snapshots before the delete might not see the data in the given range.
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end = true);
+
+// Delete files in multiple ranges at once.
+// Deleting files in a lot of ranges one at a time can be slow; use this API
+// for better performance in that case.
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+// Verify the checksum of file
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path);
+
+// Verify the checksum of file, using the caller-supplied read_options
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path);
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
new file mode 100644
index 000000000..3108003f1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -0,0 +1,1525 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+// ROCKSDB_DEPRECATED_FUNC expands to the compiler-specific spelling of the
+// "deprecated" function attribute.
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif defined(_WIN32)
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#else
+// Unrecognized toolchain: expand to nothing so that declarations using the
+// macro still compile.
+#define ROCKSDB_DEPRECATED_FUNC
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Options;
+struct DBOptions;
+struct ColumnFamilyOptions;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+struct CompactionOptions;
+struct CompactRangeOptions;
+struct TableProperties;
+struct ExternalSstFileInfo;
+class WriteBatch;
+class Env;
+class EventListener;
+class StatsHistoryIterator;
+class TraceWriter;
+#ifdef ROCKSDB_LITE
+class CompactionJobInfo;
+#endif
+class FileSystem;
+
+extern const std::string kDefaultColumnFamilyName;
+extern const std::string kPersistentStatsColumnFamilyName;
+// Pairs a column family name with the ColumnFamilyOptions to use for it.
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+
+  // Builds a descriptor from an explicit name and set of options.
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+
+  // Uses kDefaultColumnFamilyName together with value-initialized options.
+  ColumnFamilyDescriptor() : name(kDefaultColumnFamilyName), options() {}
+};
+
+// Abstract interface for a handle to a column family.
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() = default;
+
+  // Name of the column family associated with this handle.
+  virtual const std::string& GetName() const = 0;
+
+  // ID of the column family associated with this handle.
+  virtual uint32_t GetID() const = 0;
+
+  // Stores the up-to-date descriptor of the associated column family in
+  // "*desc". Because the information must be current, the call might
+  // internally lock and release the DB mutex to read the latest CF options.
+  // Pointer-typed options in the result must not be referenced for longer
+  // than the original options exist.
+  //
+  // Note that this function is not supported in RocksDBLite.
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
+
+  // Comparator of the column family associated with this handle.
+  virtual const Comparator* GetComparator() const = 0;
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+// A range of keys, held as two Slice endpoints.
+struct Range {
+  Slice start;
+  Slice limit;
+
+  // Both endpoints are default-constructed.
+  Range() = default;
+
+  // Copies "s" into start and "l" into limit.
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+// A range of keys whose endpoints are held as pointers to Slice.
+struct RangePtr {
+  const Slice* start = nullptr;
+  const Slice* limit = nullptr;
+
+  // Both endpoints are nullptr.
+  RangePtr() = default;
+
+  // Stores the given endpoint pointers as-is.
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// Bundles a column family handle, a list of external file names and the
+// IngestExternalFileOptions that go with them.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family{nullptr};
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+};
+
+// Options accepted by DB::GetMergeOperands().
+struct GetMergeOperandsOptions {
+  int expected_max_number_of_operands{0};
+};
+
+// A collection of table properties objects, where
+// key: is the table's file name.
+// value: the table properties object of the given table.
+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+ TablePropertiesCollection;
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+ // Open the database with the specified "name".
+ // Stores a pointer to a heap-allocated database in *dbptr and returns
+ // OK on success.
+ // Stores nullptr in *dbptr and returns a non-OK status on error.
+ // Caller should delete *dbptr when it is no longer needed.
+ static Status Open(const Options& options, const std::string& name,
+ DB** dbptr);
+
+ // Open the database for read only. All DB interfaces
+ // that modify data, like put/delete, will return error.
+ // If the db is opened in read only mode, then no compactions
+ // will happen.
+ //
+ // Not supported in ROCKSDB_LITE, in which case the function will
+ // return Status::NotSupported.
+ static Status OpenForReadOnly(const Options& options, const std::string& name,
+ DB** dbptr,
+ bool error_if_log_file_exist = false);
+
+ // Open the database for read only with column families. When opening DB with
+ // read only, you can specify only a subset of column families in the
+ // database that should be opened. However, you always need to specify default
+ // column family. The default column family name is 'default' and it's stored
+ // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+ //
+ // Not supported in ROCKSDB_LITE, in which case the function will
+ // return Status::NotSupported.
+ static Status OpenForReadOnly(
+ const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_log_file_exist = false);
+
+ // The following OpenAsSecondary functions create a secondary instance that
+ // can dynamically tail the MANIFEST of a primary that must have already been
+ // created. User can call TryCatchUpWithPrimary to make the secondary
+ // instance catch up with primary (WAL tailing is NOT supported now) whenever
+ // the user feels necessary. Column families created by the primary after the
+ // secondary instance starts are currently ignored by the secondary instance.
+ // Column families opened by secondary and dropped by the primary will be
+ // dropped by secondary as well. However the user of the secondary instance
+ // can still access the data of such dropped column family as long as they
+ // do not destroy the corresponding column family handle.
+ // WAL tailing is not supported at present, but will arrive soon.
+ //
+ // The options argument specifies the options to open the secondary instance.
+ // The name argument specifies the name of the primary db that you have used
+ // to open the primary instance.
+ // The secondary_path argument points to a directory where the secondary
+ // instance stores its info log.
+ // The dbptr is an out-arg corresponding to the opened secondary instance.
+ // The pointer points to a heap-allocated database, and the user should
+ // delete it after use.
+ // Open DB as secondary instance with only the default column family.
+ // Return OK on success, non-OK on failures.
+ static Status OpenAsSecondary(const Options& options, const std::string& name,
+ const std::string& secondary_path, DB** dbptr);
+
+ // Open DB as secondary instance with column families. You can open a subset
+ // of column families in secondary mode.
+ // The db_options specify the database specific options.
+ // The name argument specifies the name of the primary db that you have used
+ // to open the primary instance.
+ // The secondary_path argument points to a directory where the secondary
+ // instance stores its info log.
+ // The column_families argument specifies a list of column families to open.
+ // If any of the column families does not exist, the function returns non-OK
+ // status.
+ // The handles is an out-arg corresponding to the opened database column
+ // family handles.
+ // The dbptr is an out-arg corresponding to the opened secondary instance.
+ // The pointer points to a heap-allocated database, and the caller should
+ // delete it after use. Before deleting the dbptr, the user should also
+ // delete the pointers stored in handles vector.
+ // Return OK on success, non-OK on failures.
+ static Status OpenAsSecondary(
+ const DBOptions& db_options, const std::string& name,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+ // Open DB with column families.
+ // db_options specify database specific options
+ // column_families is the vector of all column families in the database,
+ // containing column family name and options. You need to open ALL column
+ // families in the database. To get the list of column families, you can use
+ // ListColumnFamilies(). Also, you can open only a subset of column families
+ // for read-only access.
+ // The default column family name is 'default' and it's stored
+ // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+ // If everything is OK, handles will on return be the same size
+ // as column_families --- handles[i] will be a handle that you
+ // will use to operate on column family column_family[i].
+ // Before deleting the DB, you have to close all column families by calling
+ // DestroyColumnFamilyHandle() on each of the handles.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+ // Default implementation: reports Status::NotSupported().
+ virtual Status Resume() {
+   return Status::NotSupported();
+ }
+
+ // Close the DB by releasing resources, closing files etc. This should be
+ // called before calling the destructor so that the caller can get back a
+ // status in case there are any errors. This will not fsync the WAL files.
+ // If syncing is required, the caller must first call SyncWAL(), or Write()
+ // using an empty write batch with WriteOptions.sync=true.
+ // Regardless of the return status, the DB must be freed.
+ // If the return status is Aborted(), closing fails because there is
+ // unreleased snapshot in the system. In this case, users can release
+ // the unreleased snapshots and try again and expect it to succeed. For
+ // other status, recalling Close() will be no-op.
+ // If the return status is NotSupported(), then the DB implementation does
+ // cleanup in the destructor
+ // Default implementation: reports Status::NotSupported().
+ virtual Status Close() {
+   return Status::NotSupported();
+ }
+
+ // ListColumnFamilies will open the DB specified by argument name
+ // and return the list of all column families in that DB
+ // through column_families argument. The ordering of
+ // column families in column_families is unspecified.
+ static Status ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families);
+
+ DB() {}
+ // No copying allowed
+ DB(const DB&) = delete;
+ void operator=(const DB&) = delete;
+
+ virtual ~DB();
+
+ // Create a column_family and return the handle of column family
+ // through the argument handle.
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle);
+
+ // Bulk create column families with the same column family options.
+ // Return the handles of the column families through the argument handles.
+ // In case of error, the request may succeed partially, and handles will
+ // contain column family handles that it managed to create, and have size
+ // equal to the number of created column families.
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles);
+
+ // Bulk create column families.
+ // Return the handles of the column families through the argument handles.
+ // In case of error, the request may succeed partially, and handles will
+ // contain column family handles that it managed to create, and have size
+ // equal to the number of created column families.
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles);
+
+ // Drop a column family specified by column_family handle. This call
+ // only records a drop record in the manifest and prevents the column
+ // family from flushing and compacting.
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+ // Bulk drop column families. This call only records drop records in the
+ // manifest and prevents the column families from flushing and compacting.
+ // In case of error, the request may succeed partially. User may call
+ // ListColumnFamilies to check the result.
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families);
+
+ // Close a column family specified by column_family handle and destroy
+ // the column family handle specified to avoid double deletion. This call
+ // deletes the column family handle by default. Use this method to
+ // close column family instead of deleting column family handle directly
+ virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
+
+ // Set the database entry for "key" to "value".
+ // If "key" already exists, it will be overwritten.
+ // Returns OK on success, and a non-OK status on error.
+ // Note: consider setting options.sync = true.
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ // Convenience overload: forwards to the column-family Put() using
+ // DefaultColumnFamily().
+ virtual Status Put(const WriteOptions& options, const Slice& key,
+                    const Slice& value) {
+   auto default_cf = DefaultColumnFamily();
+   return Put(options, default_cf, key, value);
+ }
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = true.
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ // Convenience overload: forwards to the column-family Delete() using
+ // DefaultColumnFamily().
+ virtual Status Delete(const WriteOptions& options, const Slice& key) {
+   auto default_cf = DefaultColumnFamily();
+   return Delete(options, default_cf, key);
+ }
+
+ // Remove the database entry for "key". Requires that the key exists
+ // and was not overwritten. Returns OK on success, and a non-OK status
+ // on error. It is not an error if "key" did not exist in the database.
+ //
+ // If a key is overwritten (by calling Put() multiple times), then the result
+ // of calling SingleDelete() on this key is undefined. SingleDelete() only
+ // behaves correctly if there has been only one Put() for this key since the
+ // previous call to SingleDelete() for this key.
+ //
+ // This feature is currently an experimental performance optimization
+ // for a very specific workload. It is up to the caller to ensure that
+ // SingleDelete is only used for a key that is not deleted using Delete() or
+ // written using Merge(). Mixing SingleDelete operations with Deletes and
+ // Merges can result in undefined behavior.
+ //
+ // Note: consider setting options.sync = true.
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ // Convenience overload: forwards to the column-family SingleDelete() using
+ // DefaultColumnFamily().
+ virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+   auto default_cf = DefaultColumnFamily();
+   return SingleDelete(options, default_cf, key);
+ }
+
+ // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
+ // including "begin_key" and excluding "end_key". Returns OK on success, and
+ // a non-OK status on error. It is not an error if no keys exist in the range
+ // ["begin_key", "end_key").
+ //
+ // This feature is now usable in production, with the following caveats:
+ // 1) Accumulating many range tombstones in the memtable will degrade read
+ // performance; this can be avoided by manually flushing occasionally.
+ // 2) Limiting the maximum number of open files in the presence of range
+ // tombstones can degrade read performance. To avoid this problem, set
+ // max_open_files to -1 whenever possible.
+ virtual Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key);
+
+ // Merge the database entry for "key" with "value". Returns OK on success,
+ // and a non-OK status on error. The semantics of this operation is
+ // determined by the user provided merge_operator when opening DB.
+ // Note: consider setting options.sync = true.
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ // Convenience overload: forwards to the column-family Merge() using
+ // DefaultColumnFamily().
+ virtual Status Merge(const WriteOptions& options, const Slice& key,
+                      const Slice& value) {
+   auto default_cf = DefaultColumnFamily();
+   return Merge(options, default_cf, key, value);
+ }
+
+ // Apply the specified updates to the database.
+ // If `updates` contains no update, WAL will still be synced if
+ // options.sync=true.
+ // Returns OK on success, non-OK on failure.
+ // Note: consider setting options.sync = true.
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+ // If the database contains an entry for "key" store the
+ // corresponding value in *value and return OK.
+ //
+ // If there is no entry for "key" leave *value unchanged and return
+ // a status for which Status::IsNotFound() returns true.
+ //
+ // May return some other Status on an error.
+ // Wrapper around the PinnableSlice overload of Get() that delivers the
+ // result through a std::string. "value" must not be null.
+ virtual inline Status Get(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value) {
+   assert(value != nullptr);
+   PinnableSlice result_slice(value);
+   assert(!result_slice.IsPinned());
+   Status status = Get(options, column_family, key, &result_slice);
+   // Only a successful, pinned result needs copying out; otherwise value is
+   // already assigned.
+   const bool needs_copy = status.ok() && result_slice.IsPinned();
+   if (needs_copy) {
+     value->assign(result_slice.data(), result_slice.size());
+   }
+   return status;
+ }
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) = 0;
+ // Convenience overload: forwards to the column-family Get() using
+ // DefaultColumnFamily().
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+                    std::string* value) {
+   auto default_cf = DefaultColumnFamily();
+   return Get(options, default_cf, key, value);
+ }
+
+ // Returns all the merge operands corresponding to the key. If the
+ // number of merge operands in DB is greater than
+ // merge_operands_options.expected_max_number_of_operands
+ // no merge operands are returned and status is Incomplete. Merge operands
+ // returned are in the order of insertion.
+ // merge_operands- Points to an array of at-least
+ // merge_operands_options.expected_max_number_of_operands and the
+ // caller is responsible for allocating it. If the status
+ // returned is Incomplete then number_of_operands will contain
+ // the total number of merge operands found in DB for key.
+ virtual Status GetMergeOperands(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) = 0;
+
+ // If keys[i] does not exist in the database, then the i'th returned
+ // status will be one for which Status::IsNotFound() is true, and
+ // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+ // the i'th returned status will have Status::ok() true, and (*values)[i]
+ // will store the value associated with keys[i].
+ //
+ // (*values) will always be resized to be the same size as (keys).
+ // Similarly, the number of returned statuses will be the number of keys.
+ // Note: keys will not be "de-duplicated". Duplicate keys will return
+ // duplicate values in order.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+ // Convenience overload: looks every key up via DefaultColumnFamily().
+ virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                      const std::vector<Slice>& keys,
+                                      std::vector<std::string>* values) {
+   // One handle per key, all referring to the default column family.
+   const std::vector<ColumnFamilyHandle*> default_cfs(keys.size(),
+                                                      DefaultColumnFamily());
+   return MultiGet(options, default_cfs, keys, values);
+ }
+
+ // Overloaded MultiGet API that improves performance by batching operations
+ // in the read path for greater efficiency. Currently, only the block based
+ // table format with full filters are supported. Other table formats such
+ // as plain table, block based table with block based filters and
+ // partitioned indexes will still work, but will not get any performance
+ // benefits.
+ // Parameters -
+ // options - ReadOptions
+ // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+ // passed to the API are restricted to a single column family
+ // num_keys - Number of keys to lookup
+ // keys - Pointer to C style array of key Slices with num_keys elements
+ // values - Pointer to C style array of PinnableSlices with num_keys elements
+ // statuses - Pointer to C style array of Status with num_keys elements
+ // sorted_input - If true, it means the input keys are already sorted by key
+ // order, so the MultiGet() API doesn't have to sort them
+ // again. If false, the keys will be copied and sorted
+ // internally by the API - the input array will not be
+ // modified
+ // Default implementation of the single-column-family array MultiGet():
+ // forwards to the std::vector based MultiGet() and copies the resulting
+ // statuses and values into the caller's arrays. The sorted_input argument
+ // is ignored by this default implementation.
+ virtual void MultiGet(const ReadOptions& options,
+                       ColumnFamilyHandle* column_family,
+                       const size_t num_keys, const Slice* keys,
+                       PinnableSlice* values, Status* statuses,
+                       const bool /*sorted_input*/ = false) {
+   // Build both argument vectors at their final size instead of growing
+   // them one element at a time.
+   const std::vector<ColumnFamilyHandle*> cf(num_keys, column_family);
+   const std::vector<Slice> user_keys(keys, keys + num_keys);
+   std::vector<std::string> vals;
+
+   const std::vector<Status> status = MultiGet(options, cf, user_keys, &vals);
+   std::copy(status.begin(), status.end(), statuses);
+   // Copy each returned string into the matching caller-provided slice.
+   for (auto& value : vals) {
+     values->PinSelf(value);
+     values++;
+   }
+ }
+
+ // Overloaded MultiGet API that improves performance by batching operations
+ // in the read path for greater efficiency. Currently, only the block based
+ // table format with full filters are supported. Other table formats such
+ // as plain table, block based table with block based filters and
+ // partitioned indexes will still work, but will not get any performance
+ // benefits.
+ // Parameters -
+ // options - ReadOptions
+ // num_keys - Number of keys to lookup
+ // column_families - Pointer to C style array of ColumnFamilyHandle* with
+ //                   num_keys elements; keys[i] is looked up in
+ //                   column_families[i]
+ // keys - Pointer to C style array of key Slices with num_keys elements
+ // values - Pointer to C style array of PinnableSlices with num_keys elements
+ // statuses - Pointer to C style array of Status with num_keys elements
+ // sorted_input - If true, it means the input keys are already sorted by key
+ //                order, so the MultiGet() API doesn't have to sort them
+ //                again. If false, the keys will be copied and sorted
+ //                internally by the API - the input array will not be
+ //                modified. This default implementation ignores the flag.
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                       ColumnFamilyHandle** column_families, const Slice* keys,
+                       PinnableSlice* values, Status* statuses,
+                       const bool /*sorted_input*/ = false) {
+   // Build both argument vectors at their final size instead of growing
+   // them one element at a time.
+   const std::vector<ColumnFamilyHandle*> cf(column_families,
+                                             column_families + num_keys);
+   const std::vector<Slice> user_keys(keys, keys + num_keys);
+   std::vector<std::string> vals;
+
+   const std::vector<Status> status = MultiGet(options, cf, user_keys, &vals);
+   std::copy(status.begin(), status.end(), statuses);
+   // Copy each returned string into the matching caller-provided slice.
+   for (auto& value : vals) {
+     values->PinSelf(value);
+     values++;
+   }
+ }
+
+ // If the key definitely does not exist in the database, then this method
+ // returns false, else true. If the caller wants to obtain value when the key
+ // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+ // will be true on return if value has been set properly.
+ // This check is potentially lighter-weight than invoking DB::Get(). One way
+ // to make this lighter weight is to avoid doing any IOs.
+ // Default implementation here returns true and sets 'value_found' to false
+ virtual bool KeyMayExist(const ReadOptions& /*options*/,
+                          ColumnFamilyHandle* /*column_family*/,
+                          const Slice& /*key*/, std::string* /*value*/,
+                          bool* value_found = nullptr) {
+   // This default never sets a value, so say so when the caller asked.
+   if (value_found) {
+     *value_found = false;
+   }
+   // Always answers "may exist".
+   return true;
+ }
+ // Convenience overload: forwards to the column-family KeyMayExist() using
+ // DefaultColumnFamily().
+ virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                          std::string* value, bool* value_found = nullptr) {
+   auto default_cf = DefaultColumnFamily();
+   return KeyMayExist(options, default_cf, key, value, value_found);
+ }
+
+ // Return a heap-allocated iterator over the contents of the database.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ //
+ // Caller should delete the iterator when it is no longer needed.
+ // The returned iterator should be deleted before this db is deleted.
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+ // Convenience overload: forwards to the column-family NewIterator() using
+ // DefaultColumnFamily().
+ virtual Iterator* NewIterator(const ReadOptions& options) {
+   auto default_cf = DefaultColumnFamily();
+   return NewIterator(options, default_cf);
+ }
+ // Returns iterators from a consistent database state across multiple
+ // column families. Iterators are heap allocated and need to be deleted
+ // before the db is deleted
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) = 0;
+
+ // Return a handle to the current DB state. Iterators created with
+ // this handle will all observe a stable snapshot of the current DB
+ // state. The caller must call ReleaseSnapshot(result) when the
+ // snapshot is no longer needed.
+ //
+ // nullptr will be returned if the DB fails to take a snapshot or does
+ // not support snapshot.
+ virtual const Snapshot* GetSnapshot() = 0;
+
+ // Release a previously acquired snapshot. The caller must not
+ // use "snapshot" after this call.
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+#ifndef ROCKSDB_LITE
+ // Contains all valid property arguments for GetProperty().
+ //
+ // NOTE: Property names cannot end in numbers since those are interpreted as
+ // arguments, e.g., see kNumFilesAtLevelPrefix.
+ struct Properties {
+ // "rocksdb.num-files-at-level<N>" - returns string containing the number
+ // of files at level <N>, where <N> is an ASCII representation of a
+ // level number (e.g., "0").
+ static const std::string kNumFilesAtLevelPrefix;
+
+ // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
+ // compression ratio of data at level <N>, where <N> is an ASCII
+ // representation of a level number (e.g., "0"). Here, compression
+ // ratio is defined as uncompressed data size / compressed file size.
+ // Returns "-1.0" if no open files at level <N>.
+ static const std::string kCompressionRatioAtLevelPrefix;
+
+ // "rocksdb.stats" - returns a multi-line string containing the data
+ // described by kCFStats followed by the data described by kDBStats.
+ static const std::string kStats;
+
+ // "rocksdb.sstables" - returns a multi-line string summarizing current
+ // SST files.
+ static const std::string kSSTables;
+
+ // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and
+ // "rocksdb.cf-file-histogram" together. See below for description
+ // of the two.
+ static const std::string kCFStats;
+
+ // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
+ // general column family stats per-level over db's lifetime ("L<n>"),
+ // aggregated over db's lifetime ("Sum"), and aggregated over the
+ // interval since the last retrieval ("Int").
+ // It could also be used to return the stats in the format of the map.
+ // In this case there will be a pair of string to array of double for
+ // each level as well as for "Sum". "Int" stats will not be affected
+ // when this form of stats are retrieved.
+ static const std::string kCFStatsNoFileHistogram;
+
+ // "rocksdb.cf-file-histogram" - print out how many file reads to every
+ // level, as well as the histogram of latency of single requests.
+ static const std::string kCFFileHistogram;
+
+ // "rocksdb.dbstats" - returns a multi-line string with general database
+ // stats, both cumulative (over the db's lifetime) and interval (since
+ // the last retrieval of kDBStats).
+ static const std::string kDBStats;
+
+ // "rocksdb.levelstats" - returns multi-line string containing the number
+ // of files per level and total size of each level (MB).
+ static const std::string kLevelStats;
+
+ // "rocksdb.num-immutable-mem-table" - returns number of immutable
+ // memtables that have not yet been flushed.
+ static const std::string kNumImmutableMemTable;
+
+ // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
+ // memtables that have already been flushed.
+ static const std::string kNumImmutableMemTableFlushed;
+
+ // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
+ // pending; otherwise, returns 0.
+ static const std::string kMemTableFlushPending;
+
+ // "rocksdb.num-running-flushes" - returns the number of currently running
+ // flushes.
+ static const std::string kNumRunningFlushes;
+
+ // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
+ // pending; otherwise, returns 0.
+ static const std::string kCompactionPending;
+
+ // "rocksdb.num-running-compactions" - returns the number of currently
+ // running compactions.
+ static const std::string kNumRunningCompactions;
+
+ // "rocksdb.background-errors" - returns accumulated number of background
+ // errors.
+ static const std::string kBackgroundErrors;
+
+ // "rocksdb.cur-size-active-mem-table" - returns approximate size of active
+ // memtable (bytes).
+ static const std::string kCurSizeActiveMemTable;
+
+ // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
+ // and unflushed immutable memtables (bytes).
+ static const std::string kCurSizeAllMemTables;
+
+ // "rocksdb.size-all-mem-tables" - returns approximate size of active,
+ // unflushed immutable, and pinned immutable memtables (bytes).
+ static const std::string kSizeAllMemTables;
+
+ // "rocksdb.num-entries-active-mem-table" - returns total number of entries
+ // in the active memtable.
+ static const std::string kNumEntriesActiveMemTable;
+
+ // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
+ // in the unflushed immutable memtables.
+ static const std::string kNumEntriesImmMemTables;
+
+ // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
+ // entries in the active memtable.
+ static const std::string kNumDeletesActiveMemTable;
+
+ // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
+ // entries in the unflushed immutable memtables.
+ static const std::string kNumDeletesImmMemTables;
+
+ // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
+ // the active and unflushed immutable memtables and storage.
+ static const std::string kEstimateNumKeys;
+
+ // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
+ // reading SST tables, excluding memory used in block cache (e.g.,
+ // filter and index blocks).
+ static const std::string kEstimateTableReadersMem;
+
+ // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
+ // files is enabled; otherwise, returns a non-zero number.
+ static const std::string kIsFileDeletionsEnabled;
+
+ // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
+ // database.
+ static const std::string kNumSnapshots;
+
+ // "rocksdb.oldest-snapshot-time" - returns number representing unix
+ // timestamp of oldest unreleased snapshot.
+ static const std::string kOldestSnapshotTime;
+
+ // "rocksdb.oldest-snapshot-sequence" - returns number representing
+ // sequence number of oldest unreleased snapshot.
+ static const std::string kOldestSnapshotSequence;
+
+ // "rocksdb.num-live-versions" - returns number of live versions. `Version`
+ // is an internal data structure. See version_set.h for details. More
+ // live versions often mean more SST files are held from being deleted,
+ // by iterators or unfinished compactions.
+ static const std::string kNumLiveVersions;
+
+ // "rocksdb.current-super-version-number" - returns number of current LSM
+ // version. It is a uint64_t integer number, incremented after there is
+ // any change to the LSM tree. The number is not preserved after restarting
+ // the DB. After DB restart, it will start from 0 again.
+ static const std::string kCurrentSuperVersionNumber;
+
+ // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
+ // live data in bytes.
+ static const std::string kEstimateLiveDataSize;
+
+ // "rocksdb.min-log-number-to-keep" - return the minimum log number of the
+ // log files that should be kept.
+ static const std::string kMinLogNumberToKeep;
+
+ // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
+ // number for an obsolete SST to be kept. The max value of `uint64_t`
+ // will be returned if all obsolete files can be deleted.
+ static const std::string kMinObsoleteSstNumberToKeep;
+
+ // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
+ // files.
+ // WARNING: may slow down online queries if there are too many files.
+ static const std::string kTotalSstFilesSize;
+
+ // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
+ // files belonging to the latest LSM tree.
+ static const std::string kLiveSstFilesSize;
+
+ // "rocksdb.base-level" - returns number of level to which L0 data will be
+ // compacted.
+ static const std::string kBaseLevel;
+
+ // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
+ // number of bytes compaction needs to rewrite to get all levels down
+ // to under target size. Not valid for other compactions than level-
+ // based.
+ static const std::string kEstimatePendingCompactionBytes;
+
+ // "rocksdb.aggregated-table-properties" - returns a string representation
+ // of the aggregated table properties of the target column family.
+ static const std::string kAggregatedTableProperties;
+
+ // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
+ // one but only returns the aggregated table properties of the
+ // specified level "N" at the target column family.
+ static const std::string kAggregatedTablePropertiesAtLevel;
+
+ // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
+ // write rate. 0 means no delay.
+ static const std::string kActualDelayedWriteRate;
+
+ // "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
+ static const std::string kIsWriteStopped;
+
+ // "rocksdb.estimate-oldest-key-time" - returns an estimation of
+ // oldest key timestamp in the DB. Currently only available for
+ // FIFO compaction with
+ // compaction_options_fifo.allow_compaction = false.
+ static const std::string kEstimateOldestKeyTime;
+
+ // "rocksdb.block-cache-capacity" - returns block cache capacity.
+ static const std::string kBlockCacheCapacity;
+
+ // "rocksdb.block-cache-usage" - returns the memory size for the entries
+ // residing in block cache.
+ static const std::string kBlockCacheUsage;
+
+ // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
+ // entries being pinned.
+ static const std::string kBlockCachePinnedUsage;
+
+ // "rocksdb.options-statistics" - returns multi-line string
+ // of options.statistics
+ static const std::string kOptionsStatistics;
+ };
+#endif /* ROCKSDB_LITE */
+
+ // DB implementations can export properties about their state via this method.
+ // If "property" is a valid property understood by this DB implementation (see
+ // Properties struct above for valid options), fills "*value" with its current
+ // value and returns true. Otherwise, returns false.
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) = 0;
+ // Convenience overload: looks up "property" on the column family returned
+ // by DefaultColumnFamily().
+ virtual bool GetProperty(const Slice& property, std::string* value) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return GetProperty(default_cf, property, value);
+ }
+ virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) = 0;
+ // Convenience overload: queries the map-valued "property" on the column
+ // family returned by DefaultColumnFamily().
+ virtual bool GetMapProperty(const Slice& property,
+ std::map<std::string, std::string>* value) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return GetMapProperty(default_cf, property, value);
+ }
+
+ // Similar to GetProperty(), but only works for a subset of properties whose
+ // return value is an integer. Return the value by integer. Supported
+ // properties:
+ // "rocksdb.num-immutable-mem-table"
+ // "rocksdb.mem-table-flush-pending"
+ // "rocksdb.compaction-pending"
+ // "rocksdb.background-errors"
+ // "rocksdb.cur-size-active-mem-table"
+ // "rocksdb.cur-size-all-mem-tables"
+ // "rocksdb.size-all-mem-tables"
+ // "rocksdb.num-entries-active-mem-table"
+ // "rocksdb.num-entries-imm-mem-tables"
+ // "rocksdb.num-deletes-active-mem-table"
+ // "rocksdb.num-deletes-imm-mem-tables"
+ // "rocksdb.estimate-num-keys"
+ // "rocksdb.estimate-table-readers-mem"
+ // "rocksdb.is-file-deletions-enabled"
+ // "rocksdb.num-snapshots"
+ // "rocksdb.oldest-snapshot-time"
+ // "rocksdb.num-live-versions"
+ // "rocksdb.current-super-version-number"
+ // "rocksdb.estimate-live-data-size"
+ // "rocksdb.min-log-number-to-keep"
+ // "rocksdb.min-obsolete-sst-number-to-keep"
+ // "rocksdb.total-sst-files-size"
+ // "rocksdb.live-sst-files-size"
+ // "rocksdb.base-level"
+ // "rocksdb.estimate-pending-compaction-bytes"
+ // "rocksdb.num-running-compactions"
+ // "rocksdb.num-running-flushes"
+ // "rocksdb.actual-delayed-write-rate"
+ // "rocksdb.is-write-stopped"
+ // "rocksdb.estimate-oldest-key-time"
+ // "rocksdb.block-cache-capacity"
+ // "rocksdb.block-cache-usage"
+ // "rocksdb.block-cache-pinned-usage"
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) = 0;
+ // Convenience overload: reads the integer "property" from the column
+ // family returned by DefaultColumnFamily().
+ virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return GetIntProperty(default_cf, property, value);
+ }
+
+ // Reset internal stats for DB and all column families.
+ // Note this doesn't reset options.statistics as it is not owned by
+ // DB.
+ // This base-class body does no resetting; it unconditionally reports
+ // NotSupported.
+ virtual Status ResetStats() {
+ Status unsupported = Status::NotSupported("Not implemented");
+ return unsupported;
+ }
+
+ // Same as GetIntProperty(), but this one returns the aggregated int
+ // property from all column families.
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* value) = 0;
+
+ // Flags for DB::GetApproximateSizes that specify whether memtable
+ // stats should be included, or file stats approximation or both
+ // Bit flags stored in a uint8_t. INCLUDE_MEMTABLES and INCLUDE_FILES use
+ // distinct bits so the two can be OR-ed together.
+ enum SizeApproximationFlags : uint8_t {
+ NONE = 0x0, // no bits set
+ INCLUDE_MEMTABLES = 0x1, // bit 0
+ INCLUDE_FILES = 0x2 // bit 1
+ };
+
+ // For each i in [0,n-1], store in "sizes[i]", the approximate
+ // file system space used by keys in "[range[i].start .. range[i].limit)".
+ //
+ // Note that the returned sizes measure file system space usage, so
+ // if the user data compresses by a factor of ten, the returned
+ // sizes will be one-tenth the size of the corresponding user data size.
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) = 0;
+
+ // Simpler versions of the GetApproximateSizes() method above.
+ // The include_flags argument must be of type DB::SizeApproximationFlags
+ // and can not be NONE.
+ // Translates the bits of "include_flags" into a SizeApproximationOptions
+ // and forwards to the options-based overload. The Status returned by that
+ // overload is discarded because this overload returns void.
+ virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes,
+ uint8_t include_flags = INCLUDE_FILES) {
+ const bool want_memtables =
+ (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
+ const bool want_files =
+ (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
+ SizeApproximationOptions approx_opts;
+ // "include_memtabtles" is the field's spelling as used by this code.
+ approx_opts.include_memtabtles = want_memtables;
+ approx_opts.include_files = want_files;
+ GetApproximateSizes(approx_opts, column_family, range, n, sizes);
+ }
+ // Convenience overload: same as the flag-based overload taking a column
+ // family, applied to the one returned by DefaultColumnFamily().
+ virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
+ uint8_t include_flags = INCLUDE_FILES) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ GetApproximateSizes(default_cf, range, n, sizes, include_flags);
+ }
+
+ // The method is similar to GetApproximateSizes, except it
+ // returns approximate number of records in memtables.
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) = 0;
+ // Convenience overload: forwards to the column-family overload using the
+ // handle returned by DefaultColumnFamily().
+ virtual void GetApproximateMemTableStats(const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ GetApproximateMemTableStats(default_cf, range, count, size);
+ }
+
+ // Deprecated versions of GetApproximateSizes
+ ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
+ const Range* range, int n, uint64_t* sizes, bool include_memtable) {
+ // File sizes are always requested; memtables only when the legacy
+ // boolean asks for them.
+ const uint8_t include_flags = static_cast<uint8_t>(
+ include_memtable ? (SizeApproximationFlags::INCLUDE_FILES |
+ SizeApproximationFlags::INCLUDE_MEMTABLES)
+ : SizeApproximationFlags::INCLUDE_FILES);
+ GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
+ }
+ ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes(
+ ColumnFamilyHandle* column_family, const Range* range, int n,
+ uint64_t* sizes, bool include_memtable) {
+ // File sizes are always requested; memtables only when the legacy
+ // boolean asks for them.
+ const uint8_t include_flags = static_cast<uint8_t>(
+ include_memtable ? (SizeApproximationFlags::INCLUDE_FILES |
+ SizeApproximationFlags::INCLUDE_MEMTABLES)
+ : SizeApproximationFlags::INCLUDE_FILES);
+ GetApproximateSizes(column_family, range, n, sizes, include_flags);
+ }
+
+ // Compact the underlying storage for the key range [*begin,*end].
+ // The actual compaction interval might be a superset of [*begin, *end].
+ // In particular, deleted and overwritten versions are discarded,
+ // and the data is rearranged to reduce the cost of operations
+ // needed to access the data. This operation should typically only
+ // be invoked by users who understand the underlying implementation.
+ //
+ // begin==nullptr is treated as a key before all keys in the database.
+ // end==nullptr is treated as a key after all keys in the database.
+ // Therefore the following call will compact the entire database:
+ // db->CompactRange(options, nullptr, nullptr);
+ // Note that after the entire database is compacted, all data are pushed
+ // down to the last level containing any data. If the total data size after
+ // compaction is reduced, that level might not be appropriate for hosting all
+ // the files. In this case, client could set options.change_level to true, to
+ // move the files back to the minimum level capable of holding the data set
+ // or a given level (specified by non-negative options.target_level).
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) = 0;
+ // Convenience overload: compacts the range in the column family returned
+ // by DefaultColumnFamily().
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ const Slice* begin, const Slice* end) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return CompactRange(options, default_cf, begin, end);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
+ ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end,
+ bool change_level = false, int target_level = -1,
+ uint32_t target_path_id = 0) {
+ // Pack the legacy positional arguments into a CompactRangeOptions and
+ // defer to the options-based overload.
+ CompactRangeOptions legacy_opts;
+ legacy_opts.target_path_id = target_path_id;
+ legacy_opts.target_level = target_level;
+ legacy_opts.change_level = change_level;
+ return CompactRange(legacy_opts, column_family, begin, end);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
+ const Slice* begin, const Slice* end, bool change_level = false,
+ int target_level = -1, uint32_t target_path_id = 0) {
+ // Pack the legacy positional arguments into a CompactRangeOptions and
+ // defer to the options-based overload on the default column family.
+ CompactRangeOptions legacy_opts;
+ legacy_opts.target_path_id = target_path_id;
+ legacy_opts.target_level = target_level;
+ legacy_opts.change_level = change_level;
+ return CompactRange(legacy_opts, DefaultColumnFamily(), begin, end);
+ }
+
+ // This base-class body ignores both (unnamed) arguments and
+ // unconditionally reports NotSupported.
+ virtual Status SetOptions(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::unordered_map<std::string, std::string>& /*new_options*/) {
+ Status unsupported = Status::NotSupported("Not implemented");
+ return unsupported;
+ }
+ // Convenience overload: applies "new_options" to the column family
+ // returned by DefaultColumnFamily().
+ virtual Status SetOptions(
+ const std::unordered_map<std::string, std::string>& new_options) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return SetOptions(default_cf, new_options);
+ }
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& new_options) = 0;
+
+ // CompactFiles() inputs a list of files specified by file numbers and
+ // compacts them to the specified level. Note that the behavior is different
+ // from CompactRange() in that CompactFiles() performs the compaction job
+ // using the CURRENT thread.
+ //
+ // @see GetDataBaseMetaData
+ // @see GetColumnFamilyMetaData
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) = 0;
+
+ // Convenience overload: identical to the column-family overload, applied
+ // to the handle returned by DefaultColumnFamily(). All other arguments
+ // are passed through unchanged.
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return CompactFiles(compact_options, default_cf, input_file_names,
+ output_level, output_path_id, output_file_names,
+ compaction_job_info);
+ }
+
+ // This function will wait until all currently running background processes
+ // finish. After it returns, no background process will be run until
+ // ContinueBackgroundWork is called
+ virtual Status PauseBackgroundWork() = 0;
+ virtual Status ContinueBackgroundWork() = 0;
+
+ // This function will enable automatic compactions for the given column
+ // families if they were previously disabled. The function will first set the
+ // disable_auto_compactions option for each column family to 'false', after
+ // which it will schedule a flush/compaction.
+ //
+ // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
+ // does NOT schedule a flush/compaction afterwards, and only changes the
+ // parameter itself within the column family option.
+ //
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
+
+ virtual void DisableManualCompaction() = 0;
+ virtual void EnableManualCompaction() = 0;
+
+ // Number of levels used for this DB.
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+ // Convenience overload for the column family returned by
+ // DefaultColumnFamily().
+ virtual int NumberLevels() {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return NumberLevels(default_cf);
+ }
+
+ // Maximum level to which a new compacted memtable is pushed if it
+ // does not create overlap.
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+ // Convenience overload for the column family returned by
+ // DefaultColumnFamily().
+ virtual int MaxMemCompactionLevel() {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return MaxMemCompactionLevel(default_cf);
+ }
+
+ // Number of files in level-0 that would stop writes.
+ virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+ // Convenience overload for the column family returned by
+ // DefaultColumnFamily().
+ virtual int Level0StopWriteTrigger() {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return Level0StopWriteTrigger(default_cf);
+ }
+
+ // Get DB name -- the exact same name that was provided as an argument to
+ // DB::Open()
+ virtual const std::string& GetName() const = 0;
+
+ // Get Env object from the DB
+ virtual Env* GetEnv() const = 0;
+
+ virtual FileSystem* GetFileSystem() const;
+
+ // Get DB Options that we use. During the process of opening the
+ // column family, the options provided when calling DB::Open() or
+ // DB::CreateColumnFamily() will have been "sanitized" and transformed
+ // in an implementation-defined manner.
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
+ // Convenience overload: returns the options of the column family returned
+ // by DefaultColumnFamily().
+ virtual Options GetOptions() const {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return GetOptions(default_cf);
+ }
+
+ virtual DBOptions GetDBOptions() const = 0;
+
+ // Flush all mem-table data.
+ // Flush a single column family, even when atomic flush is enabled. To flush
+ // multiple column families, use Flush(options, column_families).
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+ // Convenience overload: flushes the column family returned by
+ // DefaultColumnFamily().
+ virtual Status Flush(const FlushOptions& options) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return Flush(options, default_cf);
+ }
+ // Flushes multiple column families.
+ // If atomic flush is not enabled, Flush(options, column_families) is
+ // equivalent to calling Flush(options, column_family) multiple times.
+ // If atomic flush is enabled, Flush(options, column_families) will flush all
+ // column families specified in 'column_families' up to the latest sequence
+ // number at the time when flush is requested.
+ // Note that RocksDB 5.15 and earlier may not be able to open later versions
+ // with atomic flush enabled.
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) = 0;
+
+ // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
+ // afterwards.
+ // This base-class body leaves the "sync" argument unused and
+ // unconditionally reports NotSupported.
+ virtual Status FlushWAL(bool /*sync*/) {
+ Status unsupported = Status::NotSupported("FlushWAL not implemented");
+ return unsupported;
+ }
+ // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the
+ // same as Write() with sync=true: in the latter case the changes won't be
+ // visible until the sync is done.
+ // Currently only works if allow_mmap_writes = false in Options.
+ virtual Status SyncWAL() = 0;
+
+ // Lock the WAL. Also flushes the WAL after locking.
+ // This base-class body performs no locking; it unconditionally reports
+ // NotSupported.
+ virtual Status LockWAL() {
+ Status unsupported = Status::NotSupported("LockWAL not implemented");
+ return unsupported;
+ }
+
+ // Unlock the WAL.
+ // This base-class body performs no unlocking; it unconditionally reports
+ // NotSupported.
+ virtual Status UnlockWAL() {
+ Status unsupported = Status::NotSupported("UnlockWAL not implemented");
+ return unsupported;
+ }
+
+ // The sequence number of the most recent transaction.
+ virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+ // Instructs DB to preserve deletes with sequence numbers >= passed seqnum.
+ // Has no effect if DBOptions.preserve_deletes is set to false.
+ // This function assumes that user calls this function with monotonically
+ // increasing seqnums (otherwise we can't guarantee that a particular delete
+ // hasn't been already processed); returns true if the value was successfully
+ // updated, false if user attempted to call it with seqnum <= current value.
+ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0;
+
+#ifndef ROCKSDB_LITE
+
+ // Prevent file deletions. Compactions will continue to occur,
+ // but no obsolete files will be deleted. Calling this multiple
+ // times has the same effect as calling it once.
+ virtual Status DisableFileDeletions() = 0;
+
+ // Allow compactions to delete obsolete files.
+ // If force == true, the call to EnableFileDeletions() will guarantee that
+ // file deletions are enabled after the call, even if DisableFileDeletions()
+ // was called multiple times before.
+ // If force == false, EnableFileDeletions will only enable file deletion
+ // after it's been called at least as many times as DisableFileDeletions(),
+ // enabling the two methods to be called by two threads concurrently without
+ // synchronization -- i.e., file deletions will be enabled only after both
+ // threads call EnableFileDeletions()
+ virtual Status EnableFileDeletions(bool force = true) = 0;
+
+ // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+ // Retrieve the list of all files in the database. The files are
+ // relative to the dbname and are not absolute paths. Despite being relative
+ // paths, the file names begin with "/". The valid size of the manifest file
+ // is returned in manifest_file_size. The manifest file is an ever growing
+ // file, but only the portion specified by manifest_file_size is valid for
+ // this snapshot. Setting flush_memtable to true does Flush before recording
+ // the live files. Setting flush_memtable to false is useful when we don't
+ // want to wait for flush which may have to wait for compaction to complete
+ // taking an indeterminate time.
+ //
+ // In case you have multiple column families, even if flush_memtable is true,
+ // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
+ // for new data that arrived to already-flushed column families while other
+ // column families were flushing
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) = 0;
+
+ // Retrieve the sorted list of all wal files with earliest file first
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+ // Retrieve information about the current wal file
+ //
+ // Note that the log might have rolled after this call in which case
+ // the current_log_file would not point to the current log file.
+ //
+ // Additionally, for the sake of optimization current_log_file->StartSequence
+ // would always be set to 0
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) = 0;
+
+ // Retrieves the creation time of the oldest file in the DB.
+ // This API only works if max_open_files = -1, if it is not then
+ // Status returned is Status::NotSupported()
+ // The file creation time is set using the env provided to the DB.
+ // If the DB was created from a very old release then it's possible that
+ // the SST files might not have file_creation_time property and even after
+ // moving to a newer release it's possible that some files never got compacted
+ // and may not have file_creation_time property. In both the cases
+ // file_creation_time is considered 0 which means this API will return
+ // creation_time = 0 as there wouldn't be a timestamp lower than 0.
+ virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
+
+ // Note: this API is not yet consistent with WritePrepared transactions.
+ // Sets iter to an iterator that is positioned at a write-batch containing
+ // seq_number. If the sequence number is non existent, it returns an iterator
+ // at the first available seq_no after the requested seq_no
+ // Returns Status::OK if iterator is valid
+ // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+ // use this api, else the WAL files will get
+ // cleared aggressively and the iterator might keep getting invalid before
+ // an update is read.
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) = 0;
+
+// Windows API macro interference
+#undef DeleteFile
+ // Delete the file name from the db directory and update the internal state to
+ // reflect that. Supports deletion of sst and log files only. 'name' must be
+ // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+ virtual Status DeleteFile(std::string name) = 0;
+
+ // Returns a list of all table files with their level, start key
+ // and end key
+ // This base-class body is empty: the (unnamed) "metadata" argument is left
+ // untouched unless a subclass overrides this method.
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* /*metadata*/) {}
+
+ // Obtains the meta data of the specified column family of the DB.
+ // This base-class body is empty: both (unnamed) arguments are ignored and
+ // "metadata" is left untouched unless a subclass overrides this method.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) {}
+
+ // Get the metadata of the default column family.
+ // Non-virtual convenience overload: forwards to the column-family overload
+ // with the handle returned by DefaultColumnFamily().
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ GetColumnFamilyMetaData(default_cf, metadata);
+ }
+
+ // IngestExternalFile() will load a list of external SST files (1) into the DB
+ // Two primary modes are supported:
+ // - Duplicate keys in the new files will overwrite existing keys (default)
+ // - Duplicate keys will be skipped (set ingest_behind=true)
+ // In the first mode we will try to find the lowest possible level that
+ // the file can fit in, and ingest the file into this level (2). A file that
+ // has a key range that overlaps with the memtable key range will require us
+ // to Flush the memtable first before ingesting the file.
+ // In the second mode we will always ingest in the bottom most level (see
+ // docs to IngestExternalFileOptions::ingest_behind).
+ //
+ // (1) External SST files can be created using SstFileWriter
+ // (2) We will try to ingest the files to the lowest possible level
+ // even if the file compression doesn't match the level compression
+ // (3) If IngestExternalFileOptions->ingest_behind is set to true,
+ // we always ingest at the bottommost level, which should be reserved
+ // for this purpose (see DBOptions::allow_ingest_behind flag).
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) = 0;
+
+ // Convenience overload: ingests "external_files" into the column family
+ // returned by DefaultColumnFamily().
+ virtual Status IngestExternalFile(
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) {
+ ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ return IngestExternalFile(default_cf, external_files, options);
+ }
+
+ // IngestExternalFiles() will ingest files for multiple column families, and
+ // record the result atomically to the MANIFEST.
+ // If this function returns OK, all column families' ingestion must succeed.
+ // If this function returns NOK, or the process crashes, then none of the
+ // files will be ingested into the database after recovery.
+ // Note that it is possible for application to observe a mixed state during
+ // the execution of this function. If the user performs range scan over the
+ // column families with iterators, iterator on one column family may return
+ // ingested data, while iterator on other column family returns old data.
+ // Users can use snapshot for a consistent view of data.
+ // If your db ingests multiple SST files using this API, i.e. args.size()
+ // > 1, then RocksDB 5.15 and earlier will not be able to open it.
+ //
+ // REQUIRES: each arg corresponds to a different column family: namely, for
+ // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) = 0;
+
+ // CreateColumnFamilyWithImport() will create a new column family with
+ // column_family_name and import external SST files specified in metadata into
+ // this column family.
+ // (1) External SST files can be created using SstFileWriter.
+ // (2) External SST files can be exported from a particular column family in
+ // an existing DB.
+ // Option in import_options specifies whether the external files are copied or
+ // moved (default is copy). When option specifies copy, managing files at
+ // external_file_path is caller's responsibility. When option specifies a
+ // move, the call ensures that the specified files at external_file_path are
+ // deleted on successful return and files are not modified on any error
+ // return.
+ // On error return, column family handle returned will be nullptr.
+ // ColumnFamily will be present on successful return and will not be present
+ // on error return. ColumnFamily may be present on any crash during this call.
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) = 0;
+
+ virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
+
+ virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+ // AddFile() is deprecated, please use IngestExternalFile()
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& file_path_list, bool move_file = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(column_family, file_path_list, ifo);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ const std::vector<std::string>& file_path_list, bool move_file = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo);
+ }
+
+ // AddFile() is deprecated, please use IngestExternalFile()
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ ColumnFamilyHandle* column_family, const std::string& file_path,
+ bool move_file = false, bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(column_family, {file_path}, ifo);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ const std::string& file_path, bool move_file = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo);
+ }
+
+ // Load table file with information "file_info" into "column_family"
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<ExternalSstFileInfo>& file_info_list,
+ bool move_file = false, bool skip_snapshot_check = false) {
+ std::vector<std::string> external_files;
+ for (const ExternalSstFileInfo& file_info : file_info_list) {
+ external_files.push_back(file_info.file_path);
+ }
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(column_family, external_files, ifo);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ const std::vector<ExternalSstFileInfo>& file_info_list,
+ bool move_file = false, bool skip_snapshot_check = false) {
+ std::vector<std::string> external_files;
+ for (const ExternalSstFileInfo& file_info : file_info_list) {
+ external_files.push_back(file_info.file_path);
+ }
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(DefaultColumnFamily(), external_files, ifo);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info,
+ bool move_file = false, bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(column_family, {file_info->file_path}, ifo);
+ }
+
+ ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
+ const ExternalSstFileInfo* file_info, bool move_file = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions ifo;
+ ifo.move_files = move_file;
+ ifo.snapshot_consistency = !skip_snapshot_check;
+ ifo.allow_global_seqno = false;
+ ifo.allow_blocking_flush = false;
+ return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path},
+ ifo);
+ }
+
+#endif // ROCKSDB_LITE
+
+ // Stores in `identity` the unique ID that was read from the IDENTITY file
+ // when the database was opened.
+ // Returns Status::OK if identity could be set properly
+ virtual Status GetDbIdentity(std::string& identity) const = 0;
+
+ // Returns default column family handle
+ virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+ virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) = 0;
+ virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+ }
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) = 0;
+
+ virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) {
+ return Status::NotSupported("SuggestCompactRange() is not implemented.");
+ }
+
+ virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("PromoteL0() is not implemented.");
+ }
+
+ // Trace DB operations. Use EndTrace() to stop tracing.
+ virtual Status StartTrace(const TraceOptions& /*options*/,
+ std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartTrace() is not implemented.");
+ }
+
+ virtual Status EndTrace() {
+ return Status::NotSupported("EndTrace() is not implemented.");
+ }
+
+ // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
+ virtual Status StartBlockCacheTrace(
+ const TraceOptions& /*options*/,
+ std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+ return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+ }
+
+ virtual Status EndBlockCacheTrace() {
+ return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
+ }
+#endif // ROCKSDB_LITE
+
+ // Needed for StackableDB
+ virtual DB* GetRootDB() { return this; }
+
+ // Given a window [start_time, end_time), setup a StatsHistoryIterator
+ // to access stats history. Note the start_time and end_time are epoch
+ // time measured in seconds, and end_time is an exclusive bound.
+ virtual Status GetStatsHistory(
+ uint64_t /*start_time*/, uint64_t /*end_time*/,
+ std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
+ return Status::NotSupported("GetStatsHistory() is not implemented.");
+ }
+
+#ifndef ROCKSDB_LITE
+ // Make the secondary instance catch up with the primary by tailing and
+ // replaying the MANIFEST and WAL of the primary.
+ // Column families created by the primary after the secondary instance starts
+ // will be ignored unless the secondary instance closes and restarts with the
+ // newly created column families.
+ // Column families that exist before secondary instance starts and dropped by
+ // the primary afterwards will be marked as dropped. However, as long as the
+ // secondary instance does not delete the corresponding column family
+ // handles, the data of the column family is still accessible to the
+ // secondary.
+ // TODO: we will support WAL tailing soon.
+ virtual Status TryCatchUpWithPrimary() {
+ return Status::NotSupported("Supported only by secondary instance");
+ }
+#endif // !ROCKSDB_LITE
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>());
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+//
+// With this API, we will warn and skip data associated with column families not
+// specified in column_families.
+//
+// @param column_families Descriptors for known column families
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+// @param unknown_cf_opts Options for column families encountered during the
+// repair that were not specified in column_families.
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts);
+
+// @param options These options will be used for the database and for ALL column
+// families encountered during the repair
+Status RepairDB(const std::string& dbname, const Options& options);
+
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h
new file mode 100644
index 000000000..17f4e6bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_bench_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_bench_tool(int argc, char** argv);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h
new file mode 100644
index 000000000..b7d4766a2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct DumpOptions {
+ // Database that will be dumped
+ std::string db_path;
+ // File location that will contain dump output
+ std::string dump_location;
+ // Don't include db information header in the dump
+ bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+ bool Run(const DumpOptions& dump_options,
+ ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+
+struct UndumpOptions {
+ // Database that we will load the dumped file into
+ std::string db_path;
+ // File location of the dumped file that will be loaded
+ std::string dump_location;
+ // Compact the db after loading the dumped file
+ bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+ bool Run(const UndumpOptions& undump_options,
+ ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h
new file mode 100644
index 000000000..7d3d42c9d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_stress_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_stress_tool(int argc, char** argv);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
new file mode 100644
index 000000000..056d8a1c0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -0,0 +1,1589 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
+ __attribute__((__format__(__printf__, format_param, dots_param)))
+#else
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DynamicLibrary;
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+class MemoryMappedFileBuffer;
+class Directory;
+struct DBOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+class ThreadStatusUpdater;
+struct ThreadStatus;
+
+const size_t kDefaultPageSize = 4 * 1024;
+
+// Options while opening a file to read/write
+struct EnvOptions {
+ // Construct with default Options
+ EnvOptions();
+
+ // Construct from Options
+ explicit EnvOptions(const DBOptions& options);
+
+ // If true, then use mmap to read data
+ bool use_mmap_reads = false;
+
+ // If true, then use mmap to write data
+ bool use_mmap_writes = true;
+
+ // If true, then use O_DIRECT for reading data
+ bool use_direct_reads = false;
+
+ // If true, then use O_DIRECT for writing data
+ bool use_direct_writes = false;
+
+ // If false, fallocate() calls are bypassed
+ bool allow_fallocate = true;
+
+ // If true, set the FD_CLOEXEC on open fd.
+ bool set_fd_cloexec = true;
+
+ // Allows OS to incrementally sync files to disk while they are being
+ // written, in the background. Issue one request for every bytes_per_sync
+ // written. 0 turns it off.
+ // Default: 0
+ uint64_t bytes_per_sync = 0;
+
+ // When true, guarantees the file has at most `bytes_per_sync` bytes submitted
+ // for writeback at any given time.
+ //
+ // - If `sync_file_range` is supported it achieves this by waiting for any
+ // prior `sync_file_range`s to finish before proceeding. In this way,
+ // processing (compression, etc.) can proceed uninhibited in the gap
+ // between `sync_file_range`s, and we block only when I/O falls behind.
+ // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+ // always blocks, thus preventing the interleaving of I/O and processing.
+ //
+ // Note: Enabling this option does not provide any additional persistence
+ // guarantees, as it may use `sync_file_range`, which does not write out
+ // metadata.
+ //
+ // Default: false
+ bool strict_bytes_per_sync = false;
+
+ // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+ // means that file size won't change as part of preallocation.
+ // If false, preallocation will also change the file size. This option will
+ // improve the performance in workloads where you sync the data on every
+ // write. By default, we set it to true for MANIFEST writes and false for
+ // WAL writes
+ bool fallocate_with_keep_size = true;
+
+ // See DBOptions doc
+ size_t compaction_readahead_size = 0;
+
+ // See DBOptions doc
+ size_t random_access_max_buffer_size = 0;
+
+ // See DBOptions doc
+ size_t writable_file_max_buffer_size = 1024 * 1024;
+
+ // If not nullptr, write rate limiting is enabled for flush and compaction
+ RateLimiter* rate_limiter = nullptr;
+};
+
+class Env {
+ public:
+ struct FileAttributes {
+ // File name
+ std::string name;
+
+ // Size of file in bytes
+ uint64_t size_bytes;
+ };
+
+ Env() : thread_status_updater_(nullptr) {}
+ // No copying allowed
+ Env(const Env&) = delete;
+ void operator=(const Env&) = delete;
+
+ virtual ~Env();
+
+ static const char* Type() { return "Environment"; }
+
+ // Loads the environment specified by the input value into the result
+ static Status LoadEnv(const std::string& value, Env** result);
+
+ // Loads the environment specified by the input value into the result
+ static Status LoadEnv(const std::string& value, Env** result,
+ std::shared_ptr<Env>* guard);
+
+ // Return a default environment suitable for the current operating
+ // system. Sophisticated users may wish to provide their own Env
+ // implementation instead of relying on this default environment.
+ //
+ // The result of Default() belongs to rocksdb and must never be deleted.
+ static Env* Default();
+
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewSequentialFile(const std::string& fname,
+ std::unique_ptr<SequentialFile>* result,
+ const EnvOptions& options) = 0;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& options) = 0;
+ // These values match Linux definition
+ // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+ enum WriteLifeTimeHint {
+ WLTH_NOT_SET = 0, // No hint information set
+ WLTH_NONE, // No hints about write life time
+ WLTH_SHORT, // Data written has a short life time
+ WLTH_MEDIUM, // Data written has a medium life time
+ WLTH_LONG, // Data written has a long life time
+ WLTH_EXTREME, // Data written has an extremely long life time
+ };
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) = 0;
+
+ // Reopen the file with the specified name for writing. On success, stores
+ // a pointer to the file in *result and returns OK. On failure stores
+ // nullptr in *result and returns non-OK.
+ // NOTE(review): unlike NewWritableFile(), this presumably keeps any existing
+ // content and appends to it -- confirm against the concrete Env.
+ // The default implementation returns NotSupported.
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status ReopenWritableFile(const std::string& /*fname*/,
+ std::unique_ptr<WritableFile>* /*result*/,
+ const EnvOptions& /*options*/) {
+ return Status::NotSupported();
+ }
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ virtual Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options);
+
+ // Open `fname` for random read and write, if file doesn't exist the file
+ // will be created. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewRandomRWFile(const std::string& /*fname*/,
+ std::unique_ptr<RandomRWFile>* /*result*/,
+ const EnvOptions& /*options*/) {
+ return Status::NotSupported("RandomRWFile is not implemented in this Env");
+ }
+
+ // Opens `fname` as a memory-mapped file for read and write (in-place updates
+ // only, i.e., no appends). On success, stores a raw buffer covering the whole
+ // file in `*result`. The file must exist prior to this call.
+ virtual Status NewMemoryMappedFileBuffer(
+ const std::string& /*fname*/,
+ std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+ return Status::NotSupported(
+ "MemoryMappedFileBuffer is not implemented in this Env");
+ }
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ virtual Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) = 0;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status FileExists(const std::string& fname) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *result are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) = 0;
+
+ // Store in *result the attributes of the children of the specified directory.
+ // In case the implementation lists the directory prior to iterating the files
+ // and files are concurrently deleted, the deleted files will be omitted from
+ // result.
+ // The name attributes are relative to "dir".
+ // Original contents of *result are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual Status GetChildrenFileAttributes(const std::string& dir,
+ std::vector<FileAttributes>* result);
+
+ // Delete the named file.
+ virtual Status DeleteFile(const std::string& fname) = 0;
+
+ // Truncate the named file to the specified size.
+ virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) {
+ return Status::NotSupported("Truncate is not supported for this Env");
+ }
+
+ // Create the specified directory. Returns error if directory exists.
+ virtual Status CreateDir(const std::string& dirname) = 0;
+
+ // Creates the directory if it is missing. Returns OK if it already exists
+ // or was successfully created.
+ virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+ // Delete the specified directory.
+ virtual Status DeleteDir(const std::string& dirname) = 0;
+
+ // Store the size of fname in *file_size.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+ // Store the last modification time of fname in *file_mtime.
+ virtual Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) = 0;
+ // Rename file src to target.
+ virtual Status RenameFile(const std::string& src,
+ const std::string& target) = 0;
+
+ // Hard Link file src to target.
+ virtual Status LinkFile(const std::string& /*src*/,
+ const std::string& /*target*/) {
+ return Status::NotSupported("LinkFile is not supported for this Env");
+ }
+
+ virtual Status NumFileLinks(const std::string& /*fname*/,
+ uint64_t* /*count*/) {
+ return Status::NotSupported(
+ "Getting number of file links is not supported for this Env");
+ }
+
+ virtual Status AreFilesSame(const std::string& /*first*/,
+ const std::string& /*second*/, bool* /*res*/) {
+ return Status::NotSupported("AreFilesSame is not supported for this Env");
+ }
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ virtual Status UnlockFile(FileLock* lock) = 0;
+
+ // Opens `lib_name` as a dynamic library.
+ // If the 'search_path' is specified, breaks the path into its components
+ // based on the appropriate platform separator (";" or ":") and looks for the
+ // library in those directories. If 'search_path' is not specified, uses the
+ // default library path search mechanism (such as LD_LIBRARY_PATH). On
+ // success, stores a dynamic library in `*result`.
+ virtual Status LoadLibrary(const std::string& /*lib_name*/,
+ const std::string& /*search_path */,
+ std::shared_ptr<DynamicLibrary>* /*result*/) {
+ return Status::NotSupported("LoadLibrary is not implemented in this Env");
+ }
+
+ // Priority for scheduling job in thread pool
+ enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL };
+
+ static std::string PriorityToString(Priority priority);
+
+ // Priority for requesting bytes in rate limiter scheduler
+ enum IOPriority { IO_LOW = 0, IO_HIGH = 1, IO_TOTAL = 2 };
+
+ // Arrange to run "(*function)(arg)" once in a background thread, in
+ // the thread pool specified by pri. By default, jobs go to the 'LOW'
+ // priority thread pool.
+ //
+ // "function" may run in an unspecified thread. Multiple functions
+ // added to the same Env may run concurrently in different threads.
+ // I.e., the caller may not assume that background work items are
+ // serialized.
+ // When the UnSchedule function is called, the unschedFunction
+ // registered at the time of Schedule is invoked with arg as a parameter.
+ virtual void Schedule(void (*function)(void* arg), void* arg,
+ Priority pri = LOW, void* tag = nullptr,
+ void (*unschedFunction)(void* arg) = nullptr) = 0;
+
+ // Arrange to remove jobs for given arg from the queue_ if they are not
+ // already scheduled. Caller is expected to have exclusive lock on arg.
+ virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; }
+
+ // Start a new thread, invoking "function(arg)" within the new thread.
+ // When "function(arg)" returns, the thread will be destroyed.
+ virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+ // Wait for all threads started by StartThread to terminate.
+ virtual void WaitForJoin() {}
+
+ // Get thread pool queue length for specific thread pool.
+ virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
+ return 0;
+ }
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ virtual Status GetTestDirectory(std::string* path) = 0;
+
+ // Creates and returns a default logger (an instance of EnvLogger) for storing
+ // informational messages. Derived classes can override this to provide a
+ // custom logger.
+ virtual Status NewLogger(const std::string& fname,
+ std::shared_ptr<Logger>* result);
+
+ // Returns the number of micro-seconds since some fixed point in time.
+ // It is often used as system time such as in GenericRateLimiter
+ // and other places so a port needs to return system time in order to work.
+ virtual uint64_t NowMicros() = 0;
+
+ // Returns the number of nano-seconds since some fixed point in time. Only
+ // useful for computing deltas of time in one run.
+ // Default implementation simply relies on NowMicros.
+ // In platform-specific implementations, NowNanos() should return time points
+ // that are MONOTONIC.
+ virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+ // 0 indicates not supported.
+ virtual uint64_t NowCPUNanos() { return 0; }
+
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
+ virtual void SleepForMicroseconds(int micros) = 0;
+
+ // Get the current host name.
+ virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+ // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+ // Only overwrites *unix_time on success.
+ virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+ // Get full directory name for this db.
+ virtual Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) = 0;
+
+ // The number of background worker threads of a specific thread pool
+ // for this environment. 'LOW' is the default pool.
+ // default number: 1
+ virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+ virtual int GetBackgroundThreads(Priority pri = LOW) = 0;
+
+ virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) {
+ return Status::NotSupported("Not supported.");
+ }
+
+ // Enlarge number of background worker threads of a specific thread pool
+ // for this environment if it is smaller than specified. 'LOW' is the default
+ // pool.
+ virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+ // Lower IO priority for threads from the specified pool.
+ virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {}
+
+ // Lower CPU priority for threads from the specified pool.
+ virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {}
+
+ // Converts seconds-since-Jan-01-1970 to a printable string
+ virtual std::string TimeToString(uint64_t time) = 0;
+
+ // Generates a unique id that can be used to identify a db
+ virtual std::string GenerateUniqueId();
+
+ // OptimizeForLogRead will create a new EnvOptions object that is a copy of
+ // the EnvOptions in the parameters, but is optimized for reading log files.
+ virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const;
+
+ // OptimizeForManifestRead will create a new EnvOptions object that is a copy
+ // of the EnvOptions in the parameters, but is optimized for reading manifest
+ // files.
+ virtual EnvOptions OptimizeForManifestRead(
+ const EnvOptions& env_options) const;
+
+ // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+ // the EnvOptions in the parameters, but is optimized for writing log files.
+ // Default implementation returns the copy of the same object.
+ virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const;
+ // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
+ // of the EnvOptions in the parameters, but is optimized for writing manifest
+ // files. Default implementation returns the copy of the same object.
+ virtual EnvOptions OptimizeForManifestWrite(
+ const EnvOptions& env_options) const;
+
+ // OptimizeForCompactionTableWrite will create a new EnvOptions object that is
+ // a copy of the EnvOptions in the parameters, but is optimized for writing
+ // table files.
+ virtual EnvOptions OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& immutable_ops) const;
+
+ // OptimizeForCompactionTableWrite will create a new EnvOptions object that
+ // is a copy of the EnvOptions in the parameters, but is optimized for reading
+ // table files.
+ virtual EnvOptions OptimizeForCompactionTableRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const;
+
+ // Returns the status of all threads that belong to the current Env.
+ virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
+ return Status::NotSupported("Not supported.");  // default: list untouched
+ }
+
+ // Returns the pointer to ThreadStatusUpdater. This function will be
+ // used in RocksDB internally to update thread status and supports
+ // GetThreadList(). The default just hands back thread_status_updater_.
+ virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+ return thread_status_updater_;
+ }
+
+ // Returns the ID of the current thread.
+ virtual uint64_t GetThreadID() const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+
+ // Get the amount of free disk space. The default reports NotSupported.
+ virtual Status GetFreeSpace(const std::string& /*path*/,
+ uint64_t* /*diskfree*/) {
+ return Status::NotSupported();
+ }
+
+ virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {}  // no-op
+
+ // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ protected:
+ // The pointer to an internal structure that will update the
+ // status of each thread.
+ ThreadStatusUpdater* thread_status_updater_;
+};
+
+// The factory function to construct a ThreadStatusUpdater. Any Env
+// that supports the GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+ SequentialFile() {}
+ virtual ~SequentialFile();
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+ // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Skip(uint64_t n) = 0;
+
+ // Tells the upper layers whether the current SequentialFile implementation
+ // uses direct IO. Defaults to false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. Defaults to kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported("InvalidateCache not supported.");  // default
+ }
+
+ // Positioned Read for direct I/O
+ // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+ virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+ Slice* /*result*/, char* /*scratch*/) {
+ return Status::NotSupported();  // default: not implemented
+ }
+
+ // If you're adding methods here, remember to add them to
+ // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in RandomAccessFile::MultiRead()
+struct ReadRequest {
+ // File offset in bytes
+ uint64_t offset;
+
+ // Length to read in bytes
+ size_t len;
+
+ // A buffer that MultiRead() can optionally place data in. It can
+ // ignore this and allocate its own buffer
+ char* scratch;
+
+ // Output parameter set by MultiRead() to point to the data buffer, and
+ // the number of valid bytes
+ Slice result;
+
+ // Status of this individual read, filled in by MultiRead()
+ Status status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+ RandomAccessFile() {}
+ virtual ~RandomAccessFile();
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // Safe for concurrent use by multiple threads.
+ // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+
+ // Readahead the file starting from offset by n bytes for caching.
+ virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) {
+ return Status::OK();  // default: no readahead is performed
+ }
+
+ // Read a bunch of blocks as described by reqs. The blocks can
+ // optionally be read in parallel. This is a synchronous call, i.e. it
+ // should return after all reads have completed. The reads will be
+ // non-overlapping. If the function return Status is not ok, status of
+ // individual requests will be ignored and return status will be assumed
+ // for all read requests. The function return status is only meant for
+ // any errors that occur before even processing specific read requests.
+ virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+ assert(reqs != nullptr);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ ReadRequest& req = reqs[i];
+ req.status = Read(req.offset, req.len, &req.result, req.scratch);
+ }
+ return Status::OK();  // default: per-request outcome lives in req.status
+ }
+
+ // Tries to get an unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+ // This function guarantees, for IDs from a given environment, two unique ids
+ // cannot be made equal to each other by adding arbitrary bytes to one of
+ // them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0; // Default implementation to prevent issues with backwards
+ // compatibility.
+ }
+
+ enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+ virtual void Hint(AccessPattern /*pattern*/) {}  // no-op by default
+
+ // Tells the upper layers whether the current RandomAccessFile implementation
+ // uses direct IO. Defaults to false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. Defaults to kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported("InvalidateCache not supported.");  // default
+ }
+
+ // If you're adding methods here, remember to add them to
+ // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+ WritableFile()
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(false) {}
+
+ explicit WritableFile(const EnvOptions& options)
+ : last_preallocated_block_(0),
+ preallocation_block_size_(0),
+ io_priority_(Env::IO_TOTAL),
+ write_hint_(Env::WLTH_NOT_SET),
+ strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+ // No copying allowed
+ WritableFile(const WritableFile&) = delete;
+ void operator=(const WritableFile&) = delete;
+
+ virtual ~WritableFile();
+
+ // Append data to the end of the file
+ // Note: A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ virtual Status Append(const Slice& data) = 0;
+
+ // PositionedAppend data to the specified offset. The new EOF after append
+ // must be larger than the previous EOF. This is to be used when writes are
+ // not backed by OS buffers and hence has to always start from the start of
+ // the sector. The implementation thus needs to also rewrite the last
+ // partial sector.
+ // Note: PositionedAppend does not guarantee moving the file offset after the
+ // write. A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ //
+ // PositionedAppend() can only happen on the page/sector boundaries. For that
+ // reason, if the last write was an incomplete sector we still need to rewind
+ // back to the nearest sector/page and rewrite the portion of it with whatever
+ // we need to add. We need to keep where we stop writing.
+ //
+ // PositionedAppend() can only write whole sectors. For that reason we have to
+ // pad with zeros for the last write and trim the file when closing according
+ // to the position we keep in the previous step.
+ //
+ // PositionedAppend() requires aligned buffer to be passed in. The alignment
+ // required is queried via GetRequiredBufferAlignment()
+ virtual Status PositionedAppend(const Slice& /* data */,
+ uint64_t /* offset */) {
+ return Status::NotSupported();  // default: not implemented
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+ // size due to whole pages writes. The behavior is undefined if called
+ // with other writes to follow. The default does nothing and returns OK.
+ virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); }
+ virtual Status Close() = 0;
+ virtual Status Flush() = 0;
+ virtual Status Sync() = 0; // sync data
+
+ /*
+ * Sync data and/or metadata as well.
+ * By default, sync only data.
+ * Override this method for environments where we need to sync
+ * metadata as well.
+ */
+ virtual Status Fsync() { return Sync(); }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush(). Defaults to false.
+ virtual bool IsSyncThreadSafe() const { return false; }
+
+ // Tells the upper layers whether the current WritableFile implementation
+ // uses direct IO. Defaults to false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. Defaults to kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+ /*
+ * Change the priority in rate limiter if rate limiting is enabled.
+ * If rate limiting is not enabled, this call has no effect.
+ */
+ virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+ virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+ virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+ write_hint_ = hint;
+ }
+
+ virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+ /*
+ * Get the size of valid data in the file. The default returns 0.
+ */
+ virtual uint64_t GetFileSize() { return 0; }
+
+ /*
+ * Get and set the default pre-allocation block size for writes to
+ * this file. If non-zero, then Allocate will be used to extend the
+ * underlying storage of a file (generally via fallocate) if the Env
+ * instance supports it.
+ */
+ virtual void SetPreallocationBlockSize(size_t size) {
+ preallocation_block_size_ = size;
+ }
+
+ virtual void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) {
+ *last_allocated_block = last_preallocated_block_;
+ *block_size = preallocation_block_size_;
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0; // Default implementation, kept for backwards compatibility.
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+ return Status::NotSupported("InvalidateCache not supported.");
+ }
+
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion. The default implementation does a full
+ // Sync() when strict_bytes_per_sync_ is set and otherwise does nothing.
+ virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
+ if (strict_bytes_per_sync_) {
+ return Sync();
+ }
+ return Status::OK();
+ }
+
+ // PrepareWrite performs any necessary preparation for a write
+ // before the write actually occurs. This allows for pre-allocation
+ // of space on devices where it can result in less file
+ // fragmentation and/or less waste from over-zealous filesystem
+ // pre-allocation.
+ virtual void PrepareWrite(size_t offset, size_t len) {
+ if (preallocation_block_size_ == 0) {
+ return;
+ }
+ // If this write would cross one or more preallocation blocks,
+ // determine what the last preallocation block necessary to
+ // cover this write would be and Allocate to that point.
+ const auto block_size = preallocation_block_size_;
+ size_t new_last_preallocated_block =
+ (offset + len + block_size - 1) / block_size;
+ if (new_last_preallocated_block > last_preallocated_block_) {
+ size_t num_spanned_blocks =
+ new_last_preallocated_block - last_preallocated_block_;
+ Allocate(block_size * last_preallocated_block_,
+ block_size * num_spanned_blocks);  // returned Status is not checked
+ last_preallocated_block_ = new_last_preallocated_block;
+ }
+ }
+
+ // Pre-allocates space for a file. The default does nothing and returns OK.
+ virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) {
+ return Status::OK();
+ }
+
+ // If you're adding methods here, remember to add them to
+ // WritableFileWrapper too.
+
+ protected:
+ size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+ size_t last_preallocated_block_;
+ size_t preallocation_block_size_;
+
+ protected:
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+ RandomRWFile() {}
+ // No copying allowed
+ RandomRWFile(const RandomRWFile&) = delete;
+ RandomRWFile& operator=(const RandomRWFile&) = delete;
+
+ virtual ~RandomRWFile() {}
+
+ // Indicates if the class makes use of direct I/O
+ // If true you must pass aligned buffer to Write()
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. Defaults to kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+ // Pass aligned buffer when use_direct_io() returns true.
+ virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+ // Read up to `n` bytes starting from offset `offset` and store them in
+ // result, provided `scratch` size should be at least `n`.
+ // Returns Status::OK() on success.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+
+ virtual Status Flush() = 0;
+
+ virtual Status Sync() = 0;
+
+ virtual Status Fsync() { return Sync(); }  // default: same as Sync()
+
+ virtual Status Close() = 0;
+
+ // If you're adding methods here, remember to add them to
+ // RandomRWFileWrapper too.
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class MemoryMappedFileBuffer {
+ public:
+ MemoryMappedFileBuffer(void* _base, size_t _length)
+ : base_(_base), length_(_length) {}
+
+ virtual ~MemoryMappedFileBuffer() = 0;  // pure virtual: class is abstract
+
+ // Copying is disabled: we do not want to unmap this twice. The class
+ // could be made movable if desired.
+ MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete;
+ MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete;
+
+ void* GetBase() const { return base_; }
+ size_t GetLen() const { return length_; }
+
+ protected:
+ void* base_;
+ const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+ virtual ~Directory() {}
+ // Fsync directory. Can be called concurrently from multiple threads.
+ virtual Status Fsync() = 0;
+
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0;  // default: writes nothing to id and reports length 0
+ }
+
+ // If you're adding methods here, remember to add them to
+ // DirectoryWrapper too.
+};
+
+enum InfoLogLevel : unsigned char {  // underlying type is a single byte
+ DEBUG_LEVEL = 0,
+ INFO_LEVEL,
+ WARN_LEVEL,
+ ERROR_LEVEL,
+ FATAL_LEVEL,
+ HEADER_LEVEL,
+ NUM_INFO_LOG_LEVELS,  // evaluates to 6, the number of enumerators above
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+ size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)();  // NOTE(review): per-instance member, not static
+
+ explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+ : closed_(false), log_level_(log_level) {}
+ // No copying allowed
+ Logger(const Logger&) = delete;
+ void operator=(const Logger&) = delete;
+
+ virtual ~Logger();
+
+ // Close the log file. Must be called before destructor. If the return
+ // status is NotSupported(), it means the implementation does cleanup in
+ // the destructor
+ virtual Status Close();
+
+ // Write a header to the log file with the specified format
+ // It is recommended that you log all header information at the start of the
+ // application. But it is not enforced.
+ virtual void LogHeader(const char* format, va_list ap) {
+ // Default implementation does a simple INFO level log write via Logv().
+ // Please override as per the logger class requirement.
+ Logv(format, ap);
+ }
+
+ // Write an entry to the log file with the specified format.
+ virtual void Logv(const char* format, va_list ap) = 0;
+
+ // Write an entry to the log file with the specified log level
+ // and format. Any log with level under the internal log level
+ // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+ // printed.
+ virtual void Logv(const InfoLogLevel log_level, const char* format,
+ va_list ap);
+
+ virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
+ // Flush to the OS buffers (no-op by default)
+ virtual void Flush() {}
+ virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+ virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+ log_level_ = log_level;
+ }
+
+ // If you're adding methods here, remember to add them to LoggerWrapper too.
+
+ protected:
+ virtual Status CloseImpl();
+ bool closed_;
+
+ private:
+ InfoLogLevel log_level_;
+};
+
+// Identifies a locked file. Instances cannot be copied or assigned.
+class FileLock {
+ public:
+ FileLock() {}
+ virtual ~FileLock();
+
+ private:
+ // No copying allowed
+ FileLock(const FileLock&) = delete;
+ void operator=(const FileLock&) = delete;
+};
+
+class DynamicLibrary {
+ public:
+ virtual ~DynamicLibrary() {}
+
+ // Returns the name of the dynamic library.
+ virtual const char* Name() const = 0;
+
+ // Loads the symbol for sym_name from the library and updates the input
+ // function. Returns the Status of the lookup (not the symbol itself).
+ template <typename T>
+ Status LoadFunction(const std::string& sym_name, std::function<T>* function) {
+ assert(nullptr != function);
+ void* ptr = nullptr;
+ Status s = LoadSymbol(sym_name, &ptr);
+ *function = reinterpret_cast<T*>(ptr);  // assigned even when s is not OK
+ return s;
+ }
+ // Loads the symbol for sym_name from the library and stores it in *func.
+ virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
+};
+
+extern void LogFlush(const std::shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+ const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// A set of log functions, one per log level (shared_ptr<Logger> overloads).
+extern void Header(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(const std::shared_ptr<Logger>& info_log, const char* format,
+ ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+ const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A set of log functions, one per log level (raw Logger* overloads).
+extern void Header(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(Logger* info_log, const char* format, ...)
+ ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A utility routine: write "data" to the named file (should_sync is off by default).
+extern Status WriteStringToFile(Env* env, const Slice& data,
+ const std::string& fname,
+ bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+ std::string* data);
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::SequentialFileWrapper {
+// public:
+// MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target):
+// ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {}
+// Status Read(size_t n, Slice* result, char* scratch) override {
+// cout << "Doing a read of size " << n << "!" << endl;
+// return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result,
+// scratch);
+// }
+// // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+// forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+// rocksdb class. Unless you actually want to override the behavior.
+// (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+ // Initialize an EnvWrapper that delegates all calls to *t (pointer is stored)
+ explicit EnvWrapper(Env* t) : target_(t) {}
+ ~EnvWrapper() override;
+
+ // Return the target to which this Env forwards all calls
+ Env* target() const { return target_; }
+
+ // The following text is boilerplate that forwards all methods to target()
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) override {
+ return target_->NewSequentialFile(f, r, options);
+ }
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) override {
+ return target_->NewRandomAccessFile(f, r, options);
+ }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override {
+ return target_->NewWritableFile(f, r, options);
+ }
+ Status ReopenWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ return target_->ReopenWritableFile(fname, result, options);
+ }
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override {
+ return target_->ReuseWritableFile(fname, old_fname, r, options);
+ }
+ Status NewRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& options) override {
+ return target_->NewRandomRWFile(fname, result, options);
+ }
+ Status NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+ return target_->NewMemoryMappedFileBuffer(fname, result);
+ }
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ return target_->NewDirectory(name, result);
+ }
+ Status FileExists(const std::string& f) override {
+ return target_->FileExists(f);
+ }
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* r) override {
+ return target_->GetChildren(dir, r);
+ }
+ Status GetChildrenFileAttributes(
+ const std::string& dir, std::vector<FileAttributes>* result) override {
+ return target_->GetChildrenFileAttributes(dir, result);
+ }
+ Status DeleteFile(const std::string& f) override {
+ return target_->DeleteFile(f);
+ }
+ Status Truncate(const std::string& fname, size_t size) override {
+ return target_->Truncate(fname, size);
+ }
+ Status CreateDir(const std::string& d) override {
+ return target_->CreateDir(d);
+ }
+ Status CreateDirIfMissing(const std::string& d) override {
+ return target_->CreateDirIfMissing(d);
+ }
+ Status DeleteDir(const std::string& d) override {
+ return target_->DeleteDir(d);
+ }
+ Status GetFileSize(const std::string& f, uint64_t* s) override {
+ return target_->GetFileSize(f, s);
+ }
+
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override {
+ return target_->GetFileModificationTime(fname, file_mtime);
+ }
+
+ Status RenameFile(const std::string& s, const std::string& t) override {
+ return target_->RenameFile(s, t);
+ }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ return target_->LinkFile(s, t);
+ }
+
+ Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+ return target_->NumFileLinks(fname, count);
+ }
+
+ Status AreFilesSame(const std::string& first, const std::string& second,
+ bool* res) override {
+ return target_->AreFilesSame(first, second, res);
+ }
+
+ Status LockFile(const std::string& f, FileLock** l) override {
+ return target_->LockFile(f, l);
+ }
+
+ Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); }
+
+ Status LoadLibrary(const std::string& lib_name,
+ const std::string& search_path,
+ std::shared_ptr<DynamicLibrary>* result) override {
+ return target_->LoadLibrary(lib_name, search_path, result);
+ }
+
+ void Schedule(void (*f)(void* arg), void* a, Priority pri,
+ void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+ return target_->Schedule(f, a, pri, tag, u);
+ }
+
+ int UnSchedule(void* tag, Priority pri) override {
+ return target_->UnSchedule(tag, pri);
+ }
+
+ void StartThread(void (*f)(void*), void* a) override {
+ return target_->StartThread(f, a);
+ }
+ void WaitForJoin() override { return target_->WaitForJoin(); }
+ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+ return target_->GetThreadPoolQueueLen(pri);
+ }
+ Status GetTestDirectory(std::string* path) override {
+ return target_->GetTestDirectory(path);
+ }
+ Status NewLogger(const std::string& fname,
+ std::shared_ptr<Logger>* result) override {
+ return target_->NewLogger(fname, result);
+ }
+ uint64_t NowMicros() override { return target_->NowMicros(); }
+ uint64_t NowNanos() override { return target_->NowNanos(); }
+ uint64_t NowCPUNanos() override { return target_->NowCPUNanos(); }
+
+ void SleepForMicroseconds(int micros) override {
+ target_->SleepForMicroseconds(micros);
+ }
+ Status GetHostName(char* name, uint64_t len) override {
+ return target_->GetHostName(name, len);
+ }
+ Status GetCurrentTime(int64_t* unix_time) override {
+ return target_->GetCurrentTime(unix_time);
+ }
+ Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) override {
+ return target_->GetAbsolutePath(db_path, output_path);
+ }
+ void SetBackgroundThreads(int num, Priority pri) override {
+ return target_->SetBackgroundThreads(num, pri);
+ }
+ int GetBackgroundThreads(Priority pri) override {
+ return target_->GetBackgroundThreads(pri);
+ }
+
+ Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+ return target_->SetAllowNonOwnerAccess(allow_non_owner_access);
+ }
+
+ void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+ return target_->IncBackgroundThreadsIfNeeded(num, pri);
+ }
+
+ void LowerThreadPoolIOPriority(Priority pool = LOW) override {
+ target_->LowerThreadPoolIOPriority(pool);
+ }
+
+ void LowerThreadPoolCPUPriority(Priority pool = LOW) override {
+ target_->LowerThreadPoolCPUPriority(pool);
+ }
+
+ std::string TimeToString(uint64_t time) override {
+ return target_->TimeToString(time);
+ }
+
+ Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+ return target_->GetThreadList(thread_list);
+ }
+
+ ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+ return target_->GetThreadStatusUpdater();
+ }
+
+ uint64_t GetThreadID() const override { return target_->GetThreadID(); }
+
+ std::string GenerateUniqueId() override {
+ return target_->GenerateUniqueId();
+ }
+
+ EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+ return target_->OptimizeForLogRead(env_options);
+ }
+ EnvOptions OptimizeForManifestRead(
+ const EnvOptions& env_options) const override {
+ return target_->OptimizeForManifestRead(env_options);
+ }
+ EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const override {
+ return target_->OptimizeForLogWrite(env_options, db_options);
+ }
+ EnvOptions OptimizeForManifestWrite(
+ const EnvOptions& env_options) const override {
+ return target_->OptimizeForManifestWrite(env_options);
+ }
+ EnvOptions OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& immutable_ops) const override {
+ return target_->OptimizeForCompactionTableWrite(env_options, immutable_ops);
+ }
+ EnvOptions OptimizeForCompactionTableRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const override {
+ return target_->OptimizeForCompactionTableRead(env_options, db_options);
+ }
+ Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+ return target_->GetFreeSpace(path, diskfree);
+ }
+ void SanitizeEnvOptions(EnvOptions* env_opts) const override {
+ target_->SanitizeEnvOptions(env_opts);
+ }
+
+ private:
+ Env* target_;  // the wrapped Env; every override above forwards to it
+};
+
+// Forwards every SequentialFile call to the `target` given at construction.
+class SequentialFileWrapper : public SequentialFile {
+ public:
+ explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {}
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ return target_->Read(n, result, scratch);
+ }
+ Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+ char* scratch) override {
+ return target_->PositionedRead(offset, n, result, scratch);
+ }
+ Status Skip(uint64_t n) override { return target_->Skip(n); }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ private:
+ SequentialFile* target_;  // wrapped file; this class never deletes it
+};
+
+// Forwards every RandomAccessFile call to the `target` given at construction.
+class RandomAccessFileWrapper : public RandomAccessFile {
+ public:
+ explicit RandomAccessFileWrapper(RandomAccessFile* target)
+ : target_(target) {}
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+ void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+ Status Prefetch(uint64_t offset, size_t n) override {
+ return target_->Prefetch(offset, n);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+ Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+ return target_->MultiRead(reqs, num_reqs);
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ private:
+ RandomAccessFile* target_;  // wrapped file; this class never deletes it
+};
+
+// Forwards every WritableFile call to the `t` given at construction.
+class WritableFileWrapper : public WritableFile {
+ public:
+ explicit WritableFileWrapper(WritableFile* t) : target_(t) {}
+
+ // Writing and space management.
+ Status Append(const Slice& data) override { return target_->Append(data); }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ return target_->PositionedAppend(data, offset);
+ }
+ Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return target_->Allocate(offset, len);
+ }
+ void PrepareWrite(size_t offset, size_t len) override {
+ target_->PrepareWrite(offset, len);
+ }
+
+ // Flushing, syncing and closing.
+ Status Flush() override { return target_->Flush(); }
+ Status Sync() override { return target_->Sync(); }
+ Status Fsync() override { return target_->Fsync(); }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ return target_->RangeSync(offset, nbytes);
+ }
+ bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+ Status Close() override { return target_->Close(); }
+
+ // Direct I/O, alignment and cache.
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ // I/O priority and write-lifetime hints.
+ void SetIOPriority(Env::IOPriority pri) override {
+ target_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
+
+ void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+ target_->SetWriteLifeTimeHint(hint);
+ }
+ Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+ return target_->GetWriteLifeTimeHint();
+ }
+
+ // File size, preallocation and identity.
+ uint64_t GetFileSize() override { return target_->GetFileSize(); }
+
+ void SetPreallocationBlockSize(size_t size) override {
+ target_->SetPreallocationBlockSize(size);
+ }
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ target_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ private:
+ // Wrapped file; this class never deletes it.
+ WritableFile* target_;
+};
+
+// Forwards every RandomRWFile call to the `target` given at construction.
+class RandomRWFileWrapper : public RandomRWFile {
+ public:
+ explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {}
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+ Status Write(uint64_t offset, const Slice& data) override {
+ return target_->Write(offset, data);
+ }
+ Status Flush() override { return target_->Flush(); }
+ Status Sync() override { return target_->Sync(); }
+ Status Fsync() override { return target_->Fsync(); }
+ Status Close() override { return target_->Close(); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ private:
+ RandomRWFile* target_;  // wrapped file; this class never deletes it
+};
+
+// Forwards every Directory call to the `target` given at construction.
+class DirectoryWrapper : public Directory {
+ public:
+ explicit DirectoryWrapper(Directory* target) : target_(target) {}
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+ Status Fsync() override { return target_->Fsync(); }
+ private:
+ Directory* target_;  // wrapped directory; this class never deletes it
+};
+
+// Forwards every Logger call to the `target` given at construction.
+class LoggerWrapper : public Logger {
+ public:
+ explicit LoggerWrapper(Logger* target) : target_(target) {}
+
+ void LogHeader(const char* format, va_list ap) override {
+ target_->LogHeader(format, ap);
+ }
+ void Logv(const char* format, va_list ap) override {
+ target_->Logv(format, ap);
+ }
+ void Logv(const InfoLogLevel log_level, const char* format,
+ va_list ap) override {
+ target_->Logv(log_level, format, ap);
+ }
+ void Flush() override { target_->Flush(); }
+ Status Close() override { return target_->Close(); }
+ size_t GetLogFileSize() const override { return target_->GetLogFileSize(); }
+ InfoLogLevel GetInfoLogLevel() const override {
+ return target_->GetInfoLogLevel();
+ }
+ void SetInfoLogLevel(const InfoLogLevel log_level) override {
+ target_->SetInfoLogLevel(log_level);
+ }
+ private:
+ Logger* target_;  // wrapped logger; this class never deletes it
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+// Returns a new environment that is used for HDFS environment.
+// This is a factory method for HdfsEnv declared in hdfs/env_hdfs.h
+Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname);
+
+// Returns a new environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+// This is a factory method for TimedEnv defined in utilities/env_timed.cc.
+Env* NewTimedEnv(Env* base_env);
+
+// Returns an instance of logger that can be used for storing informational
+// messages.
+// This is a factory method for EnvLogger declared in logging/env_logging.h
+Status NewEnvLogger(const std::string& fname, Env* env,
+ std::shared_ptr<Logger>* result);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h
new file mode 100644
index 000000000..a4db10fd0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env_encryption.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE)
+
+#include <string>
+
+#include "env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EncryptionProvider;
+
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider);
+
+// Base class for cipher streams that can be addressed one block at a time,
+// i.e. a block can be processed without needing data from other blocks.
+// E.g. CTR (Counter operation mode) supports this requirement.
+class BlockAccessCipherStream {
+ public:
+ virtual ~BlockAccessCipherStream() {}
+
+ // Returns the size of each block supported by this cipher stream.
+ virtual size_t BlockSize() = 0;
+
+ // Encrypts the `dataSize` bytes at `data`, which may span one or more
+ // (partial) blocks and are located at `fileOffset` in the file.
+ virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ // Decrypts the `dataSize` bytes at `data`, which may span one or more
+ // (partial) blocks and are located at `fileOffset` in the file.
+ virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ protected:
+ // Sets up the scratch space that is passed to EncryptBlock/DecryptBlock.
+ virtual void AllocateScratch(std::string&) = 0;
+
+ // Encrypts the single block with index `blockIndex`.
+ // `data` holds exactly BlockSize() bytes.
+ virtual Status EncryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) = 0;
+
+ // Decrypts the single block with index `blockIndex`.
+ // `data` holds exactly BlockSize() bytes.
+ virtual Status DecryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) = 0;
+};
+
+// BlockCipher is the interface of a cipher that works on fixed-size blocks.
+class BlockCipher {
+ public:
+ virtual ~BlockCipher() {}
+
+ // Returns the size of each block supported by this cipher.
+ virtual size_t BlockSize() = 0;
+
+ // Encrypts one block of data.
+ // `data` holds exactly BlockSize() bytes.
+ virtual Status Encrypt(char* data) = 0;
+
+ // Decrypts one block of data.
+ // `data` holds exactly BlockSize() bytes.
+ virtual Status Decrypt(char* data) = 0;
+};
+
+// Sample BlockCipher built on ROT13.
+//
+// Note: this exists only as an example implementation of BlockCipher;
+// it is NOT considered safe and should NOT be used in production.
+class ROT13BlockCipher : public BlockCipher {
+ public:
+ ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {}
+ virtual ~ROT13BlockCipher() {}
+
+ // Returns the block size that was passed to the constructor.
+ size_t BlockSize() override { return blockSize_; }
+
+ // Encrypts one block of data.
+ // `data` holds exactly BlockSize() bytes.
+ Status Encrypt(char* data) override;
+
+ // Decrypts one block of data.
+ // `data` holds exactly BlockSize() bytes.
+ Status Decrypt(char* data) override;
+
+ private:
+ size_t blockSize_;
+};
+
+// CTRCipherStream is a BlockAccessCipherStream that uses a Counter
+// operations mode.
+// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation
+//
+// Note: This is a possible implementation of BlockAccessCipherStream,
+// it is considered suitable for use.
+class CTRCipherStream final : public BlockAccessCipherStream {
+ public:
+ // Keeps a reference to `c` and copies c.BlockSize() bytes starting at `iv`.
+ CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter)
+ : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter) {}
+ virtual ~CTRCipherStream() {}
+
+ // The block size is whatever the referenced BlockCipher reports.
+ size_t BlockSize() override { return cipher_.BlockSize(); }
+
+ protected:
+ // Sets up the scratch space that is passed to EncryptBlock/DecryptBlock.
+ void AllocateScratch(std::string&) override;
+
+ // Encrypts the single block with index `blockIndex`.
+ // `data` holds exactly BlockSize() bytes.
+ Status EncryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) override;
+
+ // Decrypts the single block with index `blockIndex`.
+ // `data` holds exactly BlockSize() bytes.
+ Status DecryptBlock(uint64_t blockIndex, char* data,
+ char* scratch) override;
+ private:
+ BlockCipher& cipher_;
+ std::string iv_;
+ uint64_t initialCounter_;
+};
+
+// An EncryptionProvider creates the cipher stream for one specific file.
+// The returned cipher stream is what performs the actual
+// encryption/decryption.
+class EncryptionProvider {
+ public:
+ virtual ~EncryptionProvider() {}
+
+ // Returns the length of the prefix that is added to every file and used
+ // for storing encryption options. For optimal performance, the prefix
+ // length should be a multiple of the page size.
+ virtual size_t GetPrefixLength() = 0;
+
+ // Initializes an already allocated block of prefix memory
+ // for a new file.
+ virtual Status CreateNewPrefix(const std::string& fname, char* prefix,
+ size_t prefixLength) = 0;
+
+ // Creates a block access cipher stream for a file with the
+ // given name and options.
+ virtual Status CreateCipherStream(
+ const std::string& fname, const EnvOptions& options, Slice& prefix,
+ std::unique_ptr<BlockAccessCipherStream>* result) = 0;
+};
+
+// An EncryptionProvider that uses a CTR cipher stream with a given block
+// cipher and IV.
+//
+// Note: This is a possible implementation of EncryptionProvider,
+// it is considered suitable for use, provided a safe BlockCipher is used.
+class CTREncryptionProvider : public EncryptionProvider {
+ public:
+ CTREncryptionProvider(BlockCipher& c) : cipher_(c) {}
+ virtual ~CTREncryptionProvider() {}
+
+ // Returns the length of the prefix that is added to every file and used
+ // for storing encryption options. For optimal performance, the prefix
+ // length should be a multiple of the page size.
+ size_t GetPrefixLength() override;
+
+ // Initializes an already allocated block of prefix memory
+ // for a new file.
+ Status CreateNewPrefix(const std::string& fname, char* prefix,
+ size_t prefixLength) override;
+
+ // Creates a block access cipher stream for a file with the
+ // given name and options.
+ Status CreateCipherStream(
+ const std::string& fname, const EnvOptions& options, Slice& prefix,
+ std::unique_ptr<BlockAccessCipherStream>* result) override;
+
+ protected:
+ static const size_t defaultPrefixLength = 4096;
+
+ // PopulateSecretPrefixPart initializes the data into a new prefix block
+ // that will be encrypted. This function will store the data in plain text.
+ // It will be encrypted later (before written to disk).
+ // Returns the amount of space (starting from the start of the prefix)
+ // that has been initialized.
+ virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength,
+ size_t blockSize);
+
+ // Creates a block access cipher stream for a file with the given name
+ // and options. The given prefix is already decrypted.
+ virtual Status CreateCipherStreamFromPrefix(
+ const std::string& fname, const EnvOptions& options,
+ uint64_t initialCounter, const Slice& iv, const Slice& prefix,
+ std::unique_ptr<BlockAccessCipherStream>* result);
+
+ private:
+ // Block cipher supplied at construction; held by reference.
+ BlockCipher& cipher_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h
new file mode 100644
index 000000000..f26d6371c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/experimental.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above conditions is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+ int target_level = 1);
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h
new file mode 100644
index 000000000..35f54f40b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_checksum.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FileChecksumFunc is the function class that generates the checksum value
+// for each file when the file is written to the file system.
+class FileChecksumFunc {
+ public:
+ virtual ~FileChecksumFunc() = default;
+ // Returns the checksum of concat(A, data[0,n-1]), where init_checksum is
+ // the checksum previously returned for some string A. This lets a caller
+ // maintain the checksum of a stream of data.
+ virtual std::string Extend(const std::string& init_checksum, const char* data,
+ size_t n) = 0;
+
+ // Returns the checksum value of data[0,n-1].
+ virtual std::string Value(const char* data, size_t n) = 0;
+
+ // Returns a processed form of `checksum` that is suitable for storing.
+ virtual std::string ProcessChecksum(const std::string& checksum) = 0;
+
+ // Returns a name that identifies this file checksum function.
+ virtual const char* Name() const = 0;
+};
+
+// FileChecksumList stores the checksum information of a list of files (e.g.,
+// SST files). A FileChecksumList can be used to store the checksum
+// information of all SST files obtained from the MANIFEST, i.e. the
+// checksum information of all valid SST files of a DB instance. It can
+// also be used to store the checksum information of a list of SST files to
+// be ingested.
+class FileChecksumList {
+ public:
+ virtual ~FileChecksumList() = default;
+
+ // Clears the previously stored file checksum information.
+ virtual void reset() = 0;
+
+ // Returns the number of checksums in the checksum list.
+ virtual size_t size() const = 0;
+
+ // Returns all the stored file checksum information. For each file, its
+ // file number, checksum value and checksum function name are placed in
+ // the three output vectors.
+ virtual Status GetAllFileChecksums(
+ std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+ std::vector<std::string>* checksum_func_names) = 0;
+
+ // Given the file_number, searches for the checksum information stored
+ // for that file.
+ virtual Status SearchOneFileChecksum(uint64_t file_number,
+ std::string* checksum,
+ std::string* checksum_func_name) = 0;
+
+ // Inserts the checksum information of one file into the FileChecksumList.
+ virtual Status InsertOneFileChecksum(
+ uint64_t file_number, const std::string& checksum,
+ const std::string& checksum_func_name) = 0;
+
+ // Removes the checksum information of one SST file.
+ virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0;
+};
+
+// Create a new file checksum list.
+extern FileChecksumList* NewFileChecksumList();
+
+// Create a Crc32c based file checksum function
+extern FileChecksumFunc* CreateFileChecksumFuncCrc32c();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h
new file mode 100644
index 000000000..c1fd919f3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_system.h
@@ -0,0 +1,1358 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A FileSystem is an interface used by the rocksdb implementation to access
+// storage functionality like the filesystem etc. Callers
+// may wish to provide a custom FileSystem object when opening a database to
+// get fine grained control; e.g., to rate limit file system operations.
+//
+// All FileSystem implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+//
+// WARNING: Since this is a new interface, it is expected that there will be
+// some changes as storage systems are ported over.
+
+#pragma once
+
+#include <stdint.h>
+#include <chrono>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileLock;
+class FSDirectory;
+class FSRandomAccessFile;
+class FSRandomRWFile;
+class FSSequentialFile;
+class FSWritableFile;
+class Logger;
+class Slice;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+
+using AccessPattern = RandomAccessFile::AccessPattern;
+using FileAttributes = Env::FileAttributes;
+
+// Priority of an IO request. This is a hint and does not guarantee any
+// particular QoS.
+// kIOLow - Typically background reads/writes such as compaction/flush
+// kIOHigh - Typically user reads/synchronous WAL writes
+enum class IOPriority : uint8_t {
+ kIOLow,
+ kIOHigh,
+ kIOTotal,  // presumably the number of priority levels -- TODO confirm
+};
+
+// Type of the data being read/written. It can be passed down as a flag
+// for the FileSystem implementation to optionally handle different types in
+// different ways
+enum class IOType : uint8_t {
+ kData,
+ kFilter,
+ kIndex,
+ kMetadata,
+ kWAL,
+ kManifest,
+ kLog,
+ kUnknown,
+ kInvalid,
+};
+
+// Per-request options that can be passed down to the FileSystem
+// implementation. These are hints and are not necessarily guaranteed to be
+// honored. More hints can be added here in the future to indicate things like
+// storage media (HDD/SSD) to be used, replication level etc.
+struct IOOptions {
+ // Timeout for the operation in milliseconds; zero unless the caller sets it
+ std::chrono::milliseconds timeout{0};
+
+ // Priority - high or low; low unless the caller sets it
+ IOPriority prio = IOPriority::kIOLow;
+
+ // Type of data being read/written; kUnknown unless the caller sets it
+ IOType type = IOType::kUnknown;
+};
+
+// Options that apply to a whole file: they control how it is opened/created
+// and how it is accessed while it's open. We may add more options here in the
+// future such as redundancy level, media to use etc.
+struct FileOptions : EnvOptions {
+ // Embedded IOOptions to control the parameters for any IOs that need
+ // to be issued for the file open/creation
+ IOOptions io_options;
+
+ // Each constructor only initializes the EnvOptions base; io_options is
+ // default-initialized.
+ FileOptions() : EnvOptions() {}
+
+ FileOptions(const DBOptions& opts) : EnvOptions(opts) {}
+
+ FileOptions(const EnvOptions& opts) : EnvOptions(opts) {}
+};
+
+// A structure to pass back some debugging information from the FileSystem
+// implementation to RocksDB in case of an IO error
+struct IODebugContext {
+ // file_path to be filled in by RocksDB in case of an error
+ std::string file_path;
+
+ // A map of counter names to values - set by the FileSystem implementation
+ std::map<std::string, uint64_t> counters;
+
+ // To be set by the FileSystem implementation
+ std::string msg;
+
+ IODebugContext() {}
+ // Records `name` -> `value`; an already-present name keeps its old value.
+ void AddCounter(const std::string& name, uint64_t value) {
+ counters.emplace(name, value);
+ }
+ // Renders "<file_path>, <name> = <value>,...<msg>" for diagnostics.
+ std::string ToString() const {
+ std::ostringstream ss;
+ ss << file_path << ", ";
+ for (const auto& counter : counters) {
+ ss << counter.first << " = " << counter.second << ",";
+ }
+ ss << msg;
+ return ss.str();
+ }
+};
+
+// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
+// FSRandomRWFile, and FSDirectory classes define the interface between
+// RocksDB and storage systems, such as Posix filesystems,
+// remote filesystems etc.
+// The interface allows for fine grained control of individual IO operations,
+// such as setting a timeout, prioritization, hints on data placement,
+// different handling based on type of IO etc.
+// This is accomplished by passing an instance of IOOptions to every
+// API call that can potentially perform IO. Additionally, each such API is
+// passed a pointer to an IODebugContext structure that can be used by the
+// storage system to include troubleshooting information. The return value
+// of these APIs is of type IOStatus, which can indicate an error
+// code/sub-code, as well as metadata about the error such as its scope and
+// whether it's retryable.
+class FileSystem {
+ public:
+ FileSystem();
+
+ // No copying allowed
+ FileSystem(const FileSystem&) = delete;
+
+ virtual ~FileSystem();
+
+ virtual const char* Name() const = 0;
+
+ static const char* Type() { return "FileSystem"; }
+
+ // Loads the FileSystem specified by the input value into the result
+ static Status Load(const std::string& value,
+ std::shared_ptr<FileSystem>* result);
+
+ // Return a default file system suitable for the current operating
+ // system. Sophisticated users may wish to provide their own FileSystem
+ // implementation instead of relying on this default file system.
+ //
+ // The result of Default() belongs to rocksdb and must never be deleted.
+ static std::shared_ptr<FileSystem> Default();
+
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewSequentialFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) = 0;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual IOStatus NewRandomAccessFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) = 0;
+ // These values match Linux definition
+ // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+ enum WriteLifeTimeHint {
+ kWLTHNotSet = 0, // No hint information set
+ kWLTHNone, // No hints about write life time
+ kWLTHShort, // Data written has a short life time
+ kWLTHMedium, // Data written has a medium life time
+ kWLTHLong, // Data written has a long life time
+ kWLTHExtreme, // Data written has an extremely long life time
+ };
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) = 0;
+
+ // Create an object that writes to a file with the specified name.
+ // `FSWritableFile::Append()`s will append after any existing content. If
+ // the file does not already exist, creates it. On success, stores a
+ // pointer to the file in *result and returns OK. On failure stores
+ // nullptr in *result and returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus ReopenWritableFile(
+ const std::string& /*fname*/, const FileOptions& /*options*/,
+ std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported();
+ }
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ virtual IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) = 0;
+
+ // Open `fname` for random read and write, if file doesn't exist the file
+ // will be created. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
+ const FileOptions& /*options*/,
+ std::unique_ptr<FSRandomRWFile>* /*result*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "RandomRWFile is not implemented in this FileSystem");
+ }
+
+ // Opens `fname` as a memory-mapped file for read and write (in-place updates
+ // only, i.e., no appends). On success, stores a raw buffer covering the whole
+ // file in `*result`. The file must exist prior to this call.
+ virtual IOStatus NewMemoryMappedFileBuffer(
+ const std::string& /*fname*/,
+ std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+ return IOStatus::NotSupported(
+ "MemoryMappedFileBuffer is not implemented in this FileSystem");
+ }
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ virtual IOStatus NewDirectory(const std::string& name,
+ const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) = 0;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ virtual IOStatus FileExists(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *results are dropped.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) = 0;
+
+ // Fills *result with one FileAttributes entry (name and size) per child of
+ // the directory "dir"; whatever *result held before is dropped.
+ // Names are relative to "dir".
+ // When an implementation lists the directory first and then visits each
+ // file, a file deleted in between is simply left out of *result.
+ // Returns OK if "dir" exists and "*result" contains its children.
+ // NotFound if "dir" does not exist, the calling process does not have
+ // permission to access "dir", or if "dir" is invalid.
+ // IOError if an IO Error was encountered
+ // The default implementation below uses GetChildren() and GetFileSize().
+ virtual IOStatus GetChildrenFileAttributes(
+ const std::string& dir, const IOOptions& options,
+ std::vector<FileAttributes>* result, IODebugContext* dbg) {
+ assert(result != nullptr);
+ std::vector<std::string> names;
+ IOStatus s = GetChildren(dir, options, &names, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ // One slot per listed name for now; trimmed to the real count at the end.
+ result->resize(names.size());
+ size_t kept = 0;
+ for (std::string& name : names) {
+ const std::string path = dir + "/" + name;
+ FileAttributes& slot = (*result)[kept];
+ s = GetFileSize(path, options, &slot.size_bytes, dbg);
+ if (s.ok()) {
+ slot.name = std::move(name);
+ ++kept;
+ } else if (!FileExists(path, options, dbg).IsNotFound()) {
+ return s;
+ }
+ // Remaining case: the file may have been deleted since the directory was
+ // listed, so its slot is simply reused for the next name.
+ }
+ result->resize(kept);
+ return IOStatus::OK();
+ }
+
+ // Delete the named file.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus DeleteFile(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Truncate the named file to the specified size.
+ // The default implementation ignores all arguments and returns
+ // NotSupported; a FileSystem that can truncate files overrides it.
+ virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("Truncate is not supported for this FileSystem");
+ }
+
+ // Create the specified directory. Returns an error if the directory
+ // already exists.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus CreateDir(const std::string& dirname,
+ const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // Create the directory if it is missing. Returns OK if the directory
+ // already exists or was created successfully.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Delete the specified directory.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus DeleteDir(const std::string& dirname,
+ const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // Store the size of fname in *file_size.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus GetFileSize(const std::string& fname,
+ const IOOptions& options, uint64_t* file_size,
+ IODebugContext* dbg) = 0;
+
+ // Store the last modification time of fname in *file_mtime.
+ // NOTE(review): the unit and epoch of *file_mtime are not stated here --
+ // confirm against the concrete implementations.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) = 0;
+ // Rename file src to target.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus RenameFile(const std::string& src, const std::string& target,
+ const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // Hard link file src to target.
+ // The default implementation ignores all arguments and returns
+ // NotSupported; a FileSystem that supports hard links overrides it.
+ virtual IOStatus LinkFile(const std::string& /*src*/,
+ const std::string& /*target*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("LinkFile is not supported for this FileSystem");
+ }
+
+ // Intended to report the number of file links of fname through *count
+ // (per the error message below). The default implementation ignores all
+ // arguments, leaves *count untouched and returns NotSupported.
+ virtual IOStatus NumFileLinks(const std::string& /*fname*/,
+ const IOOptions& /*options*/,
+ uint64_t* /*count*/, IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported(
+ "Getting number of file links is not supported for this FileSystem");
+ }
+
+ // Intended to report through *res whether first and second refer to the
+ // same file. NOTE(review): the exact notion of "same" is left to the
+ // implementation -- confirm. The default implementation ignores all
+ // arguments, leaves *res untouched and returns NotSupported.
+ virtual IOStatus AreFilesSame(const std::string& /*first*/,
+ const std::string& /*second*/,
+ const IOOptions& /*options*/, bool* /*res*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::NotSupported("AreFilesSame is not supported for this FileSystem");
+ }
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ //
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
+ FileLock** lock, IODebugContext* dbg) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ //
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+ IODebugContext* dbg) = 0;
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ //
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+ IODebugContext* dbg) = 0;
+
+ // Create a logger for storing informational messages and return it in
+ // *result.
+ // NOTE(review): the earlier wording described a default logger (an instance
+ // of EnvLogger) that derived classes can override, but this declaration is
+ // pure virtual, so any such default must live in a derived class -- confirm.
+ virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) = 0;
+
+ // Get the full directory name for this db: the absolute form of db_path is
+ // stored in *output_path.
+ // Pure virtual: every concrete FileSystem must implement this.
+ virtual IOStatus GetAbsolutePath(const std::string& db_path,
+ const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) = 0;
+
+ // OptimizeForLogRead will create a new FileOptions object that is a copy of
+ // the FileOptions in the parameters, but is optimized for reading log files.
+ // Declaration only: the default implementation is defined out of line.
+ virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
+
+ // OptimizeForManifestRead will create a new FileOptions object that is a copy
+ // of the FileOptions in the parameters, but is optimized for reading manifest
+ // files.
+ // Declaration only: the default implementation is defined out of line.
+ virtual FileOptions OptimizeForManifestRead(
+ const FileOptions& file_options) const;
+
+ // OptimizeForLogWrite will create a new FileOptions object that is a copy of
+ // the FileOptions in the parameters, but is optimized for writing log files.
+ // Default implementation returns the copy of the same object.
+ // Declaration only: that default implementation is defined out of line.
+ virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const;
+
+ // OptimizeForManifestWrite will create a new FileOptions object that is a
+ // copy of the FileOptions in the parameters, but is optimized for writing
+ // manifest files. Default implementation returns the copy of the same
+ // object.
+ // Declaration only: that default implementation is defined out of line.
+ virtual FileOptions OptimizeForManifestWrite(
+ const FileOptions& file_options) const;
+
+ // OptimizeForCompactionTableWrite will create a new FileOptions object that
+ // is a copy of the FileOptions in the parameters, but is optimized for
+ // writing table files.
+ // Declaration only: the default implementation is defined out of line.
+ virtual FileOptions OptimizeForCompactionTableWrite(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& immutable_ops) const;
+
+ // OptimizeForCompactionTableRead will create a new FileOptions object that
+ // is a copy of the FileOptions in the parameters, but is optimized for
+ // reading table files.
+ // Declaration only: the default implementation is defined out of line.
+ virtual FileOptions OptimizeForCompactionTableRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+
+ // Get the amount of free disk space for `path`, reported through *diskfree.
+ // The default implementation ignores all arguments, leaves *diskfree
+ // untouched and returns NotSupported. The status carries a message, like
+ // the other unsupported-operation defaults in this class, so the failing
+ // operation can be identified from the status text.
+ virtual IOStatus GetFreeSpace(const std::string& /*path*/,
+                               const IOOptions& /*options*/,
+                               uint64_t* /*diskfree*/,
+                               IODebugContext* /*dbg*/) {
+   return IOStatus::NotSupported(
+       "GetFreeSpace is not supported for this FileSystem");
+ }
+
+ // If you're adding methods here, remember to add them to FileSystemWrapper
+ // too.
+
+ private:
+ // No copying allowed: assignment is explicitly deleted so that any attempt
+ // to assign one FileSystem to another is rejected at compile time instead
+ // of surfacing as an access or link error.
+ void operator=(const FileSystem&) = delete;
+};
+
+// A file abstraction for reading sequentially through a file
+class FSSequentialFile {
+ public:
+ FSSequentialFile() {}
+
+ virtual ~FSSequentialFile() {}
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                       char* scratch, IODebugContext* dbg) = 0;
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+ // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ virtual IOStatus Skip(uint64_t n) = 0;
+
+ // Indicates the upper layers if the current SequentialFile implementation
+ // uses direct IO. The default is false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. The default is kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // The default implementation returns NotSupported.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+   return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // Positioned Read for direct I/O
+ // If Direct I/O enabled, offset, n, and scratch should be properly aligned.
+ // The default implementation ignores all arguments and returns
+ // NotSupported with a message naming the operation.
+ virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                 const IOOptions& /*options*/,
+                                 Slice* /*result*/, char* /*scratch*/,
+                                 IODebugContext* /*dbg*/) {
+   return IOStatus::NotSupported("PositionedRead not supported.");
+ }
+
+ // If you're adding methods here, remember to add them to
+ // FSSequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead.
+// One instance describes one read: `offset`, `len` and `scratch` are inputs,
+// while `result` and `status` are written by the read.
+// None of the members has a default member initializer in this struct.
+struct FSReadRequest {
+ // File offset in bytes
+ uint64_t offset;
+
+ // Length to read in bytes
+ size_t len;
+
+ // A buffer that MultiRead() can optionally place data in. It can
+ // ignore this and allocate its own buffer
+ char* scratch;
+
+ // Output parameter set by MultiRead() to point to the data buffer, and
+ // the number of valid bytes
+ Slice result;
+
+ // Status of read
+ IOStatus status;
+};
+
+// Interface for reading a file's contents at arbitrary offsets.
+class FSRandomAccessFile {
+ public:
+ FSRandomAccessFile() {}
+
+ virtual ~FSRandomAccessFile() {}
+
+ // Reads at most "n" bytes beginning at "offset".
+ // The routine is allowed to write into "scratch[0..n-1]" and to make
+ // "*result" refer to that memory, so "scratch[0..n-1]" has to stay alive
+ // for as long as "*result" is in use. "*result" is set to the bytes that
+ // were read, which may be fewer than "n". A non-OK status is returned when
+ // an error was encountered.
+ //
+ // Safe for concurrent use by multiple threads.
+ // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+ virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                       Slice* result, char* scratch,
+                       IODebugContext* dbg) const = 0;
+
+ // Readahead hint: cache n bytes of the file starting from offset.
+ // The default implementation does nothing and reports OK.
+ virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                           const IOOptions& /*options*/,
+                           IODebugContext* /*dbg*/) {
+   return IOStatus::OK();
+ }
+
+ // Reads the num_reqs blocks described by reqs. Implementations may issue
+ // the reads in parallel, but the call is synchronous: it returns only after
+ // every read has completed. The reads will be non-overlapping.
+ // When the returned status is not OK, the per-request statuses are ignored
+ // and the returned status is assumed for all requests; it is only meant for
+ // errors that occur before any individual request is processed.
+ //
+ // The default implementation issues one Read() per request, in order,
+ // records each outcome in that request's `status`, and returns OK.
+ virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                            const IOOptions& options, IODebugContext* dbg) {
+   assert(reqs != nullptr);
+   FSReadRequest* const end = reqs + num_reqs;
+   for (FSReadRequest* cur = reqs; cur != end; ++cur) {
+     cur->status = Read(cur->offset, cur->len, options, &cur->result,
+                        cur->scratch, dbg);
+   }
+   return IOStatus::OK();
+ }
+
+ // Attempts to produce an ID for this file that is identical every time the
+ // file is opened and does not change while the file is open, using at most
+ // "max_size" bytes. On success the ID is written to "id" and its length is
+ // returned; otherwise 0 is returned and "id" may be left unmodified.
+ //
+ // For IDs from a given environment, no unique ID is a prefix of another:
+ // two unique ids cannot be made equal by appending arbitrary bytes to one
+ // of them.
+ //
+ // The returned ID is guaranteed not to be interpretable as a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+   // Returning 0 by default keeps older implementations that do not
+   // provide an ID working (backwards compatibility).
+   return 0;
+ }
+
+ enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed };
+
+ // Access-pattern hint; ignored by default.
+ virtual void Hint(AccessPattern /*pattern*/) {}
+
+ // Tells the upper layers whether this RandomAccessFile implementation
+ // uses direct IO. Defaults to false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Alignment to use when allocating an aligned buffer for Direct I/O.
+ // Defaults to kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Drops any cached data for the range offset to offset+length of this
+ // file; a length of 0 refers to the end of file. A noop if the system is
+ // not caching the file contents. The default returns NotSupported.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+   return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // If you're adding methods here, remember to add them to
+ // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+//
+// Instances are not copyable: the copy constructor and copy assignment are
+// deleted.
+class FSWritableFile {
+ public:
+ // Starts with no preallocation (block size 0), IO priority Env::IO_TOTAL,
+ // write hint Env::WLTH_NOT_SET and strict_bytes_per_sync disabled.
+ FSWritableFile()
+     : last_preallocated_block_(0),
+       preallocation_block_size_(0),
+       io_priority_(Env::IO_TOTAL),
+       write_hint_(Env::WLTH_NOT_SET),
+       strict_bytes_per_sync_(false) {}
+
+ // Same defaults as above, except that strict_bytes_per_sync is taken from
+ // `options`.
+ explicit FSWritableFile(const FileOptions& options)
+     : last_preallocated_block_(0),
+       preallocation_block_size_(0),
+       io_priority_(Env::IO_TOTAL),
+       write_hint_(Env::WLTH_NOT_SET),
+       strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+
+ virtual ~FSWritableFile() {}
+
+ // Append data to the end of the file
+ // Note: A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                         IODebugContext* dbg) = 0;
+
+ // PositionedAppend data to the specified offset. The new EOF after append
+ // must be larger than the previous EOF. This is to be used when writes are
+ // not backed by OS buffers and hence has to always start from the start of
+ // the sector. The implementation thus needs to also rewrite the last
+ // partial sector.
+ // Note: PositionAppend does not guarantee moving the file offset after the
+ // write. A WritableFile object must support either Append or
+ // PositionedAppend, so the users cannot mix the two.
+ //
+ // PositionedAppend() can only happen on the page/sector boundaries. For that
+ // reason, if the last write was an incomplete sector we still need to rewind
+ // back to the nearest sector/page and rewrite the portion of it with whatever
+ // we need to add. We need to keep where we stop writing.
+ //
+ // PositionedAppend() can only write whole sectors. For that reason we have to
+ // pad with zeros for the last write and trim the file when closing according
+ // to the position we keep in the previous step.
+ //
+ // PositionedAppend() requires aligned buffer to be passed in. The alignment
+ // required is queried via GetRequiredBufferAlignment()
+ //
+ // The default implementation ignores all arguments and returns NotSupported
+ // with a message naming the operation.
+ virtual IOStatus PositionedAppend(const Slice& /* data */,
+                                   uint64_t /* offset */,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+   return IOStatus::NotSupported("PositionedAppend not supported.");
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+ // size due to whole pages writes. The behavior is undefined if called
+ // with other writes to follow.
+ // The default implementation does nothing and returns OK.
+ virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+                           IODebugContext* /*dbg*/) {
+   return IOStatus::OK();
+ }
+ virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+ virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+ virtual IOStatus Sync(const IOOptions& options,
+                       IODebugContext* dbg) = 0;  // sync data
+
+ /*
+  * Sync data and/or metadata as well.
+  * By default, sync only data.
+  * Override this method for environments where we need to sync
+  * metadata as well.
+  */
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+   return Sync(options, dbg);
+ }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush(). The default is false.
+ virtual bool IsSyncThreadSafe() const { return false; }
+
+ // Indicates the upper layers if the current WritableFile implementation
+ // uses direct IO. The default is false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. The default is kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Record the write life time hint; the default only stores the value.
+ virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+   write_hint_ = hint;
+ }
+
+ // Record the IO priority; the default only stores the value.
+ virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+ // Return the stored IO priority.
+ virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+ // Return the stored write life time hint.
+ virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+ /*
+  * Get the size of valid data in the file.
+  * The default implementation returns 0.
+  */
+ virtual uint64_t GetFileSize(const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) {
+   return 0;
+ }
+
+ /*
+  * Get and set the default pre-allocation block size for writes to
+  * this file. If non-zero, then Allocate will be used to extend the
+  * underlying storage of a file (generally via fallocate) if the Env
+  * instance supports it.
+  */
+ virtual void SetPreallocationBlockSize(size_t size) {
+   preallocation_block_size_ = size;
+ }
+
+ // Report the current preallocation block size and the index of the last
+ // block handed to Allocate() by PrepareWrite(). Both pointers must be
+ // non-null; they are written unconditionally.
+ virtual void GetPreallocationStatus(size_t* block_size,
+                                     size_t* last_allocated_block) {
+   *last_allocated_block = last_preallocated_block_;
+   *block_size = preallocation_block_size_;
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+   return 0;  // Default implementation to prevent issues with backwards
+              // compatibility.
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ // The default implementation returns NotSupported.
+ virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+   return IOStatus::NotSupported("InvalidateCache not supported.");
+ }
+
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation ignores the range: it calls Sync() when
+ // strict_bytes_per_sync was requested and otherwise does nothing.
+ virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                            const IOOptions& options, IODebugContext* dbg) {
+   if (strict_bytes_per_sync_) {
+     return Sync(options, dbg);
+   }
+   return IOStatus::OK();
+ }
+
+ // PrepareWrite performs any necessary preparation for a write
+ // before the write actually occurs. This allows for pre-allocation
+ // of space on devices where it can result in less file
+ // fragmentation and/or less waste from over-zealous filesystem
+ // pre-allocation.
+ //
+ // Does nothing while the preallocation block size is 0.
+ virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                           IODebugContext* dbg) {
+   if (preallocation_block_size_ == 0) {
+     return;
+   }
+   // If this write would cross one or more preallocation blocks,
+   // determine what the last preallocation block necessary to
+   // cover this write would be and Allocate to that point.
+   const auto block_size = preallocation_block_size_;
+   size_t new_last_preallocated_block =
+       (offset + len + block_size - 1) / block_size;
+   if (new_last_preallocated_block > last_preallocated_block_) {
+     size_t num_spanned_blocks =
+         new_last_preallocated_block - last_preallocated_block_;
+     // This function returns void, so the status returned by Allocate() is
+     // not checked; the block counter advances regardless of the outcome.
+     Allocate(block_size * last_preallocated_block_,
+              block_size * num_spanned_blocks, options, dbg);
+     last_preallocated_block_ = new_last_preallocated_block;
+   }
+ }
+
+ // Pre-allocates space for a file.
+ // The default implementation does nothing and returns OK.
+ virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                           const IOOptions& /*options*/,
+                           IODebugContext* /*dbg*/) {
+   return IOStatus::OK();
+ }
+
+ // If you're adding methods here, remember to add them to
+ // WritableFileWrapper too.
+
+ protected:
+ size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+ size_t last_preallocated_block_;
+ size_t preallocation_block_size_;
+ // No copying allowed. Explicitly deleted so that a copy attempt is
+ // diagnosed at compile time rather than at link time.
+ FSWritableFile(const FSWritableFile&) = delete;
+ void operator=(const FSWritableFile&) = delete;
+
+ protected:
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+// Instances are not copyable.
+class FSRandomRWFile {
+ public:
+ FSRandomRWFile() {}
+
+ virtual ~FSRandomRWFile() {}
+
+ // Indicates if the class makes use of direct I/O
+ // If true you must pass aligned buffer to Write()
+ // The default is false.
+ virtual bool use_direct_io() const { return false; }
+
+ // Use the returned alignment value to allocate
+ // aligned buffer for Direct I/O. The default is kDefaultPageSize.
+ virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+ // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
+ // Pass aligned buffer when use_direct_io() returns true.
+ virtual IOStatus Write(uint64_t offset, const Slice& data,
+                        const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // Read up to `n` bytes starting from offset `offset` and store them in
+ // result, provided `scratch` size should be at least `n`.
+ // Returns Status::OK() on success.
+ virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                       Slice* result, char* scratch,
+                       IODebugContext* dbg) const = 0;
+
+ virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // By default Fsync() simply forwards to Sync().
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+   return Sync(options, dbg);
+ }
+
+ virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // If you're adding methods here, remember to add them to
+ // RandomRWFileWrapper too.
+
+ // No copying allowed. The deleted operations must take this class's own
+ // type: a parameter of another class type would only delete a conversion
+ // and leave the implicit copy constructor and copy assignment available.
+ FSRandomRWFile(const FSRandomRWFile&) = delete;
+ FSRandomRWFile& operator=(const FSRandomRWFile&) = delete;
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+// The class only stores the base address and length it is constructed with;
+// it performs no mapping or unmapping itself.
+class FSMemoryMappedFileBuffer {
+ public:
+ // Records the start address and the length of an existing mapping.
+ FSMemoryMappedFileBuffer(void* _base, size_t _length)
+ : base_(_base), length_(_length) {}
+
+ // Pure virtual destructor: the class is abstract. Per the C++ rules a pure
+ // virtual destructor still needs a definition; it is not in this part of
+ // the file.
+ virtual ~FSMemoryMappedFileBuffer() = 0;
+
+ // Copying is disabled because we do not want to unmap the same mapping
+ // twice. The class could be made movable if desired.
+ FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete;
+ FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete;
+
+ // Accessors for the stored base address and length.
+ void* GetBase() const { return base_; }
+ size_t GetLen() const { return length_; }
+
+ protected:
+ void* base_;
+ const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class FSDirectory {
+ public:
+ virtual ~FSDirectory() {}
+ // Fsync directory. Can be called concurrently from multiple threads.
+ // Pure virtual: every concrete FSDirectory must implement this.
+ virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+ // The default implementation ignores both arguments and returns 0.
+ // NOTE(review): presumably follows the same contract as
+ // FSRandomAccessFile::GetUniqueId(), where 0 means no ID was produced --
+ // confirm.
+ virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+ return 0;
+ }
+
+ // If you're adding methods here, remember to add them to
+ // DirectoryWrapper too.
+};
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::FSSequentialFileWrapper {
+// public:
+// MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target):
+// ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {}
+// IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+// char* scratch, IODebugContext* dbg) override {
+// cout << "Doing a read of size " << n << "!" << endl;
+// return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options,
+// result,
+// scratch, dbg);
+// }
+// // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+// forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+// rocksdb class. Unless you actually want to override the behavior.
+// (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class FileSystemWrapper : public FileSystem {
+ public:
+ // Initialize an EnvWrapper that delegates all calls to *t
+ explicit FileSystemWrapper(FileSystem* t) : target_(t) {}
+ ~FileSystemWrapper() override {}
+
+ // Return the target to which this Env forwards all calls
+ FileSystem* target() const { return target_; }
+
+ // The following text is boilerplate that forwards all methods to target()
+ IOStatus NewSequentialFile(const std::string& f,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewSequentialFile(f, file_opts, r, dbg);
+ }
+ IOStatus NewRandomAccessFile(const std::string& f,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewRandomAccessFile(f, file_opts, r, dbg);
+ }
+ IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override {
+ return target_->NewWritableFile(f, file_opts, r, dbg);
+ }
+ IOStatus ReopenWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ return target_->ReopenWritableFile(fname, file_opts, result, dbg);
+ }
+ IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override {
+ return target_->ReuseWritableFile(fname, old_fname, file_opts, r,
+ dbg);
+ }
+ IOStatus NewRandomRWFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override {
+ return target_->NewRandomRWFile(fname, file_opts, result, dbg);
+ }
+ IOStatus NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+ return target_->NewMemoryMappedFileBuffer(fname, result);
+ }
+ IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override {
+ return target_->NewDirectory(name, io_opts, result, dbg);
+ }
+ IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+ IODebugContext* dbg) override {
+ return target_->FileExists(f, io_opts, dbg);
+ }
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override {
+ return target_->GetChildren(dir, io_opts, r, dbg);
+ }
+ IOStatus GetChildrenFileAttributes(const std::string& dir,
+ const IOOptions& options,
+ std::vector<FileAttributes>* result,
+ IODebugContext* dbg) override {
+ return target_->GetChildrenFileAttributes(dir, options, result, dbg);
+ }
+ IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->DeleteFile(f, options, dbg);
+ }
+ IOStatus Truncate(const std::string& fname, size_t size,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Truncate(fname, size, options, dbg);
+ }
+ IOStatus CreateDir(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->CreateDir(d, options, dbg);
+ }
+ IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->CreateDirIfMissing(d, options, dbg);
+ }
+ IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->DeleteDir(d, options, dbg);
+ }
+ IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+ uint64_t* s, IODebugContext* dbg) override {
+ return target_->GetFileSize(f, options, s, dbg);
+ }
+
+ IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) override {
+ return target_->GetFileModificationTime(fname, options, file_mtime, dbg);
+ }
+
+ IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) override {
+ return target_->GetAbsolutePath(db_path, options, output_path, dbg);
+ }
+
+ IOStatus RenameFile(const std::string& s, const std::string& t,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->RenameFile(s, t, options, dbg);
+ }
+
+ IOStatus LinkFile(const std::string& s, const std::string& t,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->LinkFile(s, t, options, dbg);
+ }
+
+ IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
+ uint64_t* count, IODebugContext* dbg) override {
+ return target_->NumFileLinks(fname, options, count, dbg);
+ }
+
+ IOStatus AreFilesSame(const std::string& first, const std::string& second,
+ const IOOptions& options, bool* res,
+ IODebugContext* dbg) override {
+ return target_->AreFilesSame(first, second, options, res, dbg);
+ }
+
+ IOStatus LockFile(const std::string& f, const IOOptions& options,
+ FileLock** l, IODebugContext* dbg) override {
+ return target_->LockFile(f, options, l, dbg);
+ }
+
+ IOStatus UnlockFile(FileLock* l, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->UnlockFile(l, options, dbg);
+ }
+
+ IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+ IODebugContext* dbg) override {
+ return target_->GetTestDirectory(options, path, dbg);
+ }
+ IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) override {
+ return target_->NewLogger(fname, options, result, dbg);
+ }
+
+ FileOptions OptimizeForLogRead(
+ const FileOptions& file_options) const override {  // Pass-through wrapper.
+ return target_->OptimizeForLogRead(file_options);  // returns whatever FileOptions target_ produces
+ }
+ FileOptions OptimizeForManifestRead(
+ const FileOptions& file_options) const override {  // Pass-through wrapper.
+ return target_->OptimizeForManifestRead(file_options);  // returns whatever FileOptions target_ produces
+ }
+ FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const override {  // Pass-through wrapper.
+ return target_->OptimizeForLogWrite(file_options, db_options);  // returns whatever FileOptions target_ produces
+ }
+ FileOptions OptimizeForManifestWrite(
+ const FileOptions& file_options) const override {  // Pass-through wrapper.
+ return target_->OptimizeForManifestWrite(file_options);  // returns whatever FileOptions target_ produces
+ }
+ FileOptions OptimizeForCompactionTableWrite(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& immutable_ops) const override {  // Pass-through wrapper.
+ return target_->OptimizeForCompactionTableWrite(file_options,
+ immutable_ops);  // returns whatever FileOptions target_ produces
+ }
+ FileOptions OptimizeForCompactionTableRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const override {  // Pass-through wrapper.
+ return target_->OptimizeForCompactionTableRead(file_options, db_options);  // returns whatever FileOptions target_ produces
+ }
+ IOStatus GetFreeSpace(const std::string& path, const IOOptions& options,
+ uint64_t* diskfree, IODebugContext* dbg) override {  // Pass-through wrapper.
+ return target_->GetFreeSpace(path, options, diskfree, dbg);  // diskfree is passed straight to target_
+ }
+
+ private:
+ FileSystem* target_;
+};
+
+class FSSequentialFileWrapper : public FSSequentialFile {  // Every override below forwards to target_.
+ public:
+ explicit FSSequentialFileWrapper(FSSequentialFile* target)
+ : target_(target) {}  // only stores the pointer; no null check is performed
+
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override {
+ return target_->Read(n, options, result, scratch, dbg);
+ }
+ IOStatus Skip(uint64_t n) override { return target_->Skip(n); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ return target_->PositionedRead(offset, n, options, result, scratch, dbg);
+ }
+
+ private:
+ FSSequentialFile* target_;  // raw pointer; this class declares no destructor, so it never deletes it
+};
+
+class FSRandomAccessFileWrapper : public FSRandomAccessFile {  // Every override below forwards to target_.
+ public:
+ explicit FSRandomAccessFileWrapper(FSRandomAccessFile* target)
+ : target_(target) {}  // only stores the pointer; no null check is performed
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ return target_->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override {
+ return target_->MultiRead(reqs, num_reqs, options, dbg);
+ }
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Prefetch(offset, n, options, dbg);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }  // no ';' here: a semicolon after an in-class function body is redundant (-Wextra-semi)
+ void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ private:
+ FSRandomAccessFile* target_;  // raw pointer; this class declares no destructor, so it never deletes it
+};
+
+class FSWritableFileWrapper : public FSWritableFile {  // Every override below forwards to target_.
+ public:
+ explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {}  // only stores the pointer; no null check
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Append(data, options, dbg);
+ }
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->PositionedAppend(data, offset, options, dbg);
+ }
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Truncate(size, options, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Close(options, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+ bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+
+ void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+ target_->SetWriteLifeTimeHint(hint);
+ }
+
+ Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+ return target_->GetWriteLifeTimeHint();
+ }
+
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->GetFileSize(options, dbg);
+ }
+
+ void SetPreallocationBlockSize(size_t size) override {
+ target_->SetPreallocationBlockSize(size);
+ }
+
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ target_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->RangeSync(offset, nbytes, options, dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ target_->PrepareWrite(offset, len, options, dbg);
+ }
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Allocate(offset, len, options, dbg);
+ }
+
+ private:
+ FSWritableFile* target_;  // raw pointer; this class declares no destructor, so it never deletes it
+};
+
+class FSRandomRWFileWrapper : public FSRandomRWFile {  // Every override below forwards to target_.
+ public:
+ explicit FSRandomRWFileWrapper(FSRandomRWFile* target) : target_(target) {}  // only stores the pointer
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Write(offset, data, options, dbg);
+ }
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ return target_->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Close(options, dbg);
+ }
+
+ private:
+ FSRandomRWFile* target_;  // raw pointer; this class declares no destructor, so it never deletes it
+};
+
+class FSDirectoryWrapper : public FSDirectory {  // Both overrides below forward to target_.
+ public:
+ explicit FSDirectoryWrapper(FSDirectory* target) : target_(target) {}  // only stores the pointer
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return target_->Fsync(options, dbg);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ private:
+ FSDirectory* target_;  // raw pointer; this class declares no destructor, so it never deletes it
+};
+
+// A utility routine: read the contents of the file named fname into *data, using fs for access.
+extern Status ReadFileToString(FileSystem* fs, const std::string& fname,
+ std::string* data);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h
new file mode 100644
index 000000000..03d6471cf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/filter_policy.h
@@ -0,0 +1,200 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys. These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#pragma once
+
+#include <stdlib.h>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct BlockBasedTableOptions;
+
+// A class that takes a bunch of keys, then generates a filter from them
+class FilterBitsBuilder {
+ public:
+ virtual ~FilterBitsBuilder() {}
+
+ // Add a key to the filter. The implementation may store it in any form,
+ // such as a hash or the original key.
+ // Keys are in sorted order and duplicated keys are possible.
+ virtual void AddKey(const Slice& key) = 0;
+
+ // Generate the filter using the keys that are added
+ // The return value of this function would be the filter bits,
+ // The ownership of actual data is set to buf
+ virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
+
+ // Calculate the number of keys that can be added while keeping the filter
+ // <= the specified number of bytes. The default throws (aborts in LITE).
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4702) // unreachable code
+#endif
+ virtual int CalculateNumEntry(const uint32_t /*bytes*/) {
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error("CalculateNumEntry not Implemented");
+#else
+ abort();
+#endif
+ return 0;  // never reached; this is the unreachable code the MSVC pragma above silences
+ }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+};
+
+// A class that checks if a key can be in filter
+// It should be initialized by Slice generated by BitsBuilder
+class FilterBitsReader {
+ public:
+ virtual ~FilterBitsReader() {}
+
+ // Check if the entry matches the bits in the filter
+ virtual bool MayMatch(const Slice& entry) = 0;
+
+ // Batched check; the default calls MayMatch(*keys[i]) for each i < num_keys
+ virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) {
+ for (int i = 0; i < num_keys; ++i) {
+ may_match[i] = MayMatch(*keys[i]);  // result for keys[i] is written to may_match[i]
+ }
+ }
+};
+
+// Contextual information passed to BloomFilterPolicy at filter building time.
+// Used in overriding FilterPolicy::GetBuilderWithContext(). References other
+// structs because this is expected to be a temporary, stack-allocated object.
+struct FilterBuildingContext {
+ // This constructor is for internal use only and subject to change.
+ FilterBuildingContext(const BlockBasedTableOptions& table_options);
+
+ // Options for the table being built
+ const BlockBasedTableOptions& table_options;
+
+ // Name of the column family for the table (or empty string if unknown)
+ std::string column_family_name;
+
+ // The compaction style in effect for the table
+ CompactionStyle compaction_style = kCompactionStyleLevel;
+
+ // The table level at time of constructing the SST file, or -1 if unknown.
+ // (The table file could later be used at a different level.)
+ int level_at_creation = -1;
+
+ // An optional logger for reporting errors, warnings, etc.
+ Logger* info_log = nullptr;
+};
+
+// We add a new format of filter block called full filter block
+// This new interface gives you more space of customization
+//
+// For the full filter block, you can plug in your version by implementing
+// the FilterBitsBuilder and FilterBitsReader
+//
+// There are two sets of interfaces in FilterPolicy
+// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter
+// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for
+// full filter.
+// Set 1 MUST be implemented correctly, Set 2 is optional
+// RocksDB first tries the functions in Set 2. If they return nullptr,
+// it uses Set 1 instead.
+// You can choose filter type in NewBloomFilterPolicy
+class FilterPolicy {
+ public:
+ virtual ~FilterPolicy();
+
+ // Return the name of this policy. Note that if the filter encoding
+ // changes in an incompatible way, the name returned by this method
+ // must be changed. Otherwise, old incompatible filters may be
+ // passed to methods of this type.
+ virtual const char* Name() const = 0;
+
+ // keys[0,n-1] contains a list of keys (potentially with duplicates)
+ // that are ordered according to the user supplied comparator.
+ // Append a filter that summarizes keys[0,n-1] to *dst.
+ //
+ // Warning: do not change the initial contents of *dst. Instead,
+ // append the newly constructed filter to *dst.
+ virtual void CreateFilter(const Slice* keys, int n,
+ std::string* dst) const = 0;
+
+ // "filter" contains the data appended by a preceding call to
+ // CreateFilter() on this class. This method must return true if
+ // the key was in the list of keys passed to CreateFilter().
+ // This method may return true or false if the key was not on the
+ // list, but it should aim to return false with a high probability.
+ virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+
+ // Return a new FilterBitsBuilder for full or partitioned filter blocks, or
+ // nullptr if using block-based filter.
+ // NOTE: This function is only called by GetBuilderWithContext() below for
+ // custom FilterPolicy implementations. Thus, it is not necessary to
+ // override this function if overriding GetBuilderWithContext().
+ virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; }
+
+ // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy
+ // to customize the builder for contextual constraints and hints.
+ // (Name changed to avoid triggering -Werror=overloaded-virtual.)
+ // If overriding GetFilterBitsBuilder() suffices, it is not necessary to
+ // override this function.
+ virtual FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext&) const {
+ return GetFilterBitsBuilder();  // default ignores the context argument
+ }
+
+ // Return a new FilterBitsReader for full or partitioned filter blocks, or
+ // nullptr if using block-based filter.
+ // As here, the input slice should NOT be deleted by FilterPolicy.
+ virtual FilterBitsReader* GetFilterBitsReader(
+ const Slice& /*contents*/) const {
+ return nullptr;
+ }
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.
+//
+// bits_per_key: average bits allocated per key in bloom filter. A good
+// choice is 9.9, which yields a filter with ~ 1% false positive rate.
+// When format_version < 5, the value will be rounded to the nearest
+// integer. Recommend using no more than three decimal digits after the
+// decimal point, as in 6.667.
+//
+// use_block_based_builder: use the deprecated block-based filter (true)
+// rather than a full or partitioned filter (false, the default).
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys. For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(
+ double bits_per_key, bool use_block_based_builder = false);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h
new file mode 100644
index 000000000..badc0808a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class BlockBuilder;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+class FlushBlockPolicy {
+ public:
+ // Keep track of the key/value sequences and return the boolean value to
+ // determine if table builder should flush current data block.
+ virtual bool Update(const Slice& key, const Slice& value) = 0;
+
+ virtual ~FlushBlockPolicy() {}
+};
+
+class FlushBlockPolicyFactory {  // Abstract factory: both operations below are pure virtual.
+ public:
+ // Return the name of the flush block policy.
+ virtual const char* Name() const = 0;
+
+ // Return a new flush block policy that flushes data blocks by data size.
+ // FlushBlockPolicy may need to access the metadata of the data block
+ // builder to determine when to flush the blocks.
+ //
+ // Callers must delete the result after any database that is using the
+ // result has been closed.
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const = 0;
+
+ virtual ~FlushBlockPolicyFactory() {}
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ FlushBlockBySizePolicyFactory() {}
+
+ const char* Name() const override { return "FlushBlockBySizePolicyFactory"; }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(  // override; only declared here, defined outside this header
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const override;
+
+ static FlushBlockPolicy* NewFlushBlockPolicy(  // static overload taking size/deviation directly; also defined elsewhere
+ const uint64_t size, const int deviation,
+ const BlockBuilder& data_block_builder);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h
new file mode 100644
index 000000000..a1de859ad
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/io_status.h
@@ -0,0 +1,232 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// An IOStatus encapsulates the result of an operation. It may indicate
+// success, or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on an IOStatus without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same IOStatus must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/slice.h"
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+#include "status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IOStatus : public Status {  // Status plus retryable / data-loss / scope attributes.
+ public:
+ using Code = Status::Code;
+ using SubCode = Status::SubCode;
+
+ enum IOErrorScope {
+ kIOErrorScopeFileSystem,
+ kIOErrorScopeFile,
+ kIOErrorScopeRange,
+ kIOErrorScopeMax,
+ };
+
+ // Create a success status.
+ IOStatus() : IOStatus(kOk, kNone) {}
+ ~IOStatus() {}
+
+ // Copy the specified status.
+ IOStatus(const IOStatus& s);
+ IOStatus& operator=(const IOStatus& s);
+ IOStatus(IOStatus&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ ;
+ IOStatus& operator=(IOStatus&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ ;
+ bool operator==(const IOStatus& rhs) const;
+ bool operator!=(const IOStatus& rhs) const;
+
+ void SetRetryable(bool retryable) { retryable_ = retryable; }
+ void SetDataLoss(bool data_loss) { data_loss_ = data_loss; }
+ void SetScope(IOErrorScope scope) { scope_ = scope; }
+
+ bool GetRetryable() const { return retryable_; }
+ bool GetDataLoss() const { return data_loss_; }
+ IOErrorScope GetScope() const { return scope_; }
+
+ // Return a success status.
+ static IOStatus OK() { return IOStatus(); }
+
+ static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kNotSupported, msg, msg2);
+ }
+ static IOStatus NotSupported(SubCode msg = kNone) {
+ return IOStatus(kNotSupported, msg);
+ }
+
+ // Return error status of an appropriate type.
+ static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kNotFound, msg, msg2);
+ }
+ // Fast path for not found: no message is passed, only a subcode.
+ static IOStatus NotFound(SubCode msg = kNone) {
+ return IOStatus(kNotFound, msg);
+ }
+
+ static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kCorruption, msg, msg2);
+ }
+ static IOStatus Corruption(SubCode msg = kNone) {
+ return IOStatus(kCorruption, msg);
+ }
+
+ static IOStatus InvalidArgument(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return IOStatus(kInvalidArgument, msg, msg2);
+ }
+ static IOStatus InvalidArgument(SubCode msg = kNone) {
+ return IOStatus(kInvalidArgument, msg);
+ }
+
+ static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, msg, msg2);
+ }
+ static IOStatus IOError(SubCode msg = kNone) {
+ return IOStatus(kIOError, msg);
+ }
+
+ static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); }
+ static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kBusy, msg, msg2);
+ }
+
+ static IOStatus TimedOut(SubCode msg = kNone) {
+ return IOStatus(kTimedOut, msg);
+ }
+ static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kTimedOut, msg, msg2);
+ }
+
+ static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); }  // code kIOError, subcode kNoSpace
+ static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, kNoSpace, msg, msg2);
+ }
+
+ static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); }  // code kIOError, subcode kPathNotFound
+ static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return IOStatus(kIOError, kPathNotFound, msg, msg2);
+ }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ // std::string ToString() const;
+
+ private:
+ friend IOStatus status_to_io_status(Status&&);
+ bool retryable_;  // read/written by Get/SetRetryable(); the private constructors start it at false
+ bool data_loss_;  // read/written by Get/SetDataLoss(); the private constructors start it at false
+ IOErrorScope scope_;  // read/written by Get/SetScope(); the private constructors start it at kIOErrorScopeFileSystem
+
+ explicit IOStatus(Code _code, SubCode _subcode = kNone)
+ : Status(_code, _subcode),
+ retryable_(false),
+ data_loss_(false),
+ scope_(kIOErrorScopeFileSystem) {}
+
+ IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2);
+ IOStatus(Code _code, const Slice& msg, const Slice& msg2)
+ : IOStatus(_code, kNone, msg, msg2) {}  // delegates with subcode kNone
+};
+
+inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg,  // Builds state_ as msg, or msg + ": " + msg2.
+ const Slice& msg2)
+ : Status(_code, _subcode),
+ retryable_(false),
+ data_loss_(false),
+ scope_(kIOErrorScopeFileSystem) {
+ assert(code_ != kOk);
+ assert(subcode_ != kMaxSubCode);
+ const size_t len1 = msg.size();
+ const size_t len2 = msg2.size();
+ const size_t size = len1 + (len2 ? (2 + len2) : 0);  // the 2 is for the ": " separator written below
+ char* const result = new char[size + 1]; // +1 for null terminator
+ memcpy(result, msg.data(), len1);
+ if (len2) {  // separator and msg2 are appended only when msg2 is non-empty
+ result[len1] = ':';
+ result[len1 + 1] = ' ';
+ memcpy(result + len1 + 2, msg2.data(), len2);
+ }
+ result[size] = '\0'; // null terminator for C style string
+ state_ = result;  // the new[] buffer is stored in state_ (copy assignment releases it with delete[])
+}
+
+inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) {  // Copy constructor.
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);  // CopyState() is skipped when s has no state
+}
+inline IOStatus& IOStatus::operator=(const IOStatus& s) {
+ // The following condition only catches aliasing (when this == &s);
+ // in every other case all fields, including state_, are copied from s.
+ if (this != &s) {
+ code_ = s.code_;
+ subcode_ = s.subcode_;
+ retryable_ = s.retryable_;
+ data_loss_ = s.data_loss_;
+ scope_ = s.scope_;
+ delete[] state_;  // release the old message before taking a copy of s's
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+ }
+ return *this;
+}
+
+inline IOStatus::IOStatus(IOStatus&& s)  // Move constructor.
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ : IOStatus() {  // start as a default (kOk, kNone) status ...
+ *this = std::move(s);  // ... then reuse move assignment to take over s
+}
+
+inline IOStatus& IOStatus::operator=(IOStatus&& s)  // Move-assign: take every field from s and leave s as a default OK status.
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+{
+ if (this != &s) {  // self-move leaves *this untouched
+ code_ = std::move(s.code_);
+ s.code_ = kOk;
+ subcode_ = std::move(s.subcode_);
+ s.subcode_ = kNone;
+ retryable_ = s.retryable_;
+ s.retryable_ = false;  // reset the SOURCE; resetting *this here would discard the flag just moved in
+ data_loss_ = s.data_loss_;
+ s.data_loss_ = false;  // reset the source, not the destination
+ scope_ = s.scope_;
+ s.scope_ = kIOErrorScopeFileSystem;  // reset the source, not the destination
+ delete[] state_;  // drop our old message ...
+ state_ = nullptr;
+ std::swap(state_, s.state_);  // ... then take s's buffer, leaving s with nullptr
+ }
+ return *this;
+}
+
+inline bool IOStatus::operator==(const IOStatus& rhs) const {
+ return (code_ == rhs.code_);  // only code_ is compared; subcode, message and the IO flags are ignored
+}
+
+inline bool IOStatus::operator!=(const IOStatus& rhs) const {
+ return !(*this == rhs);  // exact negation of operator== above, so it also looks at code_ only
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h
new file mode 100644
index 000000000..b31b6d70a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iostats_context.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+// A thread local context for gathering io-stats efficiently and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+namespace ROCKSDB_NAMESPACE {
+
+struct IOStatsContext {
+ // reset all io-stats counters to zero
+ void Reset();
+
+ std::string ToString(bool exclude_zero_counters = false) const;
+
+ // the thread pool id
+ uint64_t thread_pool_id;
+
+ // number of bytes that have been written.
+ uint64_t bytes_written;
+ // number of bytes that have been read.
+ uint64_t bytes_read;
+
+ // time spent in open() and fopen().
+ uint64_t open_nanos;
+ // time spent in fallocate().
+ uint64_t allocate_nanos;
+ // time spent in write() and pwrite().
+ uint64_t write_nanos;
+ // time spent in read() and pread()
+ uint64_t read_nanos;
+ // time spent in sync_file_range().
+ uint64_t range_sync_nanos;
+ // time spent in fsync
+ uint64_t fsync_nanos;
+ // time spent in preparing write (fallocate etc).
+ uint64_t prepare_write_nanos;
+ // time spent in Logger::Logv().
+ uint64_t logger_nanos;
+ // CPU time spent in write() and pwrite()
+ uint64_t cpu_write_nanos;
+ // CPU time spent in read() and pread()
+ uint64_t cpu_read_nanos;
+};
+
+// Get a pointer to the thread-local IOStatsContext object
+IOStatsContext* get_iostats_context();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h
new file mode 100644
index 000000000..2f8f1e385
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iterator.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/cleanable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator : public Cleanable {
+ public:
+ Iterator() {}
+ // No copying allowed
+ Iterator(const Iterator&) = delete;
+ void operator=(const Iterator&) = delete;
+
+ virtual ~Iterator() {}
+
+ // An iterator is either positioned at a key/value pair, or
+ // not valid. This method returns true iff the iterator is valid.
+ // Always returns false if !status().ok().
+ virtual bool Valid() const = 0;
+
+ // Position at the first key in the source. The iterator is Valid()
+ // after this call iff the source is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last key in the source. The iterator is
+ // Valid() after this call iff the source is not empty.
+ virtual void SeekToLast() = 0;
+
+ // Position at the first key in the source that is at or past target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ // All Seek*() methods clear any error status() that the iterator had prior to
+ // the call; after the seek, status() indicates only the error (if any) that
+ // happened during the seek, not any past errors.
+ virtual void Seek(const Slice& target) = 0;
+
+ // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const Slice& target) = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Moves to the previous entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the first entry in the source.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return the value for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice value() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ // If non-blocking IO is requested and this operation cannot be
+ // satisfied without doing some IO, then this returns Status::Incomplete().
+ virtual Status status() const = 0;
+
+ // If supported, renew the iterator to represent the latest state. The
+ // iterator will be invalidated after the call. Not supported if
+ // ReadOptions.snapshot is given when creating the iterator.
+ virtual Status Refresh() {
+ return Status::NotSupported("Refresh() is not supported");  // default for subclasses that do not override
+ }
+
+ // Property "rocksdb.iterator.is-key-pinned":
+ // If returning "1", this means that the Slice returned by key() is valid
+ // as long as the iterator is not deleted.
+ // It is guaranteed to always return "1" if
+ // - Iterator created with ReadOptions::pin_data = true
+ // - DB tables were created with
+ // BlockBasedTableOptions::use_delta_encoding = false.
+ // Property "rocksdb.iterator.super-version-number":
+ // LSM version used by the iterator. The same format as DB Property
+ // kCurrentSuperVersionNumber. See its comment for more information.
+ // Property "rocksdb.iterator.internal-key":
+ // Get the user-key portion of the internal key at which the iteration
+ // stopped.
+ virtual Status GetProperty(std::string prop_name, std::string* prop);
+};
+
+// Return an empty iterator (yields nothing). NOTE(review): ownership of the returned raw pointer is not stated here - confirm.
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status. NOTE(review): ownership of the returned raw pointer is not stated here - confirm.
+extern Iterator* NewErrorIterator(const Status& status);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h
new file mode 100644
index 000000000..22ea7734f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/ldb_tool.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Interface for converting a Slice into a human-readable string.
+class SliceFormatter {
+ public:
+ virtual ~SliceFormatter() {}
+ virtual std::string Format(const Slice& s) const = 0;  // readable form of s
+};
+
+// Options for customizing the ldb tool (beyond the DB Options).
+struct LDBOptions {
+ // Create LDBOptions with default values for all fields.
+ LDBOptions();
+
+ // Key formatter that converts a slice to a readable string.
+ // Default: Slice::ToString()
+ std::shared_ptr<SliceFormatter> key_formatter;
+
+ std::string print_help_header = "ldb - RocksDB Tool";
+};
+
+class LDBTool {  // runs the ldb tool; see LDBOptions for customization
+ public:
+ void Run(
+ int argc, char** argv, Options db_options = Options(),
+ const LDBOptions& ldb_options = LDBOptions(),
+ const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h
new file mode 100644
index 000000000..d1c953f0f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/listener.h
@@ -0,0 +1,491 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+ TablePropertiesCollection;  // table properties looked up by string key
+
+class DB;
+class ColumnFamilyHandle;
+class Status;
+struct CompactionJobStats;
+enum CompressionType : unsigned char;
+
+enum class TableFileCreationReason {  // see TableFileCreationBriefInfo::reason
+ kFlush,
+ kCompaction,
+ kRecovery,
+ kMisc,
+};
+
+struct TableFileCreationBriefInfo {
+ // the name of the database where the file was created.
+ std::string db_name;
+ // the name of the column family where the file was created.
+ std::string cf_name;
+ // the path to the created file.
+ std::string file_path;
+ // the id of the job (which could be flush or compaction) that
+ // created the file.
+ int job_id;
+ // the reason the table file was created.
+ TableFileCreationReason reason;
+};
+
+// Full description of a created table file: the brief info inherited from
+// TableFileCreationBriefInfo plus size, table properties and creation status.
+struct TableFileCreationInfo : public TableFileCreationBriefInfo {
+  TableFileCreationInfo() = default;
+  // Takes over `prop` by moving it into table_properties. A named rvalue
+  // reference is an lvalue, so without std::move the properties (including
+  // their strings and maps) would be deep-copied instead.
+  explicit TableFileCreationInfo(TableProperties&& prop)
+      : table_properties(std::move(prop)) {}
+  // the size of the file. Zero until the creator fills it in.
+  uint64_t file_size = 0;
+  // Detailed properties of the created file.
+  TableProperties table_properties;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+};
+
+enum class CompactionReason : int {
+ kUnknown = 0,
+ // [Level] number of L0 files > level0_file_num_compaction_trigger
+ kLevelL0FilesNum,
+ // [Level] total size of level > MaxBytesForLevel()
+ kLevelMaxLevelSize,
+ // [Universal] Compacting for size amplification
+ kUniversalSizeAmplification,
+ // [Universal] Compacting for size ratio
+ kUniversalSizeRatio,
+ // [Universal] number of sorted runs > level0_file_num_compaction_trigger
+ kUniversalSortedRunNum,
+ // [FIFO] total size > max_table_files_size
+ kFIFOMaxSize,
+ // [FIFO] reduce number of files.
+ kFIFOReduceNumFiles,
+ // [FIFO] files with creation time < (current_time - interval)
+ kFIFOTtl,
+ // Manual compaction
+ kManualCompaction,
+ // DB::SuggestCompactRange() marked files for compaction
+ kFilesMarkedForCompaction,
+ // [Level] Automatic compaction within bottommost level to clean up duplicate
+ // versions of same user key, usually due to a released snapshot.
+ kBottommostFiles,
+ // Compaction based on TTL
+ kTtl,
+ // According to the comments in flush_job.cc, RocksDB treats flush as
+ // a level 0 compaction in internal stats.
+ kFlush,
+ // Compaction caused by external sst file ingestion
+ kExternalSstIngestion,
+ // Compaction due to SST file being too old
+ kPeriodicCompaction,
+ // Total number of compaction reasons; new reasons must be added above this.
+ kNumOfReasons,
+};
+
+enum class FlushReason : int {  // reported via FlushJobInfo::flush_reason
+ kOthers = 0x00,
+ kGetLiveFiles = 0x01,
+ kShutDown = 0x02,
+ kExternalFileIngestion = 0x03,
+ kManualCompaction = 0x04,
+ kWriteBufferManager = 0x05,
+ kWriteBufferFull = 0x06,
+ kTest = 0x07,
+ kDeleteFiles = 0x08,
+ kAutoCompaction = 0x09,
+ kManualFlush = 0x0a,
+ kErrorRecovery = 0xb,
+};
+
+enum class BackgroundErrorReason {  // passed to OnBackgroundError() et al.
+ kFlush,
+ kCompaction,
+ kWriteCallback,
+ kMemTable,
+};
+
+enum class WriteStallCondition {  // see WriteStallInfo::condition
+ kNormal,
+ kDelayed,
+ kStopped,
+};
+
+struct WriteStallInfo {  // passed to EventListener::OnStallConditionsChanged()
+ // the name of the column family
+ std::string cf_name;
+ // current (cur) and previous (prev) state of the write controller
+ struct {
+ WriteStallCondition cur;
+ WriteStallCondition prev;
+ } condition;
+};
+
+#ifndef ROCKSDB_LITE
+
+struct TableFileDeletionInfo {  // passed to EventListener::OnTableFileDeleted()
+ // The name of the database where the file was deleted.
+ std::string db_name;
+ // The path to the deleted file.
+ std::string file_path;
+ // The id of the job which deleted the file.
+ int job_id;
+ // The status indicating whether the deletion was successful or not.
+ Status status;
+};
+
+// Describes one file operation; passed to EventListener::OnFileReadFinish()
+// and EventListener::OnFileWriteFinish().
+//
+// NOTE: path, start_timestamp and finish_timestamp are stored as references to
+// the constructor arguments, so an instance must not outlive those objects.
+struct FileOperationInfo {
+  using TimePoint = std::chrono::time_point<std::chrono::system_clock,
+                                            std::chrono::nanoseconds>;
+
+  const std::string& path;
+  // Not set by the constructor; default to zero (instead of being left
+  // indeterminate) until the creator assigns them.
+  uint64_t offset = 0;
+  size_t length = 0;
+  const TimePoint& start_timestamp;
+  const TimePoint& finish_timestamp;
+  Status status;
+  FileOperationInfo(const std::string& _path, const TimePoint& start,
+                    const TimePoint& finish)
+      : path(_path), start_timestamp(start), finish_timestamp(finish) {}
+};
+
+struct FlushJobInfo {
+ // the id of the column family
+ uint32_t cf_id;
+ // the name of the column family
+ std::string cf_name;
+ // the path to the newly created file
+ std::string file_path;
+ // the file number of the newly created file
+ uint64_t file_number;
+ // the number of the oldest blob file referenced by the newly created file
+ uint64_t oldest_blob_file_number;
+ // the id of the thread that completed this flush job.
+ uint64_t thread_id;
+ // the job id, which is unique in the same thread.
+ int job_id;
+ // If true, then rocksdb is currently slowing down all writes to prevent
+ // creating too many Level 0 files, as compaction does not seem able to
+ // keep up with the write request speed. This indicates that there are
+ // too many files in Level 0.
+ bool triggered_writes_slowdown;
+ // If true, then rocksdb is currently blocking any writes to prevent
+ // creating more L0 files. This indicates that there are too many
+ // files in level 0. Compactions should try to compact L0 files down
+ // to lower levels as soon as possible.
+ bool triggered_writes_stop;
+ // The smallest sequence number in the newly created file
+ SequenceNumber smallest_seqno;
+ // The largest sequence number in the newly created file
+ SequenceNumber largest_seqno;
+ // Table properties of the table being flushed
+ TableProperties table_properties;
+
+ FlushReason flush_reason;  // the reason this flush was triggered
+};
+
+struct CompactionFileInfo {  // element of CompactionJobInfo::*_file_infos
+ // The level of the file.
+ int level;
+
+ // The file number of the file.
+ uint64_t file_number;
+
+ // The file number of the oldest blob file this SST file references.
+ uint64_t oldest_blob_file_number;
+};
+
+struct CompactionJobInfo {
+ // the id of the column family where the compaction happened.
+ uint32_t cf_id;
+ // the name of the column family where the compaction happened.
+ std::string cf_name;
+ // the status indicating whether the compaction was successful or not.
+ Status status;
+ // the id of the thread that completed this compaction job.
+ uint64_t thread_id;
+ // the job id, which is unique in the same thread.
+ int job_id;
+ // the smallest input level of the compaction.
+ int base_input_level;
+ // the output level of the compaction.
+ int output_level;
+
+ // The following variables contain information about compaction inputs
+ // and outputs. A file may appear in both the input and output lists
+ // if it was simply moved to a different level. The order of elements
+ // is the same across input_files and input_file_infos; similarly, it is
+ // the same across output_files and output_file_infos.
+
+ // The names of the compaction input files.
+ std::vector<std::string> input_files;
+
+ // Additional information about the compaction input files.
+ std::vector<CompactionFileInfo> input_file_infos;
+
+ // The names of the compaction output files.
+ std::vector<std::string> output_files;
+
+ // Additional information about the compaction output files.
+ std::vector<CompactionFileInfo> output_file_infos;
+
+ // Table properties for input and output tables.
+ // The map is keyed by values from input_files and output_files.
+ TablePropertiesCollection table_properties;
+
+ // Reason to run the compaction
+ CompactionReason compaction_reason;
+
+ // Compression algorithm used for output files
+ CompressionType compression;
+
+ // Detailed statistics about this compaction. This is a by-value member
+ // (not a pointer), so it is always present.
+ CompactionJobStats stats;
+};
+
+struct MemTableInfo {  // passed to EventListener::OnMemTableSealed()
+ // the name of the column family to which memtable belongs
+ std::string cf_name;
+ // Sequence number of the first element that was inserted
+ // into the memtable.
+ SequenceNumber first_seqno;
+ // Sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ SequenceNumber earliest_seqno;
+ // Total number of entries in memtable
+ uint64_t num_entries;
+ // Total number of deletes in memtable
+ uint64_t num_deletes;
+};
+
+struct ExternalFileIngestionInfo {
+ // the name of the column family
+ std::string cf_name;
+ // Path of the file outside the DB
+ std::string external_file_path;
+ // Path of the file inside the DB
+ std::string internal_file_path;
+ // The global sequence number assigned to keys in this file
+ SequenceNumber global_seqno;
+ // Table properties of the ingested file
+ TableProperties table_properties;
+};
+
+// EventListener class contains a set of callback functions that will
+// be called when a specific RocksDB event happens, such as a flush. It can
+// be used as a building block for developing custom features such as
+// stats-collector or external compaction algorithm.
+//
+// Note that callback functions should not run for an extended period of
+// time before the function returns, otherwise RocksDB may be blocked.
+// For example, it is not suggested to do DB::CompactFiles() (as it may
+// run for a long while) or issue many DB::Put() calls (as Put may be blocked
+// in certain cases) in the same thread in the EventListener callback.
+// However, doing DB::CompactFiles() and DB::Put() in another thread is
+// considered safe.
+//
+// [Threading] All EventListener callbacks will be called using the
+// actual thread that is involved in that specific event. For example, it
+// is the RocksDB background flush thread that does the actual flush to
+// call EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlocks and performance issues when using EventListener callbacks
+// in a complex way.
+class EventListener {
+ public:
+ // A callback function to RocksDB which will be called whenever a
+ // registered RocksDB flushes a file. The default implementation is
+ // no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnFlushCompleted(DB* /*db*/,
+ const FlushJobInfo& /*flush_job_info*/) {}
+
+ // A callback function to RocksDB which will be called before a
+ // RocksDB starts to flush memtables. The default implementation is
+ // no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnFlushBegin(DB* /*db*/,
+ const FlushJobInfo& /*flush_job_info*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a SST file is deleted. Different from OnCompactionCompleted and
+ // OnFlushCompleted, this callback is designed for external logging
+ // services and thus only provides string parameters instead
+ // of a pointer to DB. Applications that build logic based
+ // on file creations and deletions are advised to implement
+ // OnFlushCompleted and OnCompactionCompleted.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make a copy of the
+ // referenced value.
+ virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
+
+ // A callback function to RocksDB which will be called before a
+ // RocksDB starts to compact. The default implementation is
+ // no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a registered RocksDB compacts a file. The default implementation
+ // is a no-op.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ //
+ // @param db a pointer to the rocksdb instance which just compacted
+ // a file.
+ // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+ // after this function is returned, and must be copied if it is needed
+ // outside of this function.
+ virtual void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& /*ci*/) {}
+
+ // A callback function for RocksDB which will be called whenever
+ // a SST file is created. Different from OnCompactionCompleted and
+ // OnFlushCompleted, this callback is designed for external logging
+ // services and thus only provides string parameters instead
+ // of a pointer to DB. Applications that build logic based
+ // on file creations and deletions are advised to implement
+ // OnFlushCompleted and OnCompactionCompleted.
+ //
+ // Historically it will only be called if the file is successfully created.
+ // Now it will also be called on failure case. User can check info.status
+ // to see if it succeeded or not.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make a copy of the
+ // referenced value.
+ virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // a SST file is being created. It will be followed by OnTableFileCreated
+ // after the creation finishes.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make a copy of the
+ // referenced value.
+ virtual void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // a memtable is made immutable.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ //
+ // Note that if applications would like to use the passed reference
+ // outside this function call, they should make a copy of the
+ // referenced value.
+ virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before
+ // a column family handle is deleted.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ // @param handle is a pointer to the column family handle to be deleted
+ // which will become a dangling pointer after the deletion.
+ virtual void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* /*handle*/) {}
+
+ // A callback function for RocksDB which will be called after an external
+ // file is ingested using IngestExternalFile.
+ //
+ // Note that this function will run on the same thread as
+ // IngestExternalFile(); if this function is blocked, IngestExternalFile()
+ // will be blocked from finishing.
+ virtual void OnExternalFileIngested(
+ DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called before setting the
+ // background error status to a non-OK value. The new background error status
+ // is provided in `bg_error` and can be modified by the callback. E.g., a
+ // callback can suppress errors by resetting it to Status::OK(), thus
+ // preventing the database from entering read-only mode. We do not provide any
+ // guarantee when failed flushes/compactions will be rescheduled if the user
+ // suppresses an error.
+ //
+ // Note that this function can run on the same threads as flush, compaction,
+ // and user writes. So, it is extremely important not to perform heavy
+ // computations or blocking calls in this function.
+ virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
+ Status* /* bg_error */) {}
+
+ // A callback function for RocksDB which will be called whenever a change
+ // of superversion triggers a change of the stall conditions.
+ //
+ // Note that this function must be implemented in a way such that
+ // it should not run for an extended period of time before the function
+ // returns. Otherwise, RocksDB may be blocked.
+ virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
+
+ // A callback function for RocksDB which will be called whenever a file read
+ // operation finishes.
+ virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
+
+ // A callback function for RocksDB which will be called whenever a file write
+ // operation finishes.
+ virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
+
+ // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If
+ // false, then they won't be called.
+ virtual bool ShouldBeNotifiedOnFileIO() { return false; }
+
+ // A callback function for RocksDB which will be called just before
+ // starting the automatic recovery process for recoverable background
+ // errors, such as NoSpace(). The callback can suppress the automatic
+ // recovery by setting *auto_recovery to false. The database will then
+ // have to be transitioned out of read-only mode by calling DB::Resume().
+ virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+ Status /* bg_error */,
+ bool* /* auto_recovery */) {}
+
+ // A callback function for RocksDB which will be called once the database
+ // is recovered from read-only mode after an error. When this is called, it
+ // means normal writes to the database can be issued and the user can
+ // initiate any further recovery actions needed.
+ virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
+
+ virtual ~EventListener() {}
+};
+
+#else
+
+class EventListener {};  // empty stub used when ROCKSDB_LITE is defined
+struct FlushJobInfo {};  // empty stub used when ROCKSDB_LITE is defined
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h
new file mode 100644
index 000000000..60256a977
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memory_allocator.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+#include <memory>
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator is an interface that a client can implement to supply custom
+// memory allocation and deallocation methods. See rocksdb/cache.h for more
+// information.
+// All methods should be thread-safe.
+class MemoryAllocator {
+ public:
+ virtual ~MemoryAllocator() = default;
+
+ // Name of the cache allocator, printed in the log.
+ virtual const char* Name() const = 0;
+
+ // Allocate a block of at least `size`. Has to be thread-safe.
+ virtual void* Allocate(size_t size) = 0;
+
+ // Deallocate a previously allocated block. Has to be thread-safe.
+ virtual void Deallocate(void* p) = 0;
+
+ // Returns the memory size of the block allocated at p. Overriding is
+ // optional: the default below just returns the original allocation_size.
+ virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const {
+ // p is ignored here; the requested size is reported as the usable size.
+ return allocation_size;
+ }
+};
+
+struct JemallocAllocatorOptions {
+ // Jemalloc tcache caches allocations by size class. For each size class,
+ // it caches between 20 (for large size classes) to 200 (for small size
+ // classes). To reduce tcache memory usage in case the allocator is accessed
+ // by a large number of threads, we can control whether to cache an
+ // allocation by its size.
+ bool limit_tcache_size = false;
+
+ // Lower bound of allocation size to use tcache, if limit_tcache_size=true.
+ // When used with block cache, it is recommended to set it to block_size/4.
+ size_t tcache_size_lower_bound = 1024;
+
+ // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
+ // When used with block cache, it is recommended to set it to block_size.
+ size_t tcache_size_upper_bound = 16 * 1024;
+};
+
+// Generate memory allocators which allocate through Jemalloc and utilize
+// MADV_DONTDUMP through madvise to exclude cache items from core dump.
+// Applications can use the allocator with block cache to exclude block cache
+// usage from core dump.
+//
+// Implementation details:
+// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
+// allocations of the JemallocNodumpAllocator are through the same arena.
+// The memory allocator hooks memory allocation of the arena, and calls
+// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from
+// core dump. A side benefit of using a single arena is reduced jemalloc
+// metadata for some workloads.
+//
+// To mitigate mutex contention for using one single arena, jemalloc tcache
+// (thread-local cache) is enabled to cache unused allocations for future use.
+// The tcache normally incurs 0.5M extra memory usage per-thread. The usage
+// can be reduced by limiting allocation sizes to cache.
+extern Status NewJemallocNodumpAllocator(
+ JemallocAllocatorOptions& options,
+ std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
new file mode 100644
index 000000000..49723264a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -0,0 +1,385 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+// (1) It does not store duplicate items.
+// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+// equality.
+// (3) It can be accessed concurrently by multiple readers and can support a
+// write during reads. However, it needn't support multiple concurrent writes.
+// (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an MemTableAllocator object when a new MemTableRep
+// is requested.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+// - SkipListRep: This is the default; it is backed by a skip list.
+// - HashSkipListRep: The memtable rep that is best used for keys that are
+// structured like "prefix:suffix" where iteration within a prefix is
+// common and iteration across different prefixes is rare. It is backed by
+// a hash map where each bucket is a skip list.
+// - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <rocksdb/slice.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <memory>
+#include <stdexcept>
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Allocator;
+class LookupKey;
+class SliceTransform;
+class Logger;
+
+typedef void* KeyHandle;  // opaque handle returned by MemTableRep::Allocate()
+
+extern Slice GetLengthPrefixedSlice(const char* data);  // used by decode_key()
+
+class MemTableRep {
+ public:
+ // KeyComparator provides a means to compare keys, which are internal keys
+ // concatenated with values.
+ class KeyComparator {
+ public:
+ typedef ROCKSDB_NAMESPACE::Slice DecodedType;
+
+ virtual DecodedType decode_key(const char* key) const {
+ // The format of key is frozen and can be treated as a part of the API
+ // contract. Refer to MemTable::Add for details.
+ return GetLengthPrefixedSlice(key);
+ }
+
+ // Compare a and b. Return a negative value if a is less than b, 0 if they
+ // are equal, and a positive value if a is greater than b
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const = 0;
+
+ virtual int operator()(const char* prefix_len_key,
+ const Slice& key) const = 0;
+
+ virtual ~KeyComparator() {}
+ };
+
+ explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
+
+ // Allocate a buf of len size for storing key. The idea is that a
+ // specific memtable representation knows its underlying data structure
+ // better. By allowing it to allocate memory, it can possibly put
+ // correlated stuff in consecutive memory area to make processor
+ // prefetching more efficient.
+ virtual KeyHandle Allocate(const size_t len, char** buf);
+
+ // Insert key into the collection. (The caller will pack key and value into a
+ // single buffer and pass that in as the parameter to Insert).
+ // REQUIRES: nothing that compares equal to key is currently in the
+ // collection, and no concurrent modifications to the table in progress
+ virtual void Insert(KeyHandle handle) = 0;
+
+ // Same as ::Insert
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKey(KeyHandle handle) {
+ Insert(handle);
+ return true;
+ }
+
+ // Same as Insert(), but additionally pass a hint to insert location for
+ // the key. If hint points to nullptr, a new hint will be populated.
+ // otherwise the hint will be updated to reflect the last insert location.
+ //
+ // Currently only skip-list based memtable implement the interface. Other
+ // implementations will fallback to Insert() by default.
+ virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
+ // Ignore the hint by default.
+ Insert(handle);
+ }
+
+ // Same as ::InsertWithHint
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
+ InsertWithHint(handle, hint);
+ return true;
+ }
+
+ // Same as ::InsertWithHint, but allow concurrent write
+ //
+ // If hint points to nullptr, a new hint will be allocated on heap, otherwise
+ // the hint will be updated to reflect the last insert location. The hint is
+ // owned by the caller and it is the caller's responsibility to delete the
+ // hint later.
+ //
+ // Currently only skip-list based memtable implement the interface. Other
+ // implementations will fallback to InsertConcurrently() by default.
+ virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
+ // Ignore the hint by default.
+ InsertConcurrently(handle);
+ }
+
+ // Same as ::InsertWithHintConcurrently
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
+ InsertWithHintConcurrently(handle, hint);
+ return true;
+ }
+
+ // Like Insert(handle), but may be called concurrent with other calls
+ // to InsertConcurrently for other handles.
+ //
+ // Returns nothing, so unlike InsertKeyConcurrently() it cannot report that
+ // the <key, seq> already exists.
+ virtual void InsertConcurrently(KeyHandle handle);
+
+ // Same as ::InsertConcurrently
+ // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+ // the <key, seq> already exists.
+ virtual bool InsertKeyConcurrently(KeyHandle handle) {
+ InsertConcurrently(handle);
+ return true;
+ }
+
+ // Returns true iff an entry that compares equal to key is in the collection.
+ virtual bool Contains(const char* key) const = 0;
+
+ // Notify this table rep that it will no longer be added to. By default,
+ // does nothing. After MarkReadOnly() is called, this table rep will
+ // not be written to (ie No more calls to Allocate(), Insert(),
+ // or any writes done directly to entries accessed through the iterator.)
+ virtual void MarkReadOnly() {}
+
+ // Notify this table rep that it has been flushed to stable storage.
+ // By default, does nothing.
+ //
+ // Invariant: MarkReadOnly() is called, before MarkFlushed().
+ // Note that this method if overridden, should not run for an extended period
+ // of time. Otherwise, RocksDB may be blocked.
+ virtual void MarkFlushed() {}
+
+ // Look up key from the mem table, since the first key in the mem table whose
+ // user_key matches the one given k, call the function callback_func(), with
+ // callback_args directly forwarded as the first parameter, and the mem table
+ // key as the second parameter. If the return value is false, then terminates.
+ // Otherwise, go through the next key.
+ //
+ // It's safe for Get() to terminate after having finished all the potential
+ // key for the k.user_key(), or not.
+ //
+ // Default:
+ // Get() function with a default value of dynamically construct an iterator,
+ // seek and call the call back function.
+ virtual void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry));
+
+ virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
+ const Slice& /*end_key*/) {
+ return 0;
+ }
+
+ // Report an approximation of how much memory has been used other than memory
+ // that was allocated through the allocator. Safe to call from any thread.
+ virtual size_t ApproximateMemoryUsage() = 0;
+
+ virtual ~MemTableRep() {}
+
+ // Iteration over the contents of a collection
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified collection.
+ // The returned iterator is not valid.
+ // explicit Iterator(const MemTableRep* collection);
+ virtual ~Iterator() {}
+
+ // Returns true iff the iterator is positioned at a valid node.
+ virtual bool Valid() const = 0;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ virtual const char* key() const = 0;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Advances to the previous position.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Advance to the first entry with a key >= target
+ virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+ // retreat to the first entry with a key <= target
+ virtual void SeekForPrev(const Slice& internal_key,
+ const char* memtable_key) = 0;
+
+ // Position at the first entry in collection.
+ // Final state of iterator is Valid() iff collection is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last entry in collection.
+ // Final state of iterator is Valid() iff collection is not empty.
+ virtual void SeekToLast() = 0;
+ };
+
+ // Return an iterator over the keys in this representation.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // When destroying the iterator, the caller will not call "delete"
+ // but Iterator::~Iterator() directly. The destructor needs to destroy
+ // all state except that which was allocated in arena.
+ virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+ // Return an iterator that has special Seek semantics. The result of
+ // a Seek might only include keys with the same prefix as the target key.
+ // arena: If not null, the arena is used to allocate the Iterator.
+ // When destroying the iterator, the caller will not call "delete"
+ // but Iterator::~Iterator() directly. The destructor needs to destroy
+ // all state except that which was allocated in arena.
+ virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+ return GetIterator(arena);  // default: same as the regular iterator
+ }
+
+ // Return true if the current MemTableRep supports the merge operator.
+ // Default: true
+ virtual bool IsMergeOperatorSupported() const { return true; }
+
+ // Return true if the current MemTableRep supports snapshots.
+ // Default: true
+ virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+ // When *key is an internal key concatenated with the value, returns the
+ // user key portion of it.
+ virtual Slice UserKey(const char* key) const;
+
+ Allocator* allocator_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects.
+class MemTableRepFactory {
+ public:
+ virtual ~MemTableRepFactory() {}
+
+ virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+ Allocator*, const SliceTransform*,
+ Logger* logger) = 0;
+ virtual MemTableRep* CreateMemTableRep(
+ const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
+ const SliceTransform* slice_transform, Logger* logger,
+ uint32_t /* column_family_id */) {
+ return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
+ }  // default: the column family id is ignored
+
+ virtual const char* Name() const = 0;
+
+ // Return true if the current MemTableRep supports concurrent inserts
+ // Default: false
+ virtual bool IsInsertConcurrentlySupported() const { return false; }
+
+ // Return true if the current MemTableRep supports detecting duplicate
+ // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
+ // false if the <key,seq> already exists.
+ // Default: false
+ virtual bool CanHandleDuplicatedKey() const { return false; }
+};
+
+// The default factory: its MemTableReps store keys in a skip list.
+//
+// Parameters:
+// lookahead: If non-zero, each iterator's seek operation will start the
+// search from the previously visited record (doing at most 'lookahead'
+// steps). This is an optimization for the access pattern including many
+// seeks with consecutive keys.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+ explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {}
+
+ // Keep the base-class overloads of CreateMemTableRep visible.
+ using MemTableRepFactory::CreateMemTableRep;
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*,
+ const SliceTransform*,
+ Logger* logger) override;
+ const char* Name() const override { return "SkipListFactory"; }
+
+ // Concurrent inserts are supported by this factory's MemTableReps.
+ bool IsInsertConcurrentlySupported() const override { return true; }
+
+ // Duplicate <key,seq> detection at insertion time is supported.
+ bool CanHandleDuplicatedKey() const override { return true; }
+
+ private:
+ const size_t lookahead_;
+};
+
+#ifndef ROCKSDB_LITE
+// Factory for MemTableReps that are backed by an std::vector. The vector is
+// sorted on iteration, which suits workloads where iteration is very rare
+// and writes are generally not issued after reads begin.
+//
+// Parameters:
+// count: Passed to the constructor of the underlying std::vector of each
+// VectorRep. On initialization, the underlying array will be at least count
+// bytes reserved for usage.
+class VectorRepFactory : public MemTableRepFactory {
+ public:
+ explicit VectorRepFactory(size_t count = 0) : count_(count) {}
+
+ // Keep the base-class overloads of CreateMemTableRep visible.
+ using MemTableRepFactory::CreateMemTableRep;
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*,
+ const SliceTransform*,
+ Logger* logger) override;
+
+ const char* Name() const override { return "VectorRepFactory"; }
+
+ private:
+ const size_t count_;
+};
+
+// This class contains a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+// linked lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+ size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+ int32_t skiplist_branching_factor = 4);
+
+// This factory creates memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or a skip list if the number of entries inside the bucket exceeds
+// threshold_use_skiplist.
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+// Otherwise from huge page TLB. The user needs to reserve
+// huge pages for it to be allocated, like:
+// sysctl -w vm.nr_hugepages=20
+// See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if the number of entries in one bucket
+// exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log distribution of number of
+// entries when flushing.
+// @threshold_use_skiplist: a bucket switches to skip list if the number of
+// entries exceeds this parameter.
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+ size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+ int bucket_entries_logging_threshold = 4096,
+ bool if_log_bucket_dist_when_flash = true,
+ uint32_t threshold_use_skiplist = 256);
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h
new file mode 100644
index 000000000..86f3d7260
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/merge_operator.h
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+// a) AssociativeMergeOperator - for most simple semantics (always take
+// two values, and merge them into one value, which is then put back
+// into rocksdb); numeric addition and string concatenation are examples;
+//
+// b) MergeOperator - the generic class for all the more abstract / complex
+// operations; one method (FullMergeV2) to merge a Put/Delete value with a
+// merge operand; and another method (PartialMerge) that merges multiple
+// operands together. This is especially useful if your key values have
+// complex structures but you would still like to support client-specific
+// incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+ virtual ~MergeOperator() {}
+ static const char* Type() { return "MergeOperator"; }
+
+ // Gives the client a way to express the read -> modify -> write semantics
+ // key: (IN) The key that's associated with this merge operation.
+ // Client could multiplex the merge operator based on it
+ // if the key space is partitioned and different subspaces
+ // refer to different types of data which have different
+ // merge operation semantics
+ // existing_value: (IN) null indicates the key does not exist before this op
+ // operand_list:(IN) the sequence of merge operations to apply, front() first.
+ // new_value:(OUT) Client is responsible for filling the merge result here.
+ // The string that new_value is pointing to will be empty.
+ // logger: (IN) Client could use this to log errors during merge.
+ //
+ // Return true on success.
+ // All values passed in will be client-specific values. So if this method
+ // returns false, it is because client specified bad data or there was
+ // internal corruption. This will be treated as an error by the library.
+ //
+ // Also make use of the *logger for error messages.
+ virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+ const std::deque<std::string>& /*operand_list*/,
+ std::string* /*new_value*/, Logger* /*logger*/) const {
+ // deprecated, please use FullMergeV2()
+ assert(false);
+ return false;
+ }
+
+ struct MergeOperationInput {
+ explicit MergeOperationInput(const Slice& _key,
+ const Slice* _existing_value,
+ const std::vector<Slice>& _operand_list,
+ Logger* _logger)
+ : key(_key),
+ existing_value(_existing_value),
+ operand_list(_operand_list),
+ logger(_logger) {}
+
+ // The key associated with the merge operation.
+ const Slice& key;
+ // The existing value of the current key, nullptr means that the
+ // value doesn't exist.
+ const Slice* existing_value;
+ // A list of operands to apply.
+ const std::vector<Slice>& operand_list;
+ // Logger could be used by client to log any errors that happen during
+ // the merge operation.
+ Logger* logger;
+ };
+
+ struct MergeOperationOutput {
+ explicit MergeOperationOutput(std::string& _new_value,
+ Slice& _existing_operand)
+ : new_value(_new_value), existing_operand(_existing_operand) {}
+
+ // Client is responsible for filling the merge result here.
+ std::string& new_value;
+ // If the merge result is one of the existing operands (or existing_value),
+ // client can set this field to the operand (or existing_value) instead of
+ // using new_value.
+ Slice& existing_operand;
+ };
+
+ // This function applies a stack of merge operands in chronological order
+ // on top of an existing value. There are two ways in which this method is
+ // being used:
+ // a) During Get() operation, it is used to calculate the final value of a key
+ // b) During compaction, in order to collapse some operands with the base
+ // value.
+ //
+ // Note: The name of the method is somewhat misleading, as both in the cases
+ // of Get() or compaction it may be called on a subset of operands:
+ // K: 0 +1 +2 +7 +4 +5 2 +1 +2
+ // ^
+ // |
+ // snapshot
+ // In the example above, Get(K) operation will call FullMerge with a base
+ // value of 2 and operands [+1, +2]. Compaction process might decide to
+ // collapse the beginning of the history up to the snapshot by performing
+ // full Merge with base value of 0 and operands [+1, +2, +7, +3].
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const;
+
+ // This function performs merge(left_op, right_op)
+ // when both the operands are themselves merge operation types
+ // that you would have passed to a DB::Merge() call in the same order
+ // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+ //
+ // PartialMerge should combine them into a single merge operation that is
+ // saved into *new_value, and then it should return true.
+ // *new_value should be constructed such that a call to
+ // DB::Merge(key, *new_value) would yield the same result as a call
+ // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+ //
+ // The string that new_value is pointing to will be empty.
+ //
+ // The default implementation of PartialMergeMulti will use this function
+ // as a helper, for backward compatibility. Any successor class of
+ // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+ // although implementing PartialMergeMulti is suggested as it is in general
+ // more effective to merge multiple operands at a time instead of two
+ // operands at a time.
+ //
+ // If it is impossible or infeasible to combine the two operations,
+ // leave new_value unchanged and return false. The library will
+ // internally keep track of the operations, and apply them in the
+ // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+ //
+ // TODO: Presently there is no way to differentiate between error/corruption
+ // and simply "return false". For now, the client should simply return
+ // false in any case it cannot perform partial-merge, regardless of reason.
+ // If there is corruption in the data, handle it in the FullMergeV2() function
+ // and return false there. The default implementation of PartialMerge will
+ // always return false.
+ virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+ const Slice& /*right_operand*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const {
+ return false;
+ }
+
+ // This function performs merge when all the operands are themselves merge
+ // operation types that you would have passed to a DB::Merge() call in the
+ // same order (front() first)
+ // (i.e. DB::Merge(key, operand_list[0]), followed by
+ // DB::Merge(key, operand_list[1]), ...)
+ //
+ // PartialMergeMulti should combine them into a single merge operation that is
+ // saved into *new_value, and then it should return true. *new_value should
+ // be constructed such that a call to DB::Merge(key, *new_value) would yield
+ // the same result as subsequent individual calls to DB::Merge(key, operand)
+ // for each operand in operand_list from front() to back().
+ //
+ // The string that new_value is pointing to will be empty.
+ //
+ // The PartialMergeMulti function will be called when there are at least two
+ // operands.
+ //
+ // In the default implementation, PartialMergeMulti will invoke PartialMerge
+ // multiple times, where each time it only merges two operands. Developers
+ // should either implement PartialMergeMulti, or implement PartialMerge which
+ // serves as the helper function of the default PartialMergeMulti.
+ virtual bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const;
+
+ // The name of the MergeOperator. Used to check for MergeOperator
+ // mismatches (i.e., a DB created with one MergeOperator is
+ // accessed using a different MergeOperator)
+ // TODO: the name is currently not stored persistently and thus
+ // no checking is enforced. Client is responsible for providing
+ // consistent MergeOperator between DB opens.
+ virtual const char* Name() const = 0;
+
+ // Determines whether the PartialMerge can be called with just a single
+ // merge operand.
+ // Override and return true for allowing a single operand. PartialMerge
+ // and PartialMergeMulti should be overridden and implemented
+ // correctly to properly handle a single operand.
+ virtual bool AllowSingleOperand() const { return false; }
+
+ // Allows controlling when to invoke a full merge during Get.
+ // This could be used to limit the number of merge operands that are looked at
+ // during a point lookup, thereby helping in limiting the number of levels to
+ // read from.
+ // Doesn't help with iterators.
+ //
+ // Note: the merge operands are passed to this function in the reversed order
+ // relative to how they were merged (passed to FullMerge or FullMergeV2)
+ // for performance reasons, see also:
+ // https://github.com/facebook/rocksdb/issues/3865
+ virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const {
+ return false;
+ }
+};
+
+// The simpler, associative variant of the merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+ ~AssociativeMergeOperator() override = default;
+
+ // Lets the client express the read -> modify -> write semantics.
+ // key: (IN) The key that's associated with this merge operation.
+ // existing_value:(IN) null indicates the key does not exist before this op
+ // value: (IN) the value to update/merge the existing_value with
+ // new_value: (OUT) Client is responsible for filling the merge result
+ // here. The string that new_value is pointing to will be empty.
+ // logger: (IN) Client could use this to log errors during merge.
+ //
+ // Returns true on success.
+ // Every value passed in is client-specific, so a false return means the
+ // client specified bad data or there was internal corruption. The client
+ // should assume that the library treats a false return as an error.
+ virtual bool Merge(const Slice& key, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* logger) const = 0;
+
+ private:
+ // MergeOperator functions for which this class supplies the default
+ // implementations.
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ bool PartialMerge(const Slice& key, const Slice& left_operand,
+ const Slice& right_operand, std::string* new_value,
+ Logger* logger) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h
new file mode 100644
index 000000000..f1a9ee60e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/metadata.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyMetaData;
+struct LevelMetaData;
+struct SstFileMetaData;
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData {
+ ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
+ // Builds the metadata from a name, a total size and per-level metadata.
+ // _levels is copied into levels. file_count is not derived from _levels
+ // here; it starts at 0, as in the default constructor, so that it is never
+ // read uninitialized.
+ ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
+ const std::vector<LevelMetaData>&& _levels)
+ : size(_size), file_count(0), name(_name), levels(_levels) {}
+
+ // The size of this column family in bytes, which is equal to the sum of
+ // the file size of its "levels".
+ uint64_t size;
+ // The number of files in this column family.
+ size_t file_count;
+ // The name of the column family.
+ std::string name;
+ // The metadata of all levels in this column family.
+ std::vector<LevelMetaData> levels;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+ LevelMetaData(int _level, uint64_t _size,
+ const std::vector<SstFileMetaData>&& _files)
+ : level(_level), size(_size), files(_files) {}  // _files is copied
+
+ // The level which this metadata describes.
+ const int level;
+ // The size of this level in bytes, which is equal to the sum of
+ // the file size of its "files".
+ const uint64_t size;
+ // The metadata of all sst files in this level.
+ const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a SST file.
+struct SstFileMetaData {
+ // Creates empty metadata: every numeric field is 0, being_compacted is
+ // false and every string is empty. The two timestamps are set to 0 as
+ // well, which is their documented "information not available" value.
+ SstFileMetaData()
+ : size(0),
+ file_number(0),
+ smallest_seqno(0),
+ largest_seqno(0),
+ num_reads_sampled(0),
+ being_compacted(false),
+ num_entries(0),
+ num_deletions(0),
+ oldest_blob_file_number(0),
+ oldest_ancester_time(0),
+ file_creation_time(0) {}
+
+ // Creates metadata from explicit values. num_entries and num_deletions are
+ // not parameters of this constructor; both start at 0.
+ SstFileMetaData(const std::string& _file_name, uint64_t _file_number,
+ const std::string& _path, size_t _size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno,
+ const std::string& _smallestkey,
+ const std::string& _largestkey, uint64_t _num_reads_sampled,
+ bool _being_compacted, uint64_t _oldest_blob_file_number,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ std::string& _file_checksum,
+ std::string& _file_checksum_func_name)
+ : size(_size),
+ name(_file_name),
+ file_number(_file_number),
+ db_path(_path),
+ smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno),
+ smallestkey(_smallestkey),
+ largestkey(_largestkey),
+ num_reads_sampled(_num_reads_sampled),
+ being_compacted(_being_compacted),
+ num_entries(0),
+ num_deletions(0),
+ oldest_blob_file_number(_oldest_blob_file_number),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name) {}
+
+ // File size in bytes.
+ size_t size;
+ // The name of the file.
+ std::string name;
+ // The id of the file.
+ uint64_t file_number;
+ // The full path where the file is located.
+ std::string db_path;
+
+ SequenceNumber smallest_seqno; // Smallest sequence number in file.
+ SequenceNumber largest_seqno; // Largest sequence number in file.
+ std::string smallestkey; // Smallest user defined key in the file.
+ std::string largestkey; // Largest user defined key in the file.
+ uint64_t num_reads_sampled; // How many times the file is read.
+ bool being_compacted; // true if the file is currently being compacted.
+
+ uint64_t num_entries;
+ uint64_t num_deletions;
+
+ uint64_t oldest_blob_file_number; // The id of the oldest blob file
+ // referenced by the file.
+ // An SST file may be generated by compactions whose input files may
+ // in turn be generated by earlier compactions. The creation time of the
+ // oldest SST file that is the compaction ancester of this file.
+ // The timestamp is provided by Env::GetCurrentTime().
+ // 0 if the information is not available.
+ uint64_t oldest_ancester_time;
+ // Timestamp when the SST file is created, provided by Env::GetCurrentTime().
+ // 0 if the information is not available.
+ uint64_t file_creation_time;
+
+ // The checksum of a SST file, the value is decided by the file content and
+ // the checksum algorithm used for this SST file. The checksum function is
+ // identified by the file_checksum_func_name. If the checksum function is
+ // not specified, file_checksum is "0" by default.
+ std::string file_checksum;
+
+ // The name of the checksum function used to generate the file checksum
+ // value. If file checksum is not enabled (e.g., sst_file_checksum_func is
+ // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is
+ // "Unknown".
+ std::string file_checksum_func_name;
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+ // column_family_name is default-constructed (empty); level starts at 0.
+ LiveFileMetaData() : level(0) {}
+
+ std::string column_family_name; // Name of the column family
+ int level; // Level at which this file resides.
+};
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamiliesWithImport().
+struct ExportImportFilesMetaData {
+ std::string db_comparator_name; // Used for a safety check at import.
+ std::vector<LiveFileMetaData> files; // Vector of file metadata.
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h
new file mode 100644
index 000000000..f5d44fb74
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/options.h
@@ -0,0 +1,1587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#ifdef max
+#undef max
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class ConcurrentTaskLimiter;
+class Env;
+enum InfoLogLevel : unsigned char;
+class SstFileManager;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class MemTableRepFactory;
+class RateLimiter;
+class Slice;
+class Statistics;
+class InternalKeyComparator;
+class WalFilter;
+class FileSystem;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs. Each block may be compressed before
+// being stored in a file. The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : unsigned char {
+ // NOTE: do not change the values of existing entries, as these are
+ // part of the persistent format on disk.
+ kNoCompression = 0x0,
+ kSnappyCompression = 0x1,
+ kZlibCompression = 0x2,
+ kBZip2Compression = 0x3,
+ kLZ4Compression = 0x4,
+ kLZ4HCCompression = 0x5,
+ kXpressCompression = 0x6,
+ kZSTD = 0x7,
+
+ // Only use kZSTDNotFinalCompression if you have to use a ZSTD lib older than
+ // 0.8.0 or consider a possibility of downgrading the service or copying
+ // the database files to another service running with an older version of
+ // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will
+ // eventually remove the option from the public API.
+ kZSTDNotFinalCompression = 0x40,
+
+ // kDisableCompressionOption marks an optional compression setting as disabled.
+ kDisableCompressionOption = 0xff,
+};
+
+struct Options;
+struct DbPath;
+
+struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
+ // The function recovers options to a previous version. Only 4.6 or later
+ // versions are supported.
+ ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ // Some functions that make it easier to optimize RocksDB
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ // An optional cache object is passed in to be used as the block cache
+ ColumnFamilyOptions* OptimizeForSmallDb(
+ std::shared_ptr<Cache>* cache = nullptr);
+
+ // Use this if you don't need to keep the data sorted, i.e. you'll never use
+ // an iterator, only Put() and Get() API calls
+ //
+ // Not supported in ROCKSDB_LITE
+ ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
+
+ // Default values for some parameters in ColumnFamilyOptions are not
+ // optimized for heavy workloads and big datasets, which means you might
+ // observe write stalls under some conditions. As a starting point for tuning
+ // RocksDB options, use the following two functions:
+ // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+ // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+ // Universal style compaction is focused on reducing Write Amplification
+ // Factor for big data sets, but increases Space Amplification. You can learn
+ // more about the different styles here:
+ // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+ // Make sure to also call IncreaseParallelism(), which will provide the
+ // biggest performance gains.
+ // Note: we might use more memory than memtable_memory_budget during high
+ // write rate period
+ //
+ // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
+ ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+ uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+ ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+ uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator = BytewiseComparator();
+
+ // REQUIRES: The client must provide a merge operator if Merge operation
+ // needs to be accessed. Calling Merge on a DB without a merge operator
+ // would result in Status::NotSupported. The client must ensure that the
+ // merge operator supplied here has the same name and *exactly* the same
+ // semantics as the merge operator provided to previous open calls on
+ // the same DB. The only exception is reserved for upgrade, where a DB
+ // previously without a merge operator is introduced to Merge operation
+ // for the first time. It's necessary to specify a merge operator when
+ // opening the DB in this case.
+ // Default: nullptr
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+
+ // A single CompactionFilter instance to call into during compaction.
+ // Allows an application to modify/delete a key-value during background
+ // compaction.
+ //
+ // If the client requires a new compaction filter to be used for different
+ // compaction runs, it can specify compaction_filter_factory instead of this
+ // option. The client should specify only one of the two.
+ // compaction_filter takes precedence over compaction_filter_factory if
+ // client specifies both.
+ //
+ // If multithreaded compaction is being used, the supplied CompactionFilter
+ // instance may be used from different threads concurrently and so should be
+ // thread-safe.
+ //
+ // Default: nullptr
+ const CompactionFilter* compaction_filter = nullptr;
+
+ // This is a factory that provides compaction filter objects which allow
+ // an application to modify/delete a key-value during background compaction.
+ //
+ // A new filter will be created on each compaction run. If multithreaded
+ // compaction is being used, each created CompactionFilter will only be used
+ // from a single thread and so does not need to be thread-safe.
+ //
+ // Default: nullptr
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
+ //
+ // Larger values increase performance, especially during bulk loads.
+ // Up to max_write_buffer_number write buffers may be held in memory
+ // at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
+ // Also, a larger write buffer will result in a longer recovery time
+ // the next time the database is opened.
+ //
+ // Note that write_buffer_size is enforced per column family.
+ // See db_write_buffer_size for sharing memory across column families.
+ //
+ // Default: 64MB
+ //
+ // Dynamically changeable through SetOptions() API
+ size_t write_buffer_size = 64 << 20;
+
+ // Compress blocks using the specified compression algorithm.
+ //
+ // Default: kSnappyCompression, if it's supported. If snappy is not linked
+ // with the library, the default is kNoCompression.
+ //
+ // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ //
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kSnappyCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ //
+ // If you do not set `compression_opts.level`, or set it to
+ // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
+ // default corresponding to `compression` as follows:
+ //
+ // - kZSTD: 3
+ // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+ // - kLZ4HCCompression: 0
+ // - For all others, we do not specify a compression level
+ //
+ // Dynamically changeable through SetOptions() API
+ CompressionType compression;
+
+ // Compression algorithm that will be used for the bottommost level that
+ // contains files.
+ //
+ // Default: kDisableCompressionOption (Disabled)
+ CompressionType bottommost_compression = kDisableCompressionOption;
+
+ // different options for compression algorithms used by bottommost_compression
+ // if it is enabled. To enable it, please see the definition of
+ // CompressionOptions.
+ CompressionOptions bottommost_compression_opts;
+
+ // different options for compression algorithms
+ CompressionOptions compression_opts;
+
+ // Number of files to trigger level-0 compaction. A value <0 means that
+ // level-0 compaction will not be triggered by number of files at all.
+ //
+ // Default: 4
+ //
+ // Dynamically changeable through SetOptions() API
+ int level0_file_num_compaction_trigger = 4;
+
+ // If non-nullptr, use the specified function to determine the
+ // prefixes for keys. These prefixes will be placed in the filter.
+ // Depending on the workload, this can reduce the number of read-IOP
+ // cost for scans when a prefix is passed via ReadOptions to
+ // db.NewIterator(). For prefix filtering to work properly,
+ // "prefix_extractor" and "comparator" must be such that the following
+ // properties hold:
+ //
+ // 1) key.starts_with(prefix(key))
+ // 2) Compare(prefix(key), key) <= 0.
+ // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+ // 4) prefix(prefix(key)) == prefix(key)
+ //
+ // Default: nullptr
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+
+ // Control maximum total data size for a level.
+ // max_bytes_for_level_base is the max total for level-1.
+ // Maximum number of bytes for level L can be calculated as
+ // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+ // For example, if max_bytes_for_level_base is 200MB, and if
+ // max_bytes_for_level_multiplier is 10, total data size for level-1
+ // will be 200MB, total file size for level-2 will be 2GB,
+ // and total file size for level-3 will be 20GB.
+ //
+ // Default: 256MB.
+ //
+ // Dynamically changeable through SetOptions() API
+ uint64_t max_bytes_for_level_base = 256 * 1048576;
+
+ // Deprecated.
+ uint64_t snap_refresh_nanos = 0;
+
+ // Disable automatic compactions. Manual compactions can still
+ // be issued on this column family
+ //
+ // Dynamically changeable through SetOptions() API
+ bool disable_auto_compactions = false;
+
+ // This is a factory that provides TableFactory objects.
+ // Default: a block-based table factory that provides a default
+ // implementation of TableBuilder and TableReader with default
+ // BlockBasedTableOptions.
+ std::shared_ptr<TableFactory> table_factory;
+
+ // A list of paths where SST files for this column family
+ // can be put into, with its target size. Similar to db_paths,
+ // newer data is placed into paths specified earlier in the
+ // vector while older data gradually moves to paths specified
+ // later in the vector.
+ // Note that, if a path is supplied to multiple column
+ // families, it would have files and total size from all
+ // the column families combined. User should provision for the
+ // total size(from all the column families) in such cases.
+ //
+ // If left empty, db_paths will be used.
+ // Default: empty
+ std::vector<DbPath> cf_paths;
+
+ // Compaction concurrent thread limiter for the column family.
+ // If non-nullptr, use given concurrent thread limiter to control
+ // the max outstanding compaction tasks. Limiter can be shared with
+ // multiple column families across db instances.
+ //
+ // Default: nullptr
+ std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
+
+ // Create ColumnFamilyOptions with default values for all fields
+ ColumnFamilyOptions();
+ // Create ColumnFamilyOptions from Options
+ explicit ColumnFamilyOptions(const Options& options);
+
+ void Dump(Logger* log) const;
+};
+
+enum class WALRecoveryMode : char { // Policy for handling WAL corruption during recovery
+ // Original LevelDB recovery.
+ // We tolerate incomplete records in the trailing data of all logs.
+ // Use case: This is the legacy behavior.
+ kTolerateCorruptedTailRecords = 0x00,
+ // Recover from a clean shutdown.
+ // We don't expect to find any corruption in the WAL.
+ // Use case: Ideal for unit tests and the rare applications that
+ // require a high consistency guarantee.
+ kAbsoluteConsistency = 0x01,
+ // Recover to point-in-time consistency (default).
+ // We stop the WAL playback on discovering a WAL inconsistency.
+ // Use case: Ideal for systems that have a disk controller cache, such as
+ // hard disks or SSDs without a super capacitor, and that store related data.
+ kPointInTimeRecovery = 0x02,
+ // Recovery after a disaster.
+ // We ignore any corruption in the WAL and try to salvage as much data as
+ // possible.
+ // Use case: Ideal for a last-ditch effort to recover data, or for systems
+ // that operate with low-grade, unrelated data.
+ kSkipAnyCorruptedRecords = 0x03,
+};
+
+struct DbPath { // A storage path paired with a target size for the files under it.
+ std::string path; // Path under which files are placed.
+ uint64_t target_size; // Target size of total files under the path, in bytes.
+
+ DbPath() : target_size(0) {} // Empty path with a target size of 0.
+ DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
+struct DBOptions {
+ // The function restores options to the values used in version 4.6.
+ DBOptions* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ // Some functions that make it easier to optimize RocksDB
+
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory for memtables.
+ // An optional cache object is passed in for the memory of the
+ // memtable to cost to
+ DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
+
+#ifndef ROCKSDB_LITE
+ // By default, RocksDB uses only one background thread for flush and
+ // compaction. Calling this function will set it up such that total of
+ // `total_threads` is used. Good value for `total_threads` is the number of
+ // cores. You almost definitely want to call this function if your system is
+ // bottlenecked by RocksDB.
+ DBOptions* IncreaseParallelism(int total_threads = 16);
+#endif // ROCKSDB_LITE
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing = false;
+
+ // If true, missing column families will be automatically created.
+ // Default: false
+ bool create_missing_column_families = false;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists = false;
+
+ // If true, RocksDB will aggressively check consistency of the data.
+ // Also, if any of the writes to the database fails (Put, Delete, Merge,
+ // Write), the database will switch to read-only mode and fail all other
+ // Write operations.
+ // In most cases you want this to be set to true.
+ // Default: true
+ bool paranoid_checks = true;
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc. In the near
+ // future, support for doing storage operations such as read/write files
+ // through env will be deprecated in favor of file_system (see below)
+ // Default: Env::Default()
+ Env* env = Env::Default();
+
+ // Use the specified object to interact with the storage to
+ // read/write files. This is in addition to env. This option should be used
+ // if the desired storage subsystem provides a FileSystem implementation.
+ std::shared_ptr<FileSystem> file_system = nullptr;
+
+ // Use to control write rate of flush and compaction. Flush has higher
+ // priority than compaction. Rate limiting is disabled if nullptr.
+ // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> rate_limiter = nullptr;
+
+ // Use to track SST files and control their file deletion rate.
+ //
+ // Features:
+ // - Throttle the deletion rate of the SST files.
+ // - Keep track of the total size of all SST files.
+ // - Set a maximum allowed space limit for SST files that when reached
+ // the DB won't do any further flushes or compactions and will set the
+ // background error.
+ // - Can be shared between multiple dbs.
+ // Limitations:
+ // - Only track and throttle deletes of SST files in
+ // first db_path (db_name if db_paths is empty).
+ //
+ // Default: nullptr
+ std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-nullptr, or to a file stored
+ // in the same directory as the DB contents if info_log is nullptr.
+ // Default: nullptr
+ std::shared_ptr<Logger> info_log = nullptr;
+
+#ifdef NDEBUG
+ InfoLogLevel info_log_level = INFO_LEVEL;
+#else
+ InfoLogLevel info_log_level = DEBUG_LEVEL;
+#endif // NDEBUG
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set. Value -1 means
+ // files opened are always kept open. You can estimate number of files based
+ // on target_file_size_base and target_file_size_multiplier for level-based
+ // compaction. For universal-style compaction, you can usually set it to -1.
+ //
+ // Default: -1
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_open_files = -1;
+
+ // If max_open_files is -1, DB will open all files on DB::Open(). You can
+ // use this option to increase the number of threads used to open the files.
+ // Default: 16
+ int max_file_opening_threads = 16;
+
+ // Once write-ahead logs exceed this size, we will start forcing the flush of
+ // column families whose memtables are backed by the oldest live WAL file
+ // (i.e. the ones that are causing all the space amplification). If set to 0
+ // (default), we will dynamically choose the WAL size limit to be
+ // [sum of all write_buffer_size * max_write_buffer_number] * 4
+ // This option takes effect only when there are more than one column family as
+ // otherwise the wal size is dictated by the write_buffer_size.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t max_total_wal_size = 0;
+
+ // If non-null, then we should collect metrics about database operations
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // By default, writes to stable storage use fdatasync (on platforms
+ // where this function is available). If this option is true,
+ // fsync is used instead.
+ //
+ // fsync and fdatasync are equally safe for our purposes and fdatasync is
+ // faster, so it is rarely necessary to set this option. It is provided
+ // as a workaround for kernel/filesystem bugs, such as one that affected
+ // fdatasync with ext4 in kernel versions prior to 3.7.
+ bool use_fsync = false;
+
+ // A list of paths where SST files can be put into, with its target size.
+ // Newer data is placed into paths specified earlier in the vector while
+ // older data gradually moves to paths specified later in the vector.
+ //
+ // For example, you have a flash device with 10GB allocated for the DB,
+ // as well as a hard drive of 2TB, you should config it to be:
+ // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+ //
+ // The system will try to guarantee data under each path is close to but
+ // not larger than the target size. But current and future file sizes used
+ // by determining where to place a file are based on best-effort estimation,
+ // which means there is a chance that the actual size under the directory
+ // is slightly more than target size under some workloads. User should give
+ // some buffer room for those cases.
+ //
+ // If none of the paths has sufficient room to place a file, the file will
+ // be placed in the last path anyway, regardless of the target size.
+ //
+ // Placing newer data to earlier paths is also best-efforts. User should
+ // expect user files to be placed in higher levels in some extreme cases.
+ //
+ // If left empty, only one path will be used, which is db_name passed when
+ // opening the DB.
+ // Default: empty
+ std::vector<DbPath> db_paths;
+
+ // This specifies the info LOG dir.
+ // If it is empty, the log files will be in the same dir as data.
+ // If it is non empty, the log files will be in the specified dir,
+ // and the db data dir's absolute path will be used as the log file
+ // name's prefix.
+ std::string db_log_dir = "";
+
+ // This specifies the absolute dir path for write-ahead logs (WAL).
+ // If it is empty, the log files will be in the same dir as data,
+ // dbname is used as the data dir by default
+ // If it is non empty, the log files will be kept in the specified dir.
+ // When destroying the db,
+ // all log files in wal_dir and the dir itself are deleted
+ std::string wal_dir = "";
+
+ // The periodicity when obsolete files get deleted. The default
+ // value is 6 hours. The files that get out of scope by compaction
+ // process will still get automatically deleted on every compaction,
+ // regardless of this setting
+ //
+ // Default: 6 hours
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
+
+ // Maximum number of concurrent background jobs (compactions and flushes).
+ //
+ // Default: 2
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_background_jobs = 2;
+
+ // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // value of max_background_jobs. This option is ignored.
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int base_background_compactions = -1;
+
+ // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // value of max_background_jobs. For backwards compatibility we will set
+ // `max_background_jobs = max_background_compactions + max_background_flushes`
+ // in the case where user sets at least one of `max_background_compactions` or
+ // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
+ //
+ // Maximum number of concurrent background compaction jobs, submitted to
+ // the default LOW priority thread pool.
+ //
+ // If you're increasing this, also consider increasing number of threads in
+ // LOW priority thread pool. For more information, see
+ // Env::SetBackgroundThreads
+ //
+ // Default: -1
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ int max_background_compactions = -1;
+
+ // This value represents the maximum number of threads that will
+ // concurrently perform a compaction job by breaking it into multiple,
+ // smaller ones that are run simultaneously.
+ // Default: 1 (i.e. no subcompactions)
+ uint32_t max_subcompactions = 1;
+
+ // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // value of max_background_jobs. For backwards compatibility we will set
+ // `max_background_jobs = max_background_compactions + max_background_flushes`
+ // in the case where user sets at least one of `max_background_compactions` or
+ // `max_background_flushes`.
+ //
+ // Maximum number of concurrent background memtable flush jobs, submitted by
+ // default to the HIGH priority thread pool. If the HIGH priority thread pool
+ // is configured to have zero threads, flush jobs will share the LOW priority
+ // thread pool with compaction jobs.
+ //
+ // It is important to use both thread pools when the same Env is shared by
+ // multiple db instances. Without a separate pool, long running compaction
+ // jobs could potentially block memtable flush jobs of other db instances,
+ // leading to unnecessary Put stalls.
+ //
+ // If you're increasing this, also consider increasing number of threads in
+ // HIGH priority thread pool. For more information, see
+ // Env::SetBackgroundThreads
+ // Default: -1
+ int max_background_flushes = -1;
+
+ // Specify the maximal size of the info log file. If the log file
+ // is larger than `max_log_file_size`, a new info log file will
+ // be created.
+ // If max_log_file_size == 0, all logs will be written to one
+ // log file.
+ size_t max_log_file_size = 0;
+
+ // Time for the info log file to roll (in seconds).
+ // If specified with non-zero value, log file will be rolled
+ // if it has been active longer than `log_file_time_to_roll`.
+ // Default: 0 (disabled)
+ // Not supported in ROCKSDB_LITE mode!
+ size_t log_file_time_to_roll = 0;
+
+ // Maximal info log files to be kept.
+ // Default: 1000
+ size_t keep_log_file_num = 1000;
+
+ // Recycle log files.
+ // If non-zero, we will reuse previously written log files for new
+ // logs, overwriting the old data. The value indicates how many
+ // such files we will keep around at any point in time for later
+ // use. This is more efficient because the blocks are already
+ // allocated and fdatasync does not need to update the inode after
+ // each write.
+ // Default: 0
+ size_t recycle_log_file_num = 0;
+
+ // manifest file is rolled over on reaching this limit.
+ // The older manifest file will be deleted.
+ // The default value is 1GB so that the manifest file can grow, but not
+ // reach the limit of storage capacity.
+ uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+
+ // Number of shards used for table cache.
+ int table_cache_numshardbits = 6;
+
+ // NOT SUPPORTED ANYMORE
+ // int table_cache_remove_scan_count_limit;
+
+ // The following two fields affect how archived logs will be deleted.
+ // 1. If both set to 0, logs will be deleted asap and will not get into
+ // the archive.
+ // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+ // WAL files will be checked every 10 min and if total size is greater
+ // than WAL_size_limit_MB, they will be deleted starting with the
+ // earliest until size_limit is met. All empty files will be deleted.
+ // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+ // WAL files will be checked every WAL_ttl_seconds / 2 and those that
+ // are older than WAL_ttl_seconds will be deleted.
+ // 4. If both are not 0, WAL files will be checked every 10 min and both
+ // checks will be performed with ttl being first.
+ uint64_t WAL_ttl_seconds = 0;
+ uint64_t WAL_size_limit_MB = 0;
+
+ // Number of bytes to preallocate (via fallocate) the manifest
+ // files. Default is 4mb, which is reasonable to reduce random IO
+ // as well as prevent overallocation for mounts that preallocate
+ // large amounts of data (such as xfs's allocsize option).
+ size_t manifest_preallocation_size = 4 * 1024 * 1024;
+
+ // Allow the OS to mmap file for reading sst tables. Default: false
+ bool allow_mmap_reads = false;
+
+ // Allow the OS to mmap file for writing.
+ // DB::SyncWAL() only works if this is set to false.
+ // Default: false
+ bool allow_mmap_writes = false;
+
+ // Enable direct I/O mode for read/write
+ // they may or may not improve performance depending on the use case
+ //
+ // Files will be opened in "direct I/O" mode
+ // which means that data r/w from the disk will not be cached or
+ // buffered. The hardware buffer of the devices may however still
+ // be used. Memory mapped files are not impacted by these parameters.
+
+ // Use O_DIRECT for user and compaction reads.
+ // When true, we also force new_table_reader_for_compaction_inputs to true.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool use_direct_reads = false;
+
+ // Use O_DIRECT for writes in background flush and compactions.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool use_direct_io_for_flush_and_compaction = false;
+
+ // If false, fallocate() calls are bypassed
+ bool allow_fallocate = true;
+
+ // Disable child process inherit open files. Default: true
+ bool is_fd_close_on_exec = true;
+
+ // NOT SUPPORTED ANYMORE -- this option is no longer used
+ bool skip_log_error_on_recovery = false;
+
+ // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+ //
+ // Default: 600 (10 min)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ unsigned int stats_dump_period_sec = 600;
+
+ // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
+ // Default: 600
+ unsigned int stats_persist_period_sec = 600;
+
+ // If true, automatically persist stats to a hidden column family (column
+ // family name: ___rocksdb_stats_history___) every
+ // stats_persist_period_sec seconds; otherwise, write to an in-memory
+ // struct. User can query through `GetStatsHistory` API.
+ // If user attempts to create a column family with the same name on a DB
+ // which have previously set persist_stats_to_disk to true, the column family
+ // creation will fail, but the hidden column family will survive, as well as
+ // the previously persisted statistics.
+ // When persisting stats to disk, the stat name will be limited to 100 bytes.
+ // Default: false
+ bool persist_stats_to_disk = false;
+
+ // if not zero, periodically take stats snapshots and store in memory, the
+ // memory size for stats snapshots is capped at stats_history_buffer_size
+ // Default: 1MB
+ size_t stats_history_buffer_size = 1024 * 1024;
+
+ // If set true, will hint the underlying file system that the file
+ // access pattern is random, when a sst file is opened.
+ // Default: true
+ bool advise_random_on_open = true;
+
+ // Amount of data to build up in memtables across all column
+ // families before writing to disk.
+ //
+ // This is distinct from write_buffer_size, which enforces a limit
+ // for a single memtable.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: 0 (disabled)
+ size_t db_write_buffer_size = 0;
+
+ // The memory usage of memtable will report to this object. The same object
+ // can be passed into multiple DBs and it will track the sum of size of all
+ // the DBs. If the total size of all live memtables of all the DBs exceeds
+ // a limit, a flush will be triggered in the next DB to which the next write
+ // is issued.
+ //
+ // If the object is only passed to one DB, the behavior is the same as
+ // db_write_buffer_size. When write_buffer_manager is set, the value set will
+ // override db_write_buffer_size.
+ //
+ // This feature is disabled by default. Specify a non-zero value
+ // to enable it.
+ //
+ // Default: null
+ std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
+
+ // Specify the file access pattern once a compaction is started.
+ // It will be applied to all input files of a compaction.
+ // Default: NORMAL
+ enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
+ AccessHint access_hint_on_compaction_start = NORMAL;
+
+ // If true, always create a new file descriptor and new table reader
+ // for compaction inputs. Turn this parameter on may introduce extra
+ // memory usage in the table reader, if it allocates extra memory
+ // for indexes. This will allow file descriptor prefetch options
+ // to be set for compaction input files and not to impact file
+ // descriptors for the same file used by user queries.
+ // Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
+ // for this mode if using block-based table.
+ //
+ // Default: false
+ // This flag has no effect on the behavior of compaction and we plan to
+ // delete it in the future.
+ bool new_table_reader_for_compaction_inputs = false;
+
+ // If non-zero, we perform bigger reads when doing compaction. If you're
+ // running RocksDB on spinning disks, you should set this to at least 2MB.
+ // That way RocksDB's compaction is doing sequential instead of random reads.
+ //
+ // When non-zero, we also force new_table_reader_for_compaction_inputs to
+ // true.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t compaction_readahead_size = 0;
+
+ // This is a maximum buffer size that is used by WinMmapReadableFile in
+ // unbuffered disk I/O mode. We need to maintain an aligned buffer for
+ // reads. We allow the buffer to grow until the specified value and then
+ // for bigger requests allocate one shot buffers. In unbuffered mode we
+ // always bypass read-ahead buffer at ReadaheadRandomAccessFile
+ // When read-ahead is required we then make use of compaction_readahead_size
+ // value and always try to read ahead. With read-ahead we always
+ // pre-allocate buffer to the size instead of growing it up to a limit.
+ //
+ // This option is currently honored only on Windows
+ //
+ // Default: 1 Mb
+ //
+ // Special value: 0 - means do not maintain per instance buffer. Allocate
+ // per request buffer and avoid locking.
+ size_t random_access_max_buffer_size = 1024 * 1024;
+
+ // This is the maximum buffer size that is used by WritableFileWriter.
+ // On Windows, we need to maintain an aligned buffer for writes.
+ // We allow the buffer to grow until its size hits the limit in buffered
+ // IO and fix the buffer size when using direct IO to ensure alignment of
+ // write requests if the logical sector size is unusual
+ //
+ // Default: 1024 * 1024 (1 MB)
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ size_t writable_file_max_buffer_size = 1024 * 1024;
+
+ // Use adaptive mutex, which spins in the user space before resorting
+ // to kernel. This could reduce context switch when the mutex is not
+ // heavily contended. However, if the mutex is hot, we could end up
+ // wasting spin time.
+ // Default: false
+ bool use_adaptive_mutex = false;
+
+ // Create DBOptions with default values for all fields
+ DBOptions();
+ // Create DBOptions from Options
+ explicit DBOptions(const Options& options);
+
+ void Dump(Logger* log) const;
+
+ // Allows OS to incrementally sync files to disk while they are being
+ // written, asynchronously, in the background. This operation can be used
+ // to smooth out write I/Os over time. Users shouldn't rely on it for
+ // persistency guarantee.
+ // Issue one request for every bytes_per_sync written. 0 turns it off.
+ //
+ // You may consider using rate_limiter to regulate write rate to device.
+ // When rate limiter is enabled, it automatically enables bytes_per_sync
+ // to 1MB.
+ //
+ // This option applies to table files
+ //
+ // Default: 0, turned off
+ //
+ // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t bytes_per_sync = 0;
+
+ // Same as bytes_per_sync, but applies to WAL files
+ //
+ // Default: 0, turned off
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t wal_bytes_per_sync = 0;
+
+ // When true, guarantees WAL files have at most `wal_bytes_per_sync`
+ // bytes submitted for writeback at any given time, and SST files have at most
+ // `bytes_per_sync` bytes pending writeback at any given time. This can be
+ // used to handle cases where processing speed exceeds I/O speed during file
+ // generation, which can lead to a huge sync when the file is finished, even
+ // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
+ //
+ // - If `sync_file_range` is supported it achieves this by waiting for any
+ // prior `sync_file_range`s to finish before proceeding. In this way,
+ // processing (compression, etc.) can proceed uninhibited in the gap
+ // between `sync_file_range`s, and we block only when I/O falls behind.
+ // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+ // always blocks, thus preventing the interleaving of I/O and processing.
+ //
+ // Note: Enabling this option does not provide any additional persistence
+ // guarantees, as it may use `sync_file_range`, which does not write out
+ // metadata.
+ //
+ // Default: false
+ bool strict_bytes_per_sync = false;
+
+ // A vector of EventListeners whose callback functions will be called
+ // when specific RocksDB event happens.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // If true, then the status of the threads involved in this DB will
+ // be tracked and available via GetThreadList() API.
+ //
+ // Default: false
+ bool enable_thread_tracking = false;
+
+ // The limited write rate to DB if soft_pending_compaction_bytes_limit or
+ // level0_slowdown_writes_trigger is triggered, or we are writing to the
+ // last mem table allowed and we allow more than 3 mem tables. It is
+ // calculated using size of user write requests before compression.
+ // RocksDB may decide to slow down more if the compaction still
+ // gets behind further.
+ // If the value is 0, we will infer a value from `rate_limiter` value
+ // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
+ // if users change the rate in `rate_limiter` after DB is opened,
+ // `delayed_write_rate` won't be adjusted.
+ //
+ // Unit: byte per second.
+ //
+ // Default: 0
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ uint64_t delayed_write_rate = 0;
+
+ // By default, a single write thread queue is maintained. The thread that
+ // gets to the head of the queue becomes the write batch group leader and is
+ // responsible for writing to WAL and memtable for the batch group.
+ //
+ // If enable_pipelined_write is true, separate write thread queue is
+ // maintained for WAL write and memtable write. A write thread first enter WAL
+ // writer queue and then memtable writer queue. Pending thread on the WAL
+ // writer queue thus only have to wait for previous writers to finish their
+ // WAL writing but not the memtable writing. Enabling the feature may improve
+ // write throughput and reduce latency of the prepare phase of two-phase
+ // commit.
+ //
+ // Default: false
+ bool enable_pipelined_write = false;
+
+ // Setting unordered_write to true trades higher write throughput with
+ // relaxing the immutability guarantee of snapshots. This violates the
+ // repeatability one expects from ::Get from a snapshot, as well as
+ // ::MultiGet and Iterator's consistent-point-in-time view property.
+ // If the application cannot tolerate the relaxed guarantees, it can implement
+ // its own mechanisms to work around that and yet benefit from the higher
+ // throughput. Using TransactionDB with WRITE_PREPARED write policy and
+ // two_write_queues=true is one way to achieve immutable snapshots despite
+ // unordered_write.
+ //
+ // By default, i.e., when it is false, rocksdb does not advance the sequence
+ // number for new snapshots unless all the writes with lower sequence numbers
+ // are already finished. This provides the immutability that we expect from
+ // snapshots. Moreover, since Iterator and MultiGet internally depend on
+ // snapshots, the snapshot immutability results into Iterator and MultiGet
+ // offering consistent-point-in-time view. If set to true, although
+ // Read-Your-Own-Write property is still provided, the snapshot immutability
+ // property is relaxed: the writes issued after the snapshot is obtained (with
+ // larger sequence numbers) will be still not visible to the reads from that
+ // snapshot, however, there still might be pending writes (with lower sequence
+ // number) that will change the state visible to the snapshot after they are
+ // landed to the memtable.
+ //
+ // Default: false
+ bool unordered_write = false;
+
+ // If true, allow multi-writers to update mem tables in parallel.
+ // Only some memtable_factory-s support concurrent writes; currently it
+ // is implemented only for SkipListFactory. Concurrent memtable writes
+ // are not compatible with inplace_update_support or filter_deletes.
+ // It is strongly recommended to set enable_write_thread_adaptive_yield
+ // if you are going to use this feature.
+ //
+ // Default: true
+ bool allow_concurrent_memtable_write = true;
+
+ // If true, threads synchronizing with the write batch group leader will
+ // wait for up to write_thread_max_yield_usec before blocking on a mutex.
+ // This can substantially improve throughput for concurrent workloads,
+ // regardless of whether allow_concurrent_memtable_write is enabled.
+ //
+ // Default: true
+ bool enable_write_thread_adaptive_yield = true;
+
+ // The maximum limit of number of bytes that are written in a single batch
+ // of WAL or memtable write. It is followed when the leader write size
+ // is larger than 1/8 of this limit.
+ //
+ // Default: 1 MB
+ uint64_t max_write_batch_group_size_bytes = 1 << 20;
+
+ // The maximum number of microseconds that a write operation will use
+ // a yielding spin loop to coordinate with other write threads before
+ // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
+ // set properly) increasing this value is likely to increase RocksDB
+ // throughput at the expense of increased CPU usage.
+ //
+ // Default: 100
+ uint64_t write_thread_max_yield_usec = 100;
+
+ // The latency in microseconds after which a std::this_thread::yield
+ // call (sched_yield on Linux) is considered to be a signal that
+ // other processes or threads would like to use the current core.
+ // Increasing this makes writer threads more likely to take CPU
+ // by spinning, which will show up as an increase in the number of
+ // involuntary context switches.
+ //
+ // Default: 3
+ uint64_t write_thread_slow_yield_usec = 3;
+
+ // If true, then DB::Open() will not update the statistics used to optimize
+ // compaction decision by loading table properties from many files.
+ // Turning off this feature will improve DBOpen time especially in
+ // disk environment.
+ //
+ // Default: false
+ bool skip_stats_update_on_db_open = false;
+
+ // If true, then DB::Open() will not fetch and check sizes of all sst files.
+ // This may significantly speed up startup if there are many sst files,
+ // especially when using non-default Env with expensive GetFileSize().
+ // We'll still check that all required sst files exist.
+ // If paranoid_checks is false, this option is ignored, and sst files are
+ // not checked at all.
+ //
+ // Default: false
+ bool skip_checking_sst_file_sizes_on_db_open = false;
+
+ // Recovery mode to control the consistency while replaying WAL
+ // Default: kPointInTimeRecovery
+ WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ // if set to false then recovery will fail when a prepared
+ // transaction is encountered in the WAL
+ bool allow_2pc = false;
+
+ // A global cache for table-level rows.
+ // Default: nullptr (disabled)
+ // Not supported in ROCKSDB_LITE mode!
+ std::shared_ptr<Cache> row_cache = nullptr;
+
+#ifndef ROCKSDB_LITE
+ // A filter object supplied to be invoked while processing write-ahead-logs
+ // (WALs) during recovery. The filter provides a way to inspect log
+ // records, ignoring a particular record or skipping replay.
+ // The filter is invoked at startup and is invoked from a single-thread
+ // currently.
+ WalFilter* wal_filter = nullptr;
+#endif // ROCKSDB_LITE
+
+ // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
+ // / SetOptions will fail if options file is not detected or properly
+ // persisted.
+ //
+ // DEFAULT: false
+ bool fail_if_options_file_error = false;
+
+ // If true, then print malloc stats together with rocksdb.stats
+ // when printing to LOG.
+ // DEFAULT: false
+ bool dump_malloc_stats = false;
+
+ // By default RocksDB replays WAL logs and flushes them on DB open, which may
+ // create very small SST files. If this option is enabled, RocksDB will try
+ // to avoid (but not guarantee not to) flush during recovery. Also, existing
+ // WAL logs will be kept, so that if crash happened before flush, we still
+ // have logs to recover from.
+ //
+ // DEFAULT: false
+ bool avoid_flush_during_recovery = false;
+
+ // By default RocksDB will flush all memtables on DB close if there are
+ // unpersisted data (i.e. with WAL disabled). The flush can be skipped to speed up
+ // DB close. Unpersisted data WILL BE LOST.
+ //
+ // DEFAULT: false
+ //
+ // Dynamically changeable through SetDBOptions() API.
+ bool avoid_flush_during_shutdown = false;
+
+ // Set this option to true during creation of database if you want
+ // to be able to ingest behind (call IngestExternalFile() skipping keys
+ // that already exist, rather than overwriting matching keys).
+ // Setting this option to true has the following effects and requirements:
+ // 1) Disable some internal optimizations around SST file compression
+ // 2) Reserve bottom-most level for ingested files only.
+ // 3) Note that num_levels should be >= 3 if this option is turned on.
+ //
+ // DEFAULT: false
+ // Immutable.
+ bool allow_ingest_behind = false;
+
+ // Needed to support differential snapshots.
+ // If set to true then DB will only process deletes with sequence number
+ // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
+ // Clients are responsible to periodically call this method to advance
+ // the cutoff time. If this method is never called and preserve_deletes
+ // is set to true NO deletes will ever be processed.
+ // At the moment this only keeps normal deletes, SingleDeletes will
+ // not be preserved.
+ // DEFAULT: false
+ // Immutable (TODO: make it dynamically changeable)
+ bool preserve_deletes = false;
+
+ // If enabled it uses two queues for writes, one for the ones with
+ // disable_memtable and one for the ones that also write to memtable. This
+ // allows the memtable writes not to lag behind other writes. It can be used
+ // to optimize MySQL 2PC in which only the commits, which are serial, write to
+ // memtable.
+ bool two_write_queues = false;
+
+ // If true WAL is not flushed automatically after each write. Instead it
+ // relies on manual invocation of FlushWAL to write the WAL buffer to its
+ // file.
+ bool manual_wal_flush = false;
+
+ // If true, RocksDB supports flushing multiple column families and committing
+ // their results atomically to MANIFEST. Note that it is not
+ // necessary to set atomic_flush to true if WAL is always enabled since WAL
+ // allows the database to be restored to the last persistent state in WAL.
+ // This option is useful when there are column families with writes NOT
+ // protected by WAL.
+ // For manual flush, application has to specify which column families to
+ // flush atomically in DB::Flush.
+ // For auto-triggered flush, RocksDB atomically flushes ALL column families.
+ //
+ // Currently, any WAL-enabled writes after atomic flush may be replayed
+ // independently if the process crashes later and tries to recover.
+ bool atomic_flush = false;
+
+ // If true, working thread may avoid doing unnecessary and long-latency
+ // operation (such as deleting obsolete files directly or deleting memtable)
+ // and will instead schedule a background job to do it.
+ // Use it if you're latency-sensitive.
+ // If set to true, takes precedence over
+ // ReadOptions::background_purge_on_iterator_cleanup.
+ bool avoid_unnecessary_blocking_io = false;
+
+ // Historically DB ID has always been stored in Identity File in DB folder.
+ // If this flag is true, the DB ID is written to Manifest file in addition
+ // to the Identity file. By doing this 2 problems are solved
+ // 1. The Identity file is not checksummed, whereas the Manifest file is.
+ // 2. Since the source of truth for DB is Manifest file DB ID will sit with
+ // the source of truth. Previously the Identity file could be copied
+ // independent of Manifest and that can result in wrong DB ID.
+ // We recommend setting this flag to true.
+ // Default: false
+ bool write_dbid_to_manifest = false;
+
+ // The number of bytes to prefetch when reading the log. This is mostly useful
+ // for reading a remotely located log, as it can save the number of
+ // round-trips. If 0, then the prefetching is disabled.
+ //
+ // Default: 0
+ size_t log_readahead_size = 0;
+
+ // If user does NOT provide SST file checksum function, the SST file checksum
+ // will NOT be used. The single checksum instance is shared by options and
+ // file writers. Make sure the algorithm is thread safe.
+ //
+ // Default: nullptr
+ std::shared_ptr<FileChecksumFunc> sst_file_checksum_func = nullptr;
+};
+
+// Options to control the behavior of a database (passed to DB::Open).
+struct Options : public DBOptions, public ColumnFamilyOptions {
+ // Create an Options object with default values for all fields.
+ Options() : DBOptions(), ColumnFamilyOptions() {}
+
+ Options(const DBOptions& db_options,
+ const ColumnFamilyOptions& column_family_options)
+ : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+ // Reverts the options to their old defaults, as in version 4.6 by default.
+ Options* OldDefaults(int rocksdb_major_version = 4,
+ int rocksdb_minor_version = 6);
+
+ void Dump(Logger* log) const;
+
+ void DumpCFOptions(Logger* log) const;
+
+ // Some functions that make it easier to optimize RocksDB.
+
+ // Set appropriate parameters for bulk loading.
+ // The reason that this is a function that returns "this" instead of a
+ // constructor is to enable chaining of multiple similar calls in the future.
+ //
+
+ // All data will be in level 0 without any automatic compaction.
+ // It's recommended to manually call CompactRange(NULL, NULL) before reading
+ // from the database, because otherwise reads can be very slow.
+ Options* PrepareForBulkLoad();
+
+ // Use this if your DB is very small (like under 1GB) and you don't want to
+ // spend lots of memory on memtables.
+ Options* OptimizeForSmallDb();
+};
+
+//
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that already resides in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+ kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
+ kBlockCacheTier = 0x1, // data in memtable or block cache
+ kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option
+ // will skip data in memtable.
+ // Note that this ReadTier currently only supports
+ // Get and MultiGet and does not support iterators.
+ kMemtableTier = 0x3 // data in memtable. Used for memtable-only iterators.
+};
+
+// Options that control read operations
+struct ReadOptions {
+ // If "snapshot" is non-nullptr, read as of the supplied snapshot
+ // (which must belong to the DB that is being read and which must
+ // not have been released). If "snapshot" is nullptr, use an implicit
+ // snapshot of the state at the beginning of this read operation.
+ // Default: nullptr
+ const Snapshot* snapshot;
+
+ // `iterate_lower_bound` defines the smallest key at which the backward
+ // iterator can return an entry. Once the bound is passed, Valid() will be
+ // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a valid
+ // entry.
+ //
+ // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+ // need to have the same prefix. This is because ordering is not guaranteed
+ // outside of prefix domain.
+ //
+ // Default: nullptr
+ const Slice* iterate_lower_bound;
+
+ // "iterate_upper_bound" defines the extent up to which the forward iterator
+ // can return entries. Once the bound is reached, Valid() will be false.
+ // "iterate_upper_bound" is exclusive, i.e. the bound value is
+ // not a valid entry. If prefix_extractor is not null, the Seek target
+ // and iterate_upper_bound need to have the same prefix.
+ // This is because ordering is not guaranteed outside of prefix domain.
+ //
+ // Default: nullptr
+ const Slice* iterate_upper_bound;
+
+ // RocksDB does auto-readahead for iterators on noticing more than two reads
+ // for a table file. The readahead starts at 8KB and doubles on every
+ // additional read up to 256KB.
+ // This option can help if most of the range scans are large, and if it is
+ // determined that a larger readahead than that enabled by auto-readahead is
+ // needed.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ // Default: 0
+ size_t readahead_size;
+
+ // A threshold for the number of keys that can be skipped before failing an
+ // iterator seek as incomplete. The default value of 0 should be used to
+ // never fail a request as incomplete, even on skipping too many keys.
+ // Default: 0
+ uint64_t max_skippable_internal_keys;
+
+ // Specify if this read request should process data that ALREADY
+ // resides on a particular cache. If the required data is not
+ // found at the specified cache, then Status::Incomplete is returned.
+ // Default: kReadAllTier
+ ReadTier read_tier;
+
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ // Default: true
+ bool verify_checksums;
+
+ // Should the "data block"/"index block" read for this iteration be placed in
+ // block cache?
+ // Callers may wish to set this field to false for bulk scans.
+ // This helps to avoid changing the eviction order of existing items in the
+ // block cache. Default: true
+ bool fill_cache;
+
+ // Specify to create a tailing iterator -- a special iterator that has a
+ // view of the complete database (i.e. it can also be used to read newly
+ // added data) and is optimized for sequential reads. It will return records
+ // that were inserted into the database after the creation of the iterator.
+ // Default: false
+ // Not supported in ROCKSDB_LITE mode!
+ bool tailing;
+
+ // This option is not used anymore. It was to turn on a functionality that
+ // has been removed.
+ bool managed;
+
+ // Enable a total order seek regardless of index format (e.g. hash index)
+ // used in the table. Some table formats (e.g. plain table) may not support
+ // this option.
+ // If true when calling Get(), we also skip prefix bloom when reading from
+ // block based table. It provides a way to read existing data after
+ // changing implementation of prefix extractor.
+ bool total_order_seek;
+
+ // When true, by default use total_order_seek = true, and RocksDB can
+ // selectively enable prefix seek mode if it won't generate a different result
+ // from total_order_seek, based on seek key, and iterator upper bound.
+ // Not supported in ROCKSDB_LITE mode, in the way that even with value true
+ // prefix mode is not used.
+ bool auto_prefix_mode;
+
+ // Enforce that the iterator only iterates over the same prefix as the seek.
+ // This option is effective only for prefix seeks, i.e. prefix_extractor is
+ // non-null for the column family and total_order_seek is false. Unlike
+ // iterate_upper_bound, prefix_same_as_start only works within a prefix
+ // but in both directions.
+ // Default: false
+ bool prefix_same_as_start;
+
+ // Keep the blocks loaded by the iterator pinned in memory as long as the
+ // iterator is not deleted. If used when reading from tables created with
+ // BlockBasedTableOptions::use_delta_encoding = false,
+ // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
+ // return 1.
+ // Default: false
+ bool pin_data;
+
+ // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+ // schedule a background job in the flush job queue and delete obsolete files
+ // in background.
+ // Default: false
+ bool background_purge_on_iterator_cleanup;
+
+ // If true, keys deleted using the DeleteRange() API will be visible to
+ // readers until they are naturally deleted during compaction. This improves
+ // read performance in DBs with many range deletions.
+ // Default: false
+ bool ignore_range_deletions;
+
+ // A callback to determine whether relevant keys for this scan exist in a
+ // given table based on the table's properties. The callback is passed the
+ // properties of each table during iteration. If the callback returns false,
+ // the table will not be scanned. This option only affects Iterators and has
+ // no impact on point lookups.
+ // Default: empty (every table will be scanned)
+ std::function<bool(const TableProperties&)> table_filter;
+
+ // Needed to support differential snapshots. Has 2 effects:
+ // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
+ // 2) if this param > 0 iterator will return INTERNAL keys instead of
+ // user keys; e.g. return tombstones as well.
+ // Default: 0 (don't filter by seqnum, return user keys)
+ SequenceNumber iter_start_seqnum;
+
+ // Timestamp of operation. Read should return the latest data visible to the
+ // specified timestamp. All timestamps of the same database must be of the
+ // same length and format. The user is responsible for providing a customized
+ // compare function via Comparator to order <key, timestamp> tuples.
+ // The user-specified timestamp feature is still under active development,
+ // and the API is subject to change.
+ const Slice* timestamp;
+
+ ReadOptions();
+ ReadOptions(bool cksum, bool cache);
+};
+
+// Options that control write operations
+struct WriteOptions {
+ // If true, the write will be flushed from the operating system
+ // buffer cache (by calling WritableFile::Sync()) before the write
+ // is considered complete. If this flag is true, writes will be
+ // slower.
+ //
+ // If this flag is false, and the machine crashes, some recent
+ // writes may be lost. Note that if it is just the process that
+ // crashes (i.e., the machine does not reboot), no writes will be
+ // lost even if sync==false.
+ //
+ // In other words, a DB write with sync==false has similar
+ // crash semantics as the "write()" system call. A DB write
+ // with sync==true has similar crash semantics to a "write()"
+ // system call followed by "fdatasync()".
+ //
+ // Default: false
+ bool sync;
+
+ // If true, writes will not first go to the write ahead log,
+ // and the write may get lost after a crash. The backup engine
+ // relies on write-ahead logs to back up the memtable, so if
+ // you disable write-ahead logs, you must create backups with
+ // flush_before_backup=true to avoid losing unflushed memtable data.
+ // Default: false
+ bool disableWAL;
+
+ // If true and if user is trying to write to column families that don't exist
+ // (they were dropped), ignore the write (don't return an error). If there
+ // are multiple writes in a WriteBatch, other writes will succeed.
+ // Default: false
+ bool ignore_missing_column_families;
+
+ // If true and we need to wait or sleep for the write request, fails
+ // immediately with Status::Incomplete().
+ // Default: false
+ bool no_slowdown;
+
+ // If true, this write request is of lower priority if compaction is
+ // behind. In this case, if no_slowdown = true, the request will be cancelled
+ // immediately with Status::Incomplete() returned. Otherwise, it will be
+ // slowed down. The slowdown value is determined by RocksDB to guarantee
+ // it introduces minimal impact on high priority writes.
+ //
+ // Default: false
+ bool low_pri;
+
+ // If true, this writebatch will maintain the last insert positions of each
+ // memtable as hints in concurrent write. It can improve write performance
+ // in concurrent writes if keys in one writebatch are sequential. In
+ // non-concurrent writes (when allow_concurrent_memtable_write is false) this
+ // option will be ignored.
+ //
+ // Default: false
+ bool memtable_insert_hint_per_batch;
+
+ // Timestamp of write operation, e.g. Put. All timestamps of the same
+ // database must share the same length and format. The user is also
+ // responsible for providing a customized compare function via Comparator to
+ // order <key, timestamp> tuples. If the user wants to enable timestamp, then
+ // all write operations must be associated with timestamp because RocksDB, as
+ // a single-node storage engine, currently has no knowledge of global time,
+ // thus has to rely on the application.
+ // The user-specified timestamp feature is still under active development,
+ // and the API is subject to change.
+ const Slice* timestamp;
+
+ WriteOptions()
+ : sync(false),
+ disableWAL(false),
+ ignore_missing_column_families(false),
+ no_slowdown(false),
+ low_pri(false),
+ memtable_insert_hint_per_batch(false),
+ timestamp(nullptr) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+ // If true, the call will wait until the flush is done.
+ // Default: true
+ bool wait;
+ // If true, the flush would proceed immediately even if it means writes will
+ // stall for the duration of the flush; if false the operation will wait
+ // until it's possible to do the flush w/o causing a stall or until the required
+ // flush is performed by someone else (foreground call or background thread).
+ // Default: false
+ bool allow_write_stall;
+ FlushOptions() : wait(true), allow_write_stall(false) {}
+};
+
+// Create a Logger from the provided DBOptions (presumably returned via `logger`).
+extern Status CreateLoggerFromOptions(const std::string& dbname,
+ const DBOptions& options,
+ std::shared_ptr<Logger>* logger);
+
+// CompactionOptions is used by the CompactFiles() call.
+struct CompactionOptions {
+ // Compaction output compression type
+ // Default: kSnappyCompression
+ // If set to `kDisableCompressionOption`, RocksDB will choose compression type
+ // according to the `ColumnFamilyOptions`, taking into account the output
+ // level if `compression_per_level` is specified.
+ CompressionType compression;
+ // Compaction will create files of size `output_file_size_limit`.
+ // Default: max uint64_t, which means that compaction will create a single file
+ uint64_t output_file_size_limit;
+ // If > 0, replaces the option in the DBOptions for this compaction. Default: 0
+ uint32_t max_subcompactions;
+
+ CompactionOptions()
+ : compression(kSnappyCompression),
+ output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+ max_subcompactions(0) {}
+};
+
+// For level based compaction, we can configure whether to skip or force
+// bottommost level compaction.
+enum class BottommostLevelCompaction {
+ // Skip bottommost level compaction.
+ kSkip,
+ // Only compact bottommost level if there is a compaction filter.
+ // This is the default option.
+ kIfHaveCompactionFilter,
+ // Always compact bottommost level.
+ kForce,
+ // Always compact bottommost level but, in the bottommost level, avoid
+ // double-compacting files created in the same compaction.
+ kForceOptimized,
+};
+
+// CompactRangeOptions is used by the CompactRange() call.
+struct CompactRangeOptions {
+ // If true, no other compaction will run at the same time as this
+ // manual compaction
+ bool exclusive_manual_compaction = true;
+ // If true, compacted files will be moved to the minimum level capable
+ // of holding the data or given level (specified non-negative target_level).
+ bool change_level = false;
+ // If change_level is true and target_level has a non-negative value, compacted
+ // files will be moved to target_level.
+ int target_level = -1;
+ // Compaction outputs will be placed in options.db_paths[target_path_id].
+ // Behavior is undefined if target_path_id is out of range.
+ uint32_t target_path_id = 0;
+ // By default, level based compaction will only compact the bottommost level
+ // if there is a compaction filter
+ BottommostLevelCompaction bottommost_level_compaction =
+ BottommostLevelCompaction::kIfHaveCompactionFilter;
+ // If true, will execute immediately even if doing so would cause the DB to
+ // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+ bool allow_write_stall = false;
+ // If > 0, it will replace the option in the DBOptions for this compaction.
+ uint32_t max_subcompactions = 0;
+};
+
+// IngestExternalFileOptions is used by IngestExternalFile()
+struct IngestExternalFileOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+ // If set to true, ingestion falls back to copy when move fails.
+ bool failed_move_fall_back_to_copy = true;
+ // If set to false, an ingested file's keys could appear in existing snapshots
+ // that were created before the file was ingested.
+ bool snapshot_consistency = true;
+ // If set to false, IngestExternalFile() will fail if the file key range
+ // overlaps with existing keys or tombstones in the DB.
+ bool allow_global_seqno = true;
+ // If set to false and the file key range overlaps with the memtable key range
+ // (memtable flush required), IngestExternalFile will fail.
+ bool allow_blocking_flush = true;
+ // Set to true if you would like duplicate keys in the file being ingested
+ // to be skipped rather than overwriting existing data under that key.
+ // Use case: back-fill of some historical data in the database without
+ // over-writing existing newer version of data.
+ // This option can only be used if the DB has been running
+ // with allow_ingest_behind=true since the dawn of time.
+ // All files will be ingested at the bottommost level with seqno=0.
+ bool ingest_behind = false;
+ // Set to true if you would like to write global_seqno to a given offset in
+ // the external SST file for backward compatibility. Older versions of
+ // RocksDB write a global_seqno to a given offset within ingested SST files,
+ // and new versions of RocksDB do not. If you ingest an external SST using
+ // a new version of RocksDB and would like to be able to downgrade to an
+ // older version of RocksDB, you should set 'write_global_seqno' to true. If
+ // your service is just starting to use the new RocksDB, we recommend that
+ // you set this option to false, which brings two benefits:
+ // 1. No extra random write for global_seqno during ingestion.
+ // 2. Without writing external SST file, it's possible to do checksum.
+ // We have a plan to set this option to false by default in the future.
+ bool write_global_seqno = true;
+ // Set to true if you would like to verify the checksums of each block of the
+ // external SST file before ingestion.
+ // Warning: setting this to true causes slowdown in file ingestion because
+ // the external SST file has to be read.
+ bool verify_checksums_before_ingest = false;
+ // When verify_checksums_before_ingest = true, RocksDB uses the default
+ // readahead setting to scan the file while verifying checksums before
+ // ingestion.
+ // Users can override the default value using this option.
+ // Using a large readahead size (> 2MB) can typically improve the performance
+ // of forward iteration on spinning disks.
+ size_t verify_checksums_readahead_size = 0;
+};
+
+enum TraceFilterType : uint64_t {
+ // Trace all the operations (nothing is filtered out)
+ kTraceFilterNone = 0x0,
+ // Do not trace the get operations
+ kTraceFilterGet = 0x1 << 0,
+ // Do not trace the write operations
+ kTraceFilterWrite = 0x1 << 1
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+ // To prevent the trace file from growing larger than the storage space, the
+ // user can set the max trace file size in bytes. Default is 64GB
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+ // Specify trace sampling option, i.e. capture one per how many requests.
+ // Default to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+ // Filter to apply (see TraceFilterType). Note: filtering happens before sampling.
+ uint64_t filter = kTraceFilterNone;
+};
+
+// ImportColumnFamilyOptions is used by ImportColumnFamily()
+struct ImportColumnFamilyOptions {
+ // Can be set to true to move the files instead of copying them. Default: false
+ bool move_files = false;
+};
+
+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+ // Defines whether the returned size should include the recently written
+ // data in the mem-tables. If set to false, include_files must be true.
+ bool include_memtabtles = false; // NOTE(review): name is spelled "memtabtles"
+ // Defines whether the returned size should include data serialized to disk.
+ // If set to false, include_memtabtles must be true.
+ bool include_files = true;
+ // When approximating the total size of the files used to store a key range
+ // using DB::GetApproximateSizes, allow approximation with an error margin of
+ // up to total_files_size * files_size_error_margin. This allows taking some
+ // shortcuts in files size approximation, resulting in better performance,
+ // while guaranteeing the resulting error is within a reasonable margin.
+ // E.g., if the value is 0.1, then the error margin of the returned files size
+ // approximation will be within 10%.
+ // If the value is non-positive, a more precise yet more CPU intensive
+ // estimation is performed.
+ double files_size_error_margin = -1.0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
new file mode 100644
index 000000000..123a21bc9
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -0,0 +1,232 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <map>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A thread local context for gathering performance counter efficiently
+// and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+// Break down performance counters by level and store per-level perf context in
+// PerfContextByLevel
+struct PerfContextByLevel {
+ // # of times bloom filter has avoided file reads, i.e., negatives.
+ uint64_t bloom_filter_useful = 0;
+ // # of times bloom FullFilter has not avoided the reads.
+ uint64_t bloom_filter_full_positive = 0;
+ // # of times bloom FullFilter has not avoided the reads and data actually
+ // exist.
+ uint64_t bloom_filter_full_true_positive = 0;
+
+ // total number of user keys returned (only includes keys that are found;
+ // does not include keys that are deleted or merged without a final put)
+ uint64_t user_key_return_count = 0;
+
+ // total nanos spent on reading data from SST files
+ uint64_t get_from_table_nanos = 0;
+
+ uint64_t block_cache_hit_count = 0; // total number of block cache hits
+ uint64_t block_cache_miss_count = 0; // total number of block cache misses
+
+ void Reset(); // reset all performance counters to zero
+};
+
+struct PerfContext {
+ ~PerfContext();
+
+ PerfContext() {} // NOTE(review): empty body; the counters declared below without initializers are not zeroed here
+
+ PerfContext(const PerfContext&);
+ PerfContext& operator=(const PerfContext&);
+ PerfContext(PerfContext&&) noexcept;
+
+ void Reset(); // reset all performance counters to zero
+
+ std::string ToString(bool exclude_zero_counters = false) const;
+
+ // enable per level perf context and allocate storage for PerfContextByLevel
+ void EnablePerLevelPerfContext();
+
+ // temporarily disable per level perf context by setting the flag to false
+ void DisablePerLevelPerfContext();
+
+ // free the space for PerfContextByLevel, also disable per level perf context
+ void ClearPerLevelPerfContext();
+
+ uint64_t user_key_comparison_count; // total number of user key comparisons
+ uint64_t block_cache_hit_count; // total number of block cache hits
+ uint64_t block_read_count; // total number of block reads (with IO)
+ uint64_t block_read_byte; // total number of bytes from block reads
+ uint64_t block_read_time; // total nanos spent on block reads
+ uint64_t block_cache_index_hit_count; // total number of index block hits
+ uint64_t index_block_read_count; // total number of index block reads
+ uint64_t block_cache_filter_hit_count; // total number of filter block hits
+ uint64_t filter_block_read_count; // total number of filter block reads
+ uint64_t compression_dict_block_read_count; // total number of compression
+ // dictionary block reads
+ uint64_t block_checksum_time; // total nanos spent on block checksum
+ uint64_t block_decompress_time; // total nanos spent on block decompression
+
+ uint64_t get_read_bytes; // bytes for vals returned by Get
+ uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet
+ uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator
+
+ // total number of internal keys skipped over during iteration.
+ // There are several reasons for it:
+ // 1. when calling Next(), the iterator is in the position of the previous
+ // key, so that we'll need to skip it. It means this counter will always
+ // be incremented in Next().
+ // 2. when calling Next(), we need to skip internal entries for the previous
+ // keys that are overwritten.
+ // 3. when calling Next(), Seek() or SeekToFirst(), starting from the
+ // previous key for Next(), the seek key for Seek() or the beginning for
+ // SeekToFirst(), there may be one or more deleted keys before the next
+ // valid key that the operation should place the iterator at. We need
+ // to skip both the tombstones and the updates hidden by them. The
+ // tombstones are not included in this counter, while previous updates
+ // hidden by the tombstones will be included here.
+ // 4. symmetric cases for Prev() and SeekToLast()
+ // internal_recent_skipped_count is not included in this counter.
+ //
+ uint64_t internal_key_skipped_count;
+ // Total number of deletes and single deletes skipped over during iteration.
+ // When calling Next(), Seek() or SeekToFirst(), starting from the previous
+ // position for Next(), the seek key for Seek() or the beginning for
+ // SeekToFirst(), there may be one or more deleted keys before the next valid
+ // key. Every deleted key is counted once. We don't recount here if there are
+ // still older updates invalidated by the tombstones.
+ //
+ uint64_t internal_delete_skipped_count;
+ // How many times iterators skipped over internal keys that are more recent
+ // than the snapshot that iterator is using.
+ //
+ uint64_t internal_recent_skipped_count;
+ // How many values were fed into merge operator by iterators.
+ //
+ uint64_t internal_merge_count;
+
+ uint64_t get_snapshot_time; // total nanos spent on getting snapshot
+ uint64_t get_from_memtable_time; // total nanos spent on querying memtables
+ uint64_t get_from_memtable_count; // number of mem tables queried
+ // total nanos spent after Get() finds a key
+ uint64_t get_post_process_time;
+ uint64_t get_from_output_files_time; // total nanos reading from output files
+ // total nanos spent on seeking memtable
+ uint64_t seek_on_memtable_time;
+ // number of seeks issued on memtable
+ // (including SeekForPrev but not SeekToFirst and SeekToLast)
+ uint64_t seek_on_memtable_count;
+ // number of Next()s issued on memtable
+ uint64_t next_on_memtable_count;
+ // number of Prev()s issued on memtable
+ uint64_t prev_on_memtable_count;
+ // total nanos spent on seeking child iters
+ uint64_t seek_child_seek_time;
+ // number of seeks issued in child iterators
+ uint64_t seek_child_seek_count;
+ uint64_t seek_min_heap_time; // total nanos spent on the merge min heap
+ uint64_t seek_max_heap_time; // total nanos spent on the merge max heap
+ // total nanos spent on seeking the internal entries
+ uint64_t seek_internal_seek_time;
+ // total nanos spent on iterating internal entries to find the next user entry
+ uint64_t find_next_user_entry_time;
+
+ // This group of stats provides a breakdown of time spent by Write().
+ // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
+ // are enabled.
+ //
+ // total nanos spent on writing to WAL
+ uint64_t write_wal_time;
+ // total nanos spent on writing to mem tables
+ uint64_t write_memtable_time;
+ // total nanos spent on delaying or throttling write
+ uint64_t write_delay_time;
+ // total nanos spent on switching memtable/wal and scheduling
+ // flushes/compactions.
+ uint64_t write_scheduling_flushes_compactions_time;
+ // total nanos spent on writing a record, excluding the above four things
+ uint64_t write_pre_and_post_process_time;
+
+ // time spent waiting for other threads of the batch group
+ uint64_t write_thread_wait_nanos;
+
+ // time spent on acquiring DB mutex.
+ uint64_t db_mutex_lock_nanos;
+ // Time spent on waiting with a condition variable created with DB mutex.
+ uint64_t db_condition_wait_nanos;
+ // Time spent on merge operator.
+ uint64_t merge_operator_time_nanos;
+
+ // Time spent on reading index block from block cache or SST file
+ uint64_t read_index_block_nanos;
+ // Time spent on reading filter block from block cache or SST file
+ uint64_t read_filter_block_nanos;
+ // Time spent on creating data block iterator
+ uint64_t new_table_block_iter_nanos;
+ // Time spent on creating an iterator of an SST file.
+ uint64_t new_table_iterator_nanos;
+ // Time spent on seeking a key in data/index blocks
+ uint64_t block_seek_nanos;
+ // Time spent on finding or creating a table reader
+ uint64_t find_table_nanos;
+ // total number of mem table bloom hits
+ uint64_t bloom_memtable_hit_count;
+ // total number of mem table bloom misses
+ uint64_t bloom_memtable_miss_count;
+ // total number of SST table bloom hits
+ uint64_t bloom_sst_hit_count;
+ // total number of SST table bloom misses
+ uint64_t bloom_sst_miss_count;
+
+ // Time spent waiting on key locks in transaction lock manager.
+ uint64_t key_lock_wait_time;
+ // number of times acquiring a lock was blocked by another transaction.
+ uint64_t key_lock_wait_count;
+
+ // Total time spent in Env filesystem operations. These are only populated
+ // when TimedEnv is used.
+ uint64_t env_new_sequential_file_nanos;
+ uint64_t env_new_random_access_file_nanos;
+ uint64_t env_new_writable_file_nanos;
+ uint64_t env_reuse_writable_file_nanos;
+ uint64_t env_new_random_rw_file_nanos;
+ uint64_t env_new_directory_nanos;
+ uint64_t env_file_exists_nanos;
+ uint64_t env_get_children_nanos;
+ uint64_t env_get_children_file_attributes_nanos;
+ uint64_t env_delete_file_nanos;
+ uint64_t env_create_dir_nanos;
+ uint64_t env_create_dir_if_missing_nanos;
+ uint64_t env_delete_dir_nanos;
+ uint64_t env_get_file_size_nanos;
+ uint64_t env_get_file_modification_time_nanos;
+ uint64_t env_rename_file_nanos;
+ uint64_t env_link_file_nanos;
+ uint64_t env_lock_file_nanos;
+ uint64_t env_unlock_file_nanos;
+ uint64_t env_new_logger_nanos;
+
+ uint64_t get_cpu_nanos;
+ uint64_t iter_next_cpu_nanos;
+ uint64_t iter_prev_cpu_nanos;
+ uint64_t iter_seek_cpu_nanos;
+
+ std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
+ bool per_level_perf_context_enabled = false;
+};
+
+// Get Thread-local PerfContext object pointer
+// if defined(NPERF_CONTEXT), then the pointer is not thread-local
+PerfContext* get_perf_context();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h
new file mode 100644
index 000000000..e6a768904
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_level.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// How much perf stats to collect. Affects perf_context and iostats_context.
+enum PerfLevel : unsigned char {
+ kUninitialized = 0, // unknown setting
+ kDisable = 1, // disable perf stats
+ kEnableCount = 2, // enable only count stats
+ kEnableTimeExceptForMutex = 3, // In addition to count stats, also enable
+ // time stats, except for mutexes
+ // In addition to time stats, also measure CPU time counters. Still does not
+ // measure time (neither wall time nor CPU time) for mutexes.
+ kEnableTimeAndCPUTimeExceptForMutex = 4,
+ kEnableTime = 5, // enable count and time stats
+ kOutOfBounds = 6 // N.B. Must always be the last value!
+};
+
+// set the perf stats level for current thread
+void SetPerfLevel(PerfLevel level);
+
+// get current perf stats level for current thread
+PerfLevel GetPerfLevel();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h
new file mode 100644
index 000000000..9651812c8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/persistent_cache.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCache
+//
+// Persistent cache interface for caching IO pages on a persistent medium. The
+// cache interface is specifically designed for persistent read cache.
+class PersistentCache {
+ public:
+ typedef std::vector<std::map<std::string, double>> StatsType;
+
+ virtual ~PersistentCache() {}
+
+ // Insert to page cache
+ //
+ // key Identifier to identify a page uniquely across restarts
+ // data Page data
+ // size Size of the page
+ virtual Status Insert(const Slice& key, const char* data,
+ const size_t size) = 0;
+
+ // Lookup page cache by page identifier
+ //
+ // key Page identifier
+ // data Buffer where the data should be copied
+ // size Size of the page
+ virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) = 0;
+
+ // Is the cache storing compressed data?
+ // Presumably true if the cache is configured to store compressed data, else
+ // false. NOTE(review): this comment used to say "uncompressed"; confirm.
+ virtual bool IsCompressed() = 0;
+
+ // Return stats as map of {string, double} per-tier
+ //
+ // Persistent cache can be initialized as a tier of caches. The stats are
+ // listed per tier, top-down
+ virtual StatsType Stats() = 0;
+
+ virtual std::string GetPrintableOptions() const = 0;
+};
+
+// Factory method to create a new persistent cache
+Status NewPersistentCache(Env* const env, const std::string& path,
+ const uint64_t size,
+ const std::shared_ptr<Logger>& log,
+ const bool optimized_for_nvm,
+ std::shared_ptr<PersistentCache>* cache);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
new file mode 100644
index 000000000..0ee89f5c8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RateLimiter {
+ public:
+ enum class OpType {
+ // Limitation: we currently only invoke Request() with OpType::kRead for
+ // compactions when DBOptions::new_table_reader_for_compaction_inputs is set
+ kRead,
+ kWrite,
+ };
+ enum class Mode {
+ kReadsOnly,
+ kWritesOnly,
+ kAllIo,
+ };
+
+ // For API compatibility, default to rate-limiting writes only.
+ explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {}
+
+ virtual ~RateLimiter() {}
+
+ // Dynamically changes the rate limiter's bytes per second.
+ // REQUIRED: bytes_per_second > 0
+ virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+ // Deprecated. New RateLimiter derived classes should override
+ // Request(const int64_t, const Env::IOPriority, Statistics*) or
+ // Request(const int64_t, const Env::IOPriority, Statistics*, OpType)
+ // instead.
+ //
+ // Request tokens for bytes. If this request cannot be satisfied, the call
+ // is blocked. Caller is responsible to make sure
+ // bytes <= GetSingleBurstBytes()
+ virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
+ assert(false); // default body of the deprecated overload: asserts if reached without an override
+ }
+
+ // Request tokens for bytes and potentially update statistics. If this
+ // request cannot be satisfied, the call is blocked. Caller is responsible to
+ // make sure bytes <= GetSingleBurstBytes().
+ virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* /* stats */) {
+ // For API compatibility, default implementation calls the older API in
+ // which statistics are unsupported.
+ Request(bytes, pri);
+ }
+
+ // Requests tokens to read or write bytes and potentially updates statistics.
+ //
+ // If this request cannot be satisfied, the call is blocked. Caller is
+ // responsible to make sure bytes <= GetSingleBurstBytes().
+ virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* stats, OpType op_type) {
+ if (IsRateLimited(op_type)) { // op types excluded by the mode are not throttled
+ Request(bytes, pri, stats);
+ }
+ }
+
+ // Requests tokens to read or write bytes and potentially updates statistics.
+ // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of
+ // direct I/O) to allocate an appropriate number of bytes, which may be less
+ // than the number of bytes requested.
+ virtual size_t RequestToken(size_t bytes, size_t alignment,
+ Env::IOPriority io_priority, Statistics* stats,
+ RateLimiter::OpType op_type);
+
+ // Max bytes that can be granted in a single burst
+ virtual int64_t GetSingleBurstBytes() const = 0;
+
+ // Total bytes that go through rate limiter
+ virtual int64_t GetTotalBytesThrough(
+ const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+ // Total # of requests that go through rate limiter
+ virtual int64_t GetTotalRequests(
+ const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+ virtual int64_t GetBytesPerSecond() const = 0;
+
+ virtual bool IsRateLimited(OpType op_type) {
+ if ((mode_ == RateLimiter::Mode::kWritesOnly &&
+ op_type == RateLimiter::OpType::kRead) ||
+ (mode_ == RateLimiter::Mode::kReadsOnly &&
+ op_type == RateLimiter::OpType::kWrite)) {
+ return false; // reads under kWritesOnly, or writes under kReadsOnly
+ }
+ return true;
+ }
+
+ protected:
+ Mode GetMode() { return mode_; }
+
+ private:
+ const Mode mode_;
+};
+
+// Create a RateLimiter object, which can be shared among RocksDB instances to
+// control write rate of flush and compaction.
+// @rate_bytes_per_sec: this is the only parameter you want to set most of the
+// time. It controls the total write rate of compaction and flush in bytes per
+// second. Currently, RocksDB does not enforce rate limit for anything other
+// than flush and compaction, e.g. write to WAL.
+// @refill_period_us: this controls how often tokens are refilled. For example,
+// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
+// burstier writes while smaller value introduces more CPU overhead.
+// The default should work for most cases.
+// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
+// A low-pri request is usually blocked in favor of hi-pri request. Currently,
+// RocksDB assigns low-pri to request from compaction and high-pri to request
+// from flush. Low-pri requests can get blocked if flush requests come in
+// continuously. This fairness parameter grants low-pri requests permission by
+// 1/fairness chance even though high-pri requests exist to avoid starvation.
+// You should be good by leaving it at default 10.
+// @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of rate limit within the range
+// `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+// the recent demand for background I/O.
+extern RateLimiter* NewGenericRateLimiter(
+ int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
+ int32_t fairness = 10,
+ RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+ bool auto_tuned = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
new file mode 100644
index 000000000..e9f8620d0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_NAMESPACE
+#define ROCKSDB_NAMESPACE rocksdb
+#endif
diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h
new file mode 100644
index 000000000..c17b32c5c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice.h
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size. The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <cstdio>
+#include <string>
+
+#ifdef __cpp_lib_string_view
+#include <string_view>
+#endif
+
+#include "rocksdb/cleanable.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice {
+ public:
+ // Create an empty slice.
+ Slice() : data_(""), size_(0) {}
+
+ // Create a slice that refers to d[0,n-1].
+ Slice(const char* d, size_t n) : data_(d), size_(n) {}
+
+ // Create a slice that refers to the contents of "s"
+ /* implicit */
+ Slice(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+#ifdef __cpp_lib_string_view
+ // Create a slice that refers to the same contents as "sv"
+ /* implicit */
+ Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {}
+#endif
+
+ // Create a slice that refers to s[0,strlen(s)-1]; a null "s" gives size 0
+ /* implicit */
+ Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); }
+
+ // Create a single slice from SliceParts using buf as storage.
+ // buf must exist as long as the returned Slice exists.
+ Slice(const struct SliceParts& parts, std::string* buf);
+
+ // Return a pointer to the beginning of the referenced data
+ const char* data() const { return data_; }
+
+ // Return the length (in bytes) of the referenced data
+ size_t size() const { return size_; }
+
+ // Return true iff the length of the referenced data is zero
+ bool empty() const { return size_ == 0; }
+
+ // Return the nth byte in the referenced data.
+ // REQUIRES: n < size()
+ char operator[](size_t n) const {
+ assert(n < size());
+ return data_[n];
+ }
+
+ // Change this slice to refer to an empty array
+ void clear() {
+ data_ = "";
+ size_ = 0;
+ }
+
+ // Drop the first "n" bytes from this slice.
+ void remove_prefix(size_t n) {
+ assert(n <= size());
+ data_ += n;
+ size_ -= n;
+ }
+
+ void remove_suffix(size_t n) { // Drop the last "n" bytes from this slice.
+ assert(n <= size());
+ size_ -= n;
+ }
+
+ // Return a string that contains the copy of the referenced data.
+ // when hex is true, returns a string of twice the length hex encoded (0-9A-F)
+ std::string ToString(bool hex = false) const;
+
+#ifdef __cpp_lib_string_view
+ // Return a string_view that references the same data as this slice.
+ std::string_view ToStringView() const {
+ return std::string_view(data_, size_);
+ }
+#endif
+
+ // Decodes the current slice, interpreted as a hexadecimal string, into result.
+ // Returns true if successful; returns false if this isn't a valid hex string
+ // (e.g. not coming from Slice::ToString(true)).
+ // This slice is expected to have an even number of 0-9A-F characters;
+ // lowercase (a-f) is also accepted.
+ bool DecodeHex(std::string* result) const;
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "*this" < "b",
+ // == 0 iff "*this" == "b",
+ // > 0 iff "*this" > "b"
+ int compare(const Slice& b) const;
+
+ // Return true iff "x" is a prefix of "*this"
+ bool starts_with(const Slice& x) const {
+ return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
+ }
+
+ bool ends_with(const Slice& x) const { // true iff "x" is a suffix of "*this"
+ return ((size_ >= x.size_) &&
+ (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0));
+ }
+
+ // Return the offset of the first byte where the two slices differ
+ size_t difference_offset(const Slice& b) const;
+
+ // Would be private; kept public for rocksdbjni access
+ const char* data_;
+ size_t size_;
+
+ // Intentionally copyable
+};
+
+/**
+ * A Slice that can be pinned with some cleanup tasks, which will be run upon
+ * ::Reset() or object destruction, whichever is invoked first. This can be used
+ * to avoid memcpy by having the PinnableSlice object referring to the data
+ * that is locked in the memory and release them after the data is consumed.
+ */
+class PinnableSlice : public Slice, public Cleanable {
+ public:
+ PinnableSlice() { buf_ = &self_space_; } // backed by the internal string
+ explicit PinnableSlice(std::string* buf) { buf_ = buf; } // backed by *buf
+
+ PinnableSlice(PinnableSlice&& other);
+ PinnableSlice& operator=(PinnableSlice&& other);
+
+ // No copy constructor and copy assignment allowed.
+ PinnableSlice(PinnableSlice&) = delete;
+ PinnableSlice& operator=(PinnableSlice&) = delete;
+
+ inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
+ void* arg2) {
+ assert(!pinned_); // must not already be pinned
+ pinned_ = true;
+ data_ = s.data(); // refer to the caller's data; no copy is made
+ size_ = s.size();
+ RegisterCleanup(f, arg1, arg2);
+ assert(pinned_);
+ }
+
+ inline void PinSlice(const Slice& s, Cleanable* cleanable) {
+ assert(!pinned_); // must not already be pinned
+ pinned_ = true;
+ data_ = s.data(); // refer to the caller's data; no copy is made
+ size_ = s.size();
+ cleanable->DelegateCleanupsTo(this);
+ assert(pinned_);
+ }
+
+ inline void PinSelf(const Slice& slice) {
+ assert(!pinned_);
+ buf_->assign(slice.data(), slice.size()); // copy the bytes into the buffer
+ data_ = buf_->data();
+ size_ = buf_->size();
+ assert(!pinned_); // pinned_ is not set on this path
+ }
+
+ inline void PinSelf() {
+ assert(!pinned_);
+ data_ = buf_->data(); // point at whatever the buffer currently holds
+ size_ = buf_->size();
+ assert(!pinned_);
+ }
+
+ void remove_suffix(size_t n) {
+ assert(n <= size());
+ if (pinned_) {
+ size_ -= n;
+ } else {
+ buf_->erase(size() - n, n); // trim the buffer itself, then re-point at it
+ PinSelf();
+ }
+ }
+
+ void remove_prefix(size_t n) {
+ assert(n <= size());
+ if (pinned_) {
+ data_ += n;
+ size_ -= n;
+ } else {
+ buf_->erase(0, n); // trim the buffer itself, then re-point at it
+ PinSelf();
+ }
+ }
+
+ void Reset() {
+ Cleanable::Reset();
+ pinned_ = false;
+ size_ = 0; // data_ is left unchanged
+ }
+
+ inline std::string* GetSelf() { return buf_; }
+
+ inline bool IsPinned() const { return pinned_; }
+
+ private:
+ friend class PinnableSlice4Test;
+ std::string self_space_;
+ std::string* buf_;
+ bool pinned_ = false;
+};
+
+// A set of Slices that are virtually concatenated together. 'parts' points
+// to an array of Slices. The number of elements in the array is 'num_parts'.
+struct SliceParts {
+ SliceParts(const Slice* _parts, int _num_parts)
+ : parts(_parts), num_parts(_num_parts) {}
+ SliceParts() : parts(nullptr), num_parts(0) {} // empty: no parts
+
+ const Slice* parts; // array of Slices
+ int num_parts; // number of elements in the array
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+ if (x.size() != y.size()) return false;
+ return memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) { return !operator==(x, y); }
+
+inline int Slice::compare(const Slice& b) const {
+ assert(data_ != nullptr && b.data_ != nullptr);
+ const size_t shared_len = (b.size_ > size_) ? size_ : b.size_;
+ const int prefix_order = memcmp(data_, b.data_, shared_len);
+ if (prefix_order != 0) {
+ return prefix_order;
+ }
+ if (size_ == b.size_) {
+ return 0;
+ }
+ return (size_ < b.size_) ? -1 : +1;
+}
+
+inline size_t Slice::difference_offset(const Slice& b) const {
+ const size_t limit = (b.size_ > size_) ? size_ : b.size_;
+ size_t pos = 0;
+ while (pos < limit && data_[pos] == b.data_[pos]) {
+ ++pos;
+ }
+ return pos;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h
new file mode 100644
index 000000000..54f61f9d2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice_transform.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice. It is not required that every slice
+// belong to the domain and/or range of a function. Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+/*
+ * A SliceTransform is a generic pluggable way of transforming one string
+ * to another. Its primary use-case is in configuring rocksdb
+ * to store prefix blooms by setting prefix_extractor in
+ * ColumnFamilyOptions.
+ */
+class SliceTransform {
+ public:
+ virtual ~SliceTransform(){};
+
+ // Return the name of this transformation.
+ virtual const char* Name() const = 0;
+
+ // Extract a prefix from a specified key. This method is called when
+ // a key is inserted into the db, and the returned slice is used to
+ // create a bloom filter.
+ virtual Slice Transform(const Slice& key) const = 0;
+
+ // Determine whether the specified key is compatible with the logic
+ // specified in the Transform method. This method is invoked for every
+ // key that is inserted into the db. If this method returns true,
+ // then Transform is called to translate the key to its prefix and
+ // that returned prefix is inserted into the bloom filter. If this
+ // method returns false, then the call to Transform is skipped and
+ // no prefix is inserted into the bloom filters.
+ //
+ // For example, if the Transform method operates on a fixed length
+ // prefix of size 4, then an invocation to InDomain("abc") returns
+ // false because the specified key length(3) is shorter than the
+ // prefix size of 4.
+ //
+ // Wiki documentation here:
+ // https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+ //
+ virtual bool InDomain(const Slice& key) const = 0;
+
+ // This is currently not used and remains here for backward compatibility.
+ virtual bool InRange(const Slice& /*dst*/) const { return false; }
+
+ // Some SliceTransforms will have a full length which can be used to
+ // determine if two keys are consecutive. Can be disabled by always
+ // returning false
+ virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
+
+ // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+ //
+ // This function is not used by RocksDB, but for users. If users pass
+ // Options by string to RocksDB, they might not know what prefix extractor
+ // they are using. This function helps users determine:
+ // if they want to iterate all keys prefixed by `prefix`, whether it is
+ // safe to use prefix bloom filter and seek to key `prefix`.
+ // If this function returns true, this means a user can Seek() to a prefix
+ // using the bloom filter. Otherwise, user needs to skip the bloom filter
+ // by setting ReadOptions.total_order_seek = true.
+ //
+ // Here is an example: Suppose we implement a slice transform that returns
+ // the first part of the string after splitting it using delimiter ",":
+ // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
+ // bloom filter using it, all slices matching "abc,.*" will be extracted
+ // to "abc,", so any SST file or memtable containing any of those keys
+ // will not be filtered out.
+ // 2. SameResultWhenAppended("abc") should return false. A user is not
+ // guaranteed to see all the keys matching "abc.*" if the user seeks to
+ // "abc" against a DB with the same setting. If one SST file only contains
+ // "abcd,e", the file can be filtered out and the key will be invisible.
+ //
+ // i.e., an implementation always returning false is safe.
+ virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
+ return false;
+ }
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h
new file mode 100644
index 000000000..6a7212d60
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/snapshot.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+// Abstract handle to a particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+//
+// To Create a Snapshot, call DB::GetSnapshot().
+// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot).
+class Snapshot {
+ public:
+ // returns Snapshot's sequence number
+ virtual SequenceNumber GetSequenceNumber() const = 0;
+
+ protected:
+ virtual ~Snapshot();
+};
+
+// Simple RAII wrapper class for Snapshot.
+// Constructing this object will create a snapshot. Destructing will
+// release the snapshot.
+class ManagedSnapshot {
+ public:
+ explicit ManagedSnapshot(DB* db);
+
+ // Instead of creating a snapshot, take ownership of the input snapshot.
+ ManagedSnapshot(DB* db, const Snapshot* _snapshot);
+
+ ~ManagedSnapshot();
+
+ const Snapshot* snapshot();
+
+ private:
+ DB* db_;
+ const Snapshot* snapshot_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h
new file mode 100644
index 000000000..ecb692e31
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SSTDumpTool {
+ public:
+ int Run(int argc, char** argv, Options options = Options());
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h
new file mode 100644
index 000000000..92d0bbbf8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_manager.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST files in the DB and control their
+// deletion rate.
+// All SstFileManager public functions are thread-safe.
+// SstFileManager is not extensible.
+class SstFileManager {
+ public:
+ virtual ~SstFileManager() {}
+
+ // Update the maximum allowed space that should be used by RocksDB, if
+ // the total size of the SST files exceeds max_allowed_space, writes to
+ // RocksDB will fail.
+ //
+ // Setting max_allowed_space to 0 will disable this feature; maximum allowed
+ // space will be infinite (Default value).
+ //
+ // thread-safe.
+ virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
+
+ // Set the amount of buffer room each compaction should be able to leave.
+ // In other words, at its maximum disk space consumption, the compaction
+ // should still leave compaction_buffer_size available on the disk so that
+ // other background functions may continue, such as logging and flushing.
+ virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
+ // Return true if the total size of SST files exceeded the maximum allowed
+ // space usage.
+ //
+ // thread-safe.
+ virtual bool IsMaxAllowedSpaceReached() = 0;
+
+ // Returns true if the total size of SST files as well as estimated size
+ // of ongoing compactions exceeds the maximum allowed space usage.
+ virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
+ // Return the total size of all tracked files.
+ // thread-safe
+ virtual uint64_t GetTotalSize() = 0;
+
+ // Return a map containing all tracked files and their corresponding sizes.
+ // thread-safe
+ virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
+
+ // Return delete rate limit in bytes per second.
+ // thread-safe
+ virtual int64_t GetDeleteRateBytesPerSecond() = 0;
+
+ // Update the delete rate limit in bytes per second.
+ // zero means disable delete rate limiting and delete files immediately
+ // thread-safe
+ virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ // thread-safe
+ virtual double GetMaxTrashDBRatio() = 0;
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ // thread-safe
+ virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+ // Return the total size of trash files
+ // thread-safe
+ virtual uint64_t GetTotalTrashSize() = 0;
+};
+
+// Create a new SstFileManager that can be shared among multiple RocksDB
+// instances to track SST files and control their deletion rate.
+// Even though SstFileManager doesn't track WAL files, it still controls
+// their deletion rate.
+//
+// @param env: Pointer to Env object, please see "rocksdb/env.h".
+// @param fs: Pointer to FileSystem object, please see "rocksdb/file_system.h".
+// @param info_log: If not nullptr, info_log will be used to log errors.
+//
+// == Deletion rate limiting specific arguments ==
+// @param trash_dir: Deprecated, this argument has no effect
+// @param rate_bytes_per_sec: How many bytes should be deleted per second, If
+// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
+// in 1 second, we will wait for another 3 seconds before we delete other
+// files, Set to 0 to disable deletion rate limiting.
+// This option also affect the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument has no effect, but
+// if the user provides trash_dir we will schedule deletes for files in the dir
+// @param status: If not nullptr, status will contain any errors that happened
+// during creating the missing trash_dir or deleting existing files in trash.
+// @param max_trash_db_ratio: If the trash size constitutes more than this
+// fraction of the total DB size we will start deleting new files passed to
+// DeleteScheduler immediately
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+// chunk, ftruncate the file by this size each time, rather than dropping the
+// whole file. 0 means to always delete the whole file. If the file has more
+// than one linked name, the file will be deleted as a whole. Either way,
+// `rate_bytes_per_sec` will be respected. NOTE that with this option,
+// files already renamed as a trash may be partial, so users should not
+// directly recover them without checking.
+extern SstFileManager* NewSstFileManager(
+ Env* env, std::shared_ptr<FileSystem> fs,
+ std::shared_ptr<Logger> info_log = nullptr,
+ const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0,
+ bool delete_existing_trash = true, Status* status = nullptr,
+ double max_trash_db_ratio = 0.25,
+ uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+// Same as above, but takes a pointer to a legacy Env object, instead of
+// Env and FileSystem objects
+extern SstFileManager* NewSstFileManager(
+ Env* env, std::shared_ptr<Logger> info_log = nullptr,
+ std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
+ bool delete_existing_trash = true, Status* status = nullptr,
+ double max_trash_db_ratio = 0.25,
+ uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h
new file mode 100644
index 000000000..4b8642480
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_reader.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SstFileReader is used to read sst files that are generated by DB or
+// SstFileWriter.
+class SstFileReader {
+ public:
+ SstFileReader(const Options& options);
+
+ ~SstFileReader();
+
+ // Prepares to read from the file located at "file_path".
+ Status Open(const std::string& file_path);
+
+ // Returns a new iterator over the table contents.
+ // Most read options provide the same control as we read from DB.
+ // If "snapshot" is nullptr, the iterator returns only the latest keys.
+ Iterator* NewIterator(const ReadOptions& options);
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const;
+
+ // Verifies whether there is corruption in this table.
+ Status VerifyChecksum(const ReadOptions& /*read_options*/);
+
+ Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+ private:
+ struct Rep;
+ std::unique_ptr<Rep> rep_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h
new file mode 100644
index 000000000..e83383fea
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_writer.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+
+// ExternalSstFileInfo includes information about sst files created
+// using SstFileWriter.
+struct ExternalSstFileInfo {
+ ExternalSstFileInfo()
+ : file_path(""),
+ smallest_key(""),
+ largest_key(""),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ sequence_number(0),
+ file_size(0),
+ num_entries(0),
+ num_range_del_entries(0),
+ version(0) {}
+
+ ExternalSstFileInfo(const std::string& _file_path,
+ const std::string& _smallest_key,
+ const std::string& _largest_key,
+ SequenceNumber _sequence_number, uint64_t _file_size,
+ int32_t _num_entries, int32_t _version)
+ : file_path(_file_path),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key),
+ smallest_range_del_key(""),
+ largest_range_del_key(""),
+ sequence_number(_sequence_number),
+ file_size(_file_size),
+ num_entries(_num_entries),
+ num_range_del_entries(0),
+ version(_version) {}
+
+ std::string file_path; // external sst file path
+ std::string smallest_key; // smallest user key in file
+ std::string largest_key; // largest user key in file
+ std::string
+ smallest_range_del_key; // smallest range deletion user key in file
+ std::string largest_range_del_key; // largest range deletion user key in file
+ SequenceNumber sequence_number; // sequence number of all keys in file
+ uint64_t file_size; // file size in bytes
+ uint64_t num_entries; // number of entries in file
+ uint64_t num_range_del_entries; // number of range deletion entries in file
+ int32_t version; // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to database later
+// All keys in files generated by SstFileWriter will have sequence number = 0.
+class SstFileWriter {
+ public:
+ // User can pass `column_family` to specify that the generated file will
+ // be ingested into this column_family, note that passing nullptr means that
+ // the column_family is unknown.
+ // If invalidate_page_cache is set to true, SstFileWriter will give the OS a
+ // hint that this file's pages are not needed every time we write 1MB to the
+ // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be
+ // passed.
+ SstFileWriter(const EnvOptions& env_options, const Options& options,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool invalidate_page_cache = true,
+ Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+ bool skip_filters = false)
+ : SstFileWriter(env_options, options, options.comparator, column_family,
+ invalidate_page_cache, io_priority, skip_filters) {}
+
+ // Deprecated API
+ SstFileWriter(const EnvOptions& env_options, const Options& options,
+ const Comparator* user_comparator,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool invalidate_page_cache = true,
+ Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+ bool skip_filters = false);
+
+ ~SstFileWriter();
+
+ // Prepare SstFileWriter to write into file located at "file_path".
+ Status Open(const std::string& file_path);
+
+ // Add a Put key with value to currently opened file (deprecated)
+ // REQUIRES: key is after any previously added key according to comparator.
+ ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value);
+
+ // Add a Put key with value to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ Status Put(const Slice& user_key, const Slice& value);
+
+ // Add a Merge key with value to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ Status Merge(const Slice& user_key, const Slice& value);
+
+ // Add a deletion key to currently opened file
+ // REQUIRES: key is after any previously added key according to comparator.
+ Status Delete(const Slice& user_key);
+
+ // Add a range deletion tombstone to currently opened file
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key);
+
+ // Finalize writing to sst file and close file.
+ //
+ // An optional ExternalSstFileInfo pointer can be passed to the function
+ // which will be populated with information about the created sst file.
+ Status Finish(ExternalSstFileInfo* file_info = nullptr);
+
+ // Return the current file size.
+ uint64_t FileSize();
+
+ private:
+ void InvalidatePageCache(bool closing);
+ struct Rep;
+ std::unique_ptr<Rep> rep_;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h
new file mode 100644
index 000000000..3bda6d718
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/statistics.h
@@ -0,0 +1,548 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * Keep adding tickers here.
+ * 1. Any ticker should be added before TICKER_ENUM_MAX.
+ * 2. Add a readable string in TickersNameMap below for the newly added ticker.
+ * 3. Add a corresponding enum value to TickerType.java in the java API
+ * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType
+ * and toCppTickers
+ */
+enum Tickers : uint32_t {
+ // total block cache misses
+ // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+ // BLOCK_CACHE_FILTER_MISS +
+ // BLOCK_CACHE_DATA_MISS;
+ BLOCK_CACHE_MISS = 0,
+ // total block cache hit
+ // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+ // BLOCK_CACHE_FILTER_HIT +
+ // BLOCK_CACHE_DATA_HIT;
+ BLOCK_CACHE_HIT,
+ // # of blocks added to block cache.
+ BLOCK_CACHE_ADD,
+ // # of failures when adding blocks to block cache.
+ BLOCK_CACHE_ADD_FAILURES,
+ // # of times cache miss when accessing index block from block cache.
+ BLOCK_CACHE_INDEX_MISS,
+ // # of times cache hit when accessing index block from block cache.
+ BLOCK_CACHE_INDEX_HIT,
+ // # of index blocks added to block cache.
+ BLOCK_CACHE_INDEX_ADD,
+ // # of bytes of index blocks inserted into cache
+ BLOCK_CACHE_INDEX_BYTES_INSERT,
+ // # of bytes of index block erased from cache
+ BLOCK_CACHE_INDEX_BYTES_EVICT,
+ // # of times cache miss when accessing filter block from block cache.
+ BLOCK_CACHE_FILTER_MISS,
+ // # of times cache hit when accessing filter block from block cache.
+ BLOCK_CACHE_FILTER_HIT,
+ // # of filter blocks added to block cache.
+ BLOCK_CACHE_FILTER_ADD,
+ // # of bytes of bloom filter blocks inserted into cache
+ BLOCK_CACHE_FILTER_BYTES_INSERT,
+ // # of bytes of bloom filter block erased from cache
+ BLOCK_CACHE_FILTER_BYTES_EVICT,
+ // # of times cache miss when accessing data block from block cache.
+ BLOCK_CACHE_DATA_MISS,
+ // # of times cache hit when accessing data block from block cache.
+ BLOCK_CACHE_DATA_HIT,
+ // # of data blocks added to block cache.
+ BLOCK_CACHE_DATA_ADD,
+ // # of bytes of data blocks inserted into cache
+ BLOCK_CACHE_DATA_BYTES_INSERT,
+ // # of bytes read from cache.
+ BLOCK_CACHE_BYTES_READ,
+ // # of bytes written into cache.
+ BLOCK_CACHE_BYTES_WRITE,
+
+ // # of times bloom filter has avoided file reads, i.e., negatives.
+ BLOOM_FILTER_USEFUL,
+ // # of times bloom FullFilter has not avoided the reads.
+ BLOOM_FILTER_FULL_POSITIVE,
+ // # of times bloom FullFilter has not avoided the reads and data actually
+ // exist.
+ BLOOM_FILTER_FULL_TRUE_POSITIVE,
+
+ BLOOM_FILTER_MICROS,
+
+ // # persistent cache hit
+ PERSISTENT_CACHE_HIT,
+ // # persistent cache miss
+ PERSISTENT_CACHE_MISS,
+
+ // # total simulation block cache hits
+ SIM_BLOCK_CACHE_HIT,
+ // # total simulation block cache misses
+ SIM_BLOCK_CACHE_MISS,
+
+ // # of memtable hits.
+ MEMTABLE_HIT,
+ // # of memtable misses.
+ MEMTABLE_MISS,
+
+ // # of Get() queries served by L0
+ GET_HIT_L0,
+ // # of Get() queries served by L1
+ GET_HIT_L1,
+ // # of Get() queries served by L2 and up
+ GET_HIT_L2_AND_UP,
+
+ /**
+ * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+ * There are 4 reasons currently.
+ */
+ COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
+ // Also includes keys dropped for range del.
+ COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
+ COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone.
+ COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
+ COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted.
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ // If a compaction was cancelled in sfm to prevent ENOSPC
+ COMPACTION_CANCELLED,
+
+ // Number of keys written to the database via the Put and Write calls
+ NUMBER_KEYS_WRITTEN,
+ // Number of Keys read,
+ NUMBER_KEYS_READ,
+ // Number of keys updated, if inplace update is enabled
+ NUMBER_KEYS_UPDATED,
+ // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+ // DB::Merge(), and DB::Write().
+ BYTES_WRITTEN,
+ // The number of uncompressed bytes read from DB::Get(). It could be
+ // either from memtables, cache, or table files.
+ // For the number of logical bytes read from DB::MultiGet(),
+ // please use NUMBER_MULTIGET_BYTES_READ.
+ BYTES_READ,
+ // The number of calls to seek/next/prev
+ NUMBER_DB_SEEK,
+ NUMBER_DB_NEXT,
+ NUMBER_DB_PREV,
+ // The number of calls to seek/next/prev that returned data
+ NUMBER_DB_SEEK_FOUND,
+ NUMBER_DB_NEXT_FOUND,
+ NUMBER_DB_PREV_FOUND,
+ // The number of uncompressed bytes read from an iterator.
+ // Includes size of key and value.
+ ITER_BYTES_READ,
+ NO_FILE_CLOSES,
+ NO_FILE_OPENS,
+ NO_FILE_ERRORS,
+ // DEPRECATED Time system had to wait to do L0-L1 compactions
+ STALL_L0_SLOWDOWN_MICROS,
+ // DEPRECATED Time system had to wait to move memtable to L1.
+ STALL_MEMTABLE_COMPACTION_MICROS,
+ // DEPRECATED write throttle because of too many files in L0
+ STALL_L0_NUM_FILES_MICROS,
+ // Writer has to wait for compaction or flush to finish.
+ STALL_MICROS,
+ // The wait time for db mutex.
+ // Disabled by default. To enable it set stats level to kAll
+ DB_MUTEX_WAIT_MICROS,
+ RATE_LIMIT_DELAY_MILLIS,
+ // DEPRECATED number of iterators currently open
+ NO_ITERATORS,
+
+ // Number of MultiGet calls, keys read, and bytes read
+ NUMBER_MULTIGET_CALLS,
+ NUMBER_MULTIGET_KEYS_READ,
+ NUMBER_MULTIGET_BYTES_READ,
+
+ // Number of delete records that were not required to be
+ // written to storage because key does not exist
+ NUMBER_FILTERED_DELETES,
+ NUMBER_MERGE_FAILURES,
+
+ // number of times bloom was checked before creating iterator on a
+ // file, and the number of times the check was useful in avoiding
+ // iterator creation (and thus likely IOPs).
+ BLOOM_FILTER_PREFIX_CHECKED,
+ BLOOM_FILTER_PREFIX_USEFUL,
+
+ // Number of times we had to reseek inside an iteration to skip
+ // over large number of keys with same userkey.
+ NUMBER_OF_RESEEKS_IN_ITERATION,
+
+ // Record the number of calls to GetUpdatesSince. Useful to keep track of
+ // transaction log iterator refreshes
+ GET_UPDATES_SINCE_CALLS,
+ BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
+ BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
+ // Number of blocks added to compressed block cache
+ BLOCK_CACHE_COMPRESSED_ADD,
+ // Number of failures when adding blocks to compressed block cache
+ BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+ WAL_FILE_SYNCED, // Number of times WAL sync is done
+ WAL_FILE_BYTES, // Number of bytes written to WAL
+
+ // Writes can be processed by requesting thread or by the thread at the
+ // head of the writers queue.
+ WRITE_DONE_BY_SELF,
+ WRITE_DONE_BY_OTHER, // Equivalent to writes done for others
+ WRITE_TIMEDOUT, // Number of writes ending up with timed-out.
+ WRITE_WITH_WAL, // Number of Write calls that request WAL
+ COMPACT_READ_BYTES, // Bytes read during compaction
+ COMPACT_WRITE_BYTES, // Bytes written during compaction
+ FLUSH_WRITE_BYTES, // Bytes written during flush
+
+ // Number of table's properties loaded directly from file, without creating
+ // table reader object.
+ NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+ NUMBER_SUPERVERSION_ACQUIRES,
+ NUMBER_SUPERVERSION_RELEASES,
+ NUMBER_SUPERVERSION_CLEANUPS,
+
+ // # of compressions/decompressions executed
+ NUMBER_BLOCK_COMPRESSED,
+ NUMBER_BLOCK_DECOMPRESSED,
+
+ NUMBER_BLOCK_NOT_COMPRESSED,
+ MERGE_OPERATION_TOTAL_TIME,
+ FILTER_OPERATION_TOTAL_TIME,
+
+ // Row cache.
+ ROW_CACHE_HIT,
+ ROW_CACHE_MISS,
+
+ // Read amplification statistics.
+ // Read amplification can be calculated using this formula
+ // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+ //
+ // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
+ READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used.
+ READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks.
+
+ // Number of refill intervals where rate limiter's bytes are fully consumed.
+ NUMBER_RATE_LIMITER_DRAINS,
+
+ // Number of internal keys skipped by Iterator
+ NUMBER_ITER_SKIP,
+
+ // BlobDB specific stats
+ // # of Put/PutTTL/PutUntil to BlobDB.
+ BLOB_DB_NUM_PUT,
+ // # of Write to BlobDB.
+ BLOB_DB_NUM_WRITE,
+ // # of Get to BlobDB.
+ BLOB_DB_NUM_GET,
+ // # of MultiGet to BlobDB.
+ BLOB_DB_NUM_MULTIGET,
+ // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
+ BLOB_DB_NUM_SEEK,
+ // # of Next to BlobDB iterator.
+ BLOB_DB_NUM_NEXT,
+ // # of Prev to BlobDB iterator.
+ BLOB_DB_NUM_PREV,
+ // # of keys written to BlobDB.
+ BLOB_DB_NUM_KEYS_WRITTEN,
+ // # of keys read from BlobDB.
+ BLOB_DB_NUM_KEYS_READ,
+ // # of bytes (key + value) written to BlobDB.
+ BLOB_DB_BYTES_WRITTEN,
+ // # of bytes (keys + value) read from BlobDB.
+ BLOB_DB_BYTES_READ,
+ // # of keys written by BlobDB as non-TTL inlined value.
+ BLOB_DB_WRITE_INLINED,
+ // # of keys written by BlobDB as TTL inlined value.
+ BLOB_DB_WRITE_INLINED_TTL,
+ // # of keys written by BlobDB as non-TTL blob value.
+ BLOB_DB_WRITE_BLOB,
+ // # of keys written by BlobDB as TTL blob value.
+ BLOB_DB_WRITE_BLOB_TTL,
+ // # of bytes written to blob file.
+ BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ // # of bytes read from blob file.
+ BLOB_DB_BLOB_FILE_BYTES_READ,
+ // # of times a blob file is synced.
+ BLOB_DB_BLOB_FILE_SYNCED,
+ // # of blob index evicted from base DB by BlobDB compaction filter because
+ // of expiration.
+ BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
+ // size of blob index evicted from base DB by BlobDB compaction filter
+ // because of expiration.
+ BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
+ // # of blob index evicted from base DB by BlobDB compaction filter because
+ // of corresponding file deleted.
+ BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
+ // size of blob index evicted from base DB by BlobDB compaction filter
+ // because of corresponding file deleted.
+ BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
+ // # of blob files that were obsoleted by garbage collection.
+ BLOB_DB_GC_NUM_FILES,
+ // # of blob files generated by garbage collection.
+ BLOB_DB_GC_NUM_NEW_FILES,
+ // # of BlobDB garbage collection failures.
+ BLOB_DB_GC_FAILURES,
+ // # of keys dropped by BlobDB garbage collection because they had been
+ // overwritten. DEPRECATED.
+ BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
+ // # of keys dropped by BlobDB garbage collection because of expiration.
+ // DEPRECATED.
+ BLOB_DB_GC_NUM_KEYS_EXPIRED,
+ // # of keys relocated to new blob file by garbage collection.
+ BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ // # of bytes dropped by BlobDB garbage collection because they had been
+ // overwritten. DEPRECATED.
+ BLOB_DB_GC_BYTES_OVERWRITTEN,
+ // # of bytes dropped by BlobDB garbage collection because of expiration.
+ // DEPRECATED.
+ BLOB_DB_GC_BYTES_EXPIRED,
+ // # of bytes relocated to new blob file by garbage collection.
+ BLOB_DB_GC_BYTES_RELOCATED,
+ // # of blob files evicted because BlobDB is full.
+ BLOB_DB_FIFO_NUM_FILES_EVICTED,
+ // # of keys in the blob files evicted because BlobDB is full.
+ BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+ // # of bytes in the blob files evicted because BlobDB is full.
+ BLOB_DB_FIFO_BYTES_EVICTED,
+
+ // These counters indicate a performance issue in WritePrepared transactions.
+ // We should not see them ticking much.
+ // # of times prepare_mutex_ is acquired in the fast path.
+ TXN_PREPARE_MUTEX_OVERHEAD,
+ // # of times old_commit_map_mutex_ is acquired in the fast path.
+ TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ // # of times we checked a batch for duplicate keys.
+ TXN_DUPLICATE_KEY_OVERHEAD,
+ // # of times snapshot_mutex_ is acquired in the fast path.
+ TXN_SNAPSHOT_MUTEX_OVERHEAD,
+ // # of times ::Get returned TryAgain due to expired snapshot seq
+ TXN_GET_TRY_AGAIN,
+
+ // Number of keys actually found in MultiGet calls (vs number requested by
+ // caller)
+ // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
+ NUMBER_MULTIGET_KEYS_FOUND,
+
+ NO_ITERATOR_CREATED, // number of iterators created
+ NO_ITERATOR_DELETED, // number of iterators deleted
+
+ BLOCK_CACHE_COMPRESSION_DICT_MISS,
+ BLOCK_CACHE_COMPRESSION_DICT_HIT,
+ BLOCK_CACHE_COMPRESSION_DICT_ADD,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
+ TICKER_ENUM_MAX
+};
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have value less than HISTOGRAM_ENUM_MAX
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
+ * Add a string representation in HistogramsNameMap below
+ * And increment HISTOGRAM_ENUM_MAX
+ * Add a corresponding enum value to HistogramType.java in the java API
+ */
+enum Histograms : uint32_t {
+ DB_GET = 0,
+ DB_WRITE,
+ COMPACTION_TIME,
+ COMPACTION_CPU_TIME,
+ SUBCOMPACTION_SETUP_TIME,
+ TABLE_SYNC_MICROS,
+ COMPACTION_OUTFILE_SYNC_MICROS,
+ WAL_FILE_SYNC_MICROS,
+ MANIFEST_FILE_SYNC_MICROS,
+ // TIME SPENT IN IO DURING TABLE OPEN
+ TABLE_OPEN_IO_MICROS,
+ DB_MULTIGET,
+ READ_BLOCK_COMPACTION_MICROS,
+ READ_BLOCK_GET_MICROS,
+ WRITE_RAW_BLOCK_MICROS,
+ STALL_L0_SLOWDOWN_COUNT,
+ STALL_MEMTABLE_COMPACTION_COUNT,
+ STALL_L0_NUM_FILES_COUNT,
+ HARD_RATE_LIMIT_DELAY_COUNT,
+ SOFT_RATE_LIMIT_DELAY_COUNT,
+ NUM_FILES_IN_SINGLE_COMPACTION,
+ DB_SEEK,
+ WRITE_STALL,
+ SST_READ_MICROS,
+ // The number of subcompactions actually scheduled during a compaction
+ NUM_SUBCOMPACTIONS_SCHEDULED,
+ // Value size distribution in each operation
+ BYTES_PER_READ,
+ BYTES_PER_WRITE,
+ BYTES_PER_MULTIGET,
+
+ // number of bytes compressed/decompressed
+ // number of bytes is when uncompressed; i.e. before/after respectively
+ BYTES_COMPRESSED,
+ BYTES_DECOMPRESSED,
+ COMPRESSION_TIMES_NANOS,
+ DECOMPRESSION_TIMES_NANOS,
+ // Number of merge operands passed to the merge operator in user read
+ // requests.
+ READ_NUM_MERGE_OPERANDS,
+
+ // BlobDB specific stats
+ // Size of keys written to BlobDB.
+ BLOB_DB_KEY_SIZE,
+ // Size of values written to BlobDB.
+ BLOB_DB_VALUE_SIZE,
+ // BlobDB Put/PutWithTTL/PutUntil/Write latency.
+ BLOB_DB_WRITE_MICROS,
+ // BlobDB Get latency.
+ BLOB_DB_GET_MICROS,
+ // BlobDB MultiGet latency.
+ BLOB_DB_MULTIGET_MICROS,
+ // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
+ BLOB_DB_SEEK_MICROS,
+ // BlobDB Next latency.
+ BLOB_DB_NEXT_MICROS,
+ // BlobDB Prev latency.
+ BLOB_DB_PREV_MICROS,
+ // Blob file write latency.
+ BLOB_DB_BLOB_FILE_WRITE_MICROS,
+ // Blob file read latency.
+ BLOB_DB_BLOB_FILE_READ_MICROS,
+ // Blob file sync latency.
+ BLOB_DB_BLOB_FILE_SYNC_MICROS,
+ // BlobDB garbage collection time. DEPRECATED.
+ BLOB_DB_GC_MICROS,
+ // BlobDB compression time.
+ BLOB_DB_COMPRESSION_MICROS,
+ // BlobDB decompression time.
+ BLOB_DB_DECOMPRESSION_MICROS,
+ // Time spent flushing memtable to disk
+ FLUSH_TIME,
+ SST_BATCH_SIZE,
+
+ HISTOGRAM_ENUM_MAX,
+};
+
+extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
+
+struct HistogramData {
+ double median;
+ double percentile95;
+ double percentile99;
+ double average;
+ double standard_deviation;
+ // zero-initialize new members since old Statistics::histogramData()
+ // implementations won't write them.
+ double max = 0.0;
+ uint64_t count = 0;
+ uint64_t sum = 0;
+ double min = 0.0;
+};
+
+// StatsLevel can be used to reduce statistics overhead by skipping certain
+// types of stats in the stats collection process.
+// Usage:
+// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+enum StatsLevel : uint8_t {
+ // Disable timer stats, and skip histogram stats
+ kExceptHistogramOrTimers,
+ // Skip timer stats
+ kExceptTimers,
+ // Collect all stats except time inside mutex lock AND time spent on
+ // compression.
+ kExceptDetailedTimers,
+ // Collect all stats except the counters requiring to get time inside the
+ // mutex lock.
+ kExceptTimeForMutex,
+ // Collect all stats, including measuring duration of mutex operations.
+ // If getting time is expensive on the platform to run, it can
+ // reduce scalability to more threads, especially for writes.
+ kAll,
+};
+
+// Analyze the performance of a db by providing cumulative stats over time.
+// Usage:
+// Options options;
+// options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+// Status s = DB::Open(options, kDBPath, &db);
+// ...
+// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+// HistogramData hist;
+// options.statistics->histogramData(FLUSH_TIME, &hist);
+class Statistics {
+ public:
+ virtual ~Statistics() {}
+ static const char* Type() { return "Statistics"; }
+ virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+ virtual void histogramData(uint32_t type,
+ HistogramData* const data) const = 0;
+ virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
+ virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
+ virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+ virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
+ virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
+ if (get_stats_level() <= StatsLevel::kExceptTimers) {
+ return;
+ }
+ recordInHistogram(histogramType, time);
+ }
+ // The function is here only for backward compatibility reason.
+ // Users implementing their own Statistics class should override
+ // recordInHistogram() instead and leave measureTime() as it is.
+ virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
+ // This is not supposed to be called.
+ assert(false);
+ }
+ virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
+ // measureTime() is the old and inaccurate function name.
+ // To stay backward compatible, if users implement their own
+ // statistics that override measureTime() but don't override
+ // this function, we forward to measureTime().
+ measureTime(histogramType, time);
+ }
+
+ // Resets all ticker and histogram stats
+ virtual Status Reset() { return Status::NotSupported("Not implemented"); }
+
+ // String representation of the statistic object.
+ virtual std::string ToString() const {
+ // Do nothing by default
+ return std::string("ToString(): not implemented");
+ }
+
+ virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
+ // Do nothing by default
+ return false;
+ }
+
+ // Override this function to disable particular histogram collection
+ virtual bool HistEnabledForType(uint32_t type) const {
+ return type < HISTOGRAM_ENUM_MAX;
+ }
+ void set_stats_level(StatsLevel sl) {
+ stats_level_.store(sl, std::memory_order_relaxed);
+ }
+ StatsLevel get_stats_level() const {
+ return stats_level_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h
new file mode 100644
index 000000000..4acaad26f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/stats_history.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// StatsHistoryIterator is the main interface for users to programmatically
+// access statistics snapshots that were automatically stored by RocksDB.
+// Depending on options, the stats can be in memory or on disk.
+// The stats snapshots are indexed by time that they were recorded, and each
+// stats snapshot contains individual stat name and value at the time of
+// recording.
+// Example:
+// std::unique_ptr<StatsHistoryIterator> stats_iter;
+// Status s = db->GetStatsHistory(0 /* start_time */,
+// env->NowMicros() /* end_time*/,
+// &stats_iter);
+// if (s.ok()) {
+// for (; stats_iter->Valid(); stats_iter->Next()) {
+// uint64_t stats_time = stats_iter->GetStatsTime();
+// const std::map<std::string, uint64_t>& stats_map =
+// stats_iter->GetStatsMap();
+// process(stats_time, stats_map);
+// }
+// }
+class StatsHistoryIterator {
+ public:
+ StatsHistoryIterator() {}
+ virtual ~StatsHistoryIterator() {}
+
+ virtual bool Valid() const = 0;
+
+ // Moves to the next stats history record. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Return the time stamp (in seconds) when stats history is recorded.
+ // REQUIRES: Valid()
+ virtual uint64_t GetStatsTime() const = 0;
+
+ virtual int GetFormatVersion() const { return -1; }
+
+ // Return the current stats history as an std::map which specifies the
+ // mapping from stats name to stats value. The underlying storage
+ // for the returned map is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ virtual Status status() const = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
new file mode 100644
index 000000000..cf62512c3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -0,0 +1,386 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status {
+ public:
+ // Create a success status.
+ Status() : code_(kOk), subcode_(kNone), sev_(kNoError), state_(nullptr) {}
+ ~Status() { delete[] state_; }
+
+ // Copy the specified status.
+ Status(const Status& s);
+ Status& operator=(const Status& s);
+ Status(Status&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ ;
+ Status& operator=(Status&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ ;
+ bool operator==(const Status& rhs) const;
+ bool operator!=(const Status& rhs) const;
+
+ enum Code : unsigned char {
+ kOk = 0,
+ kNotFound = 1,
+ kCorruption = 2,
+ kNotSupported = 3,
+ kInvalidArgument = 4,
+ kIOError = 5,
+ kMergeInProgress = 6,
+ kIncomplete = 7,
+ kShutdownInProgress = 8,
+ kTimedOut = 9,
+ kAborted = 10,
+ kBusy = 11,
+ kExpired = 12,
+ kTryAgain = 13,
+ kCompactionTooLarge = 14,
+ kColumnFamilyDropped = 15,
+ kMaxCode
+ };
+
+ Code code() const { return code_; }
+
+ enum SubCode : unsigned char {
+ kNone = 0,
+ kMutexTimeout = 1,
+ kLockTimeout = 2,
+ kLockLimit = 3,
+ kNoSpace = 4,
+ kDeadlock = 5,
+ kStaleFile = 6,
+ kMemoryLimit = 7,
+ kSpaceLimit = 8,
+ kPathNotFound = 9,
+ KMergeOperandsInsufficientCapacity = 10,
+ kManualCompactionPaused = 11,
+ kMaxSubCode
+ };
+
+ SubCode subcode() const { return subcode_; }
+
+ enum Severity : unsigned char {
+ kNoError = 0,
+ kSoftError = 1,
+ kHardError = 2,
+ kFatalError = 3,
+ kUnrecoverableError = 4,
+ kMaxSeverity
+ };
+
+ Status(const Status& s, Severity sev);
+ Severity severity() const { return sev_; }
+
+ // Returns a C style string indicating the message of the Status
+ const char* getState() const { return state_; }
+
+ // Return a success status.
+ static Status OK() { return Status(); }
+
+ // Return error status of an appropriate type.
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotFound, msg, msg2);
+ }
+ // Fast path for not found without malloc;
+ static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); }
+
+ static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kCorruption, msg, msg2);
+ }
+ static Status Corruption(SubCode msg = kNone) {
+ return Status(kCorruption, msg);
+ }
+
+ static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotSupported, msg, msg2);
+ }
+ static Status NotSupported(SubCode msg = kNone) {
+ return Status(kNotSupported, msg);
+ }
+
+ static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kInvalidArgument, msg, msg2);
+ }
+ static Status InvalidArgument(SubCode msg = kNone) {
+ return Status(kInvalidArgument, msg);
+ }
+
+ static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, msg, msg2);
+ }
+ static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); }
+
+ static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kMergeInProgress, msg, msg2);
+ }
+ static Status MergeInProgress(SubCode msg = kNone) {
+ return Status(kMergeInProgress, msg);
+ }
+
+ static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIncomplete, msg, msg2);
+ }
+ static Status Incomplete(SubCode msg = kNone) {
+ return Status(kIncomplete, msg);
+ }
+
+ static Status ShutdownInProgress(SubCode msg = kNone) {
+ return Status(kShutdownInProgress, msg);
+ }
+ static Status ShutdownInProgress(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kShutdownInProgress, msg, msg2);
+ }
+ static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); }
+ static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kAborted, msg, msg2);
+ }
+
+ static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); }
+ static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kBusy, msg, msg2);
+ }
+
+ static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); }
+ static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kTimedOut, msg, msg2);
+ }
+
+ static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); }
+ static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kExpired, msg, msg2);
+ }
+
+ static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); }
+ static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kTryAgain, msg, msg2);
+ }
+
+ static Status CompactionTooLarge(SubCode msg = kNone) {
+ return Status(kCompactionTooLarge, msg);
+ }
+ static Status CompactionTooLarge(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kCompactionTooLarge, msg, msg2);
+ }
+
+ static Status ColumnFamilyDropped(SubCode msg = kNone) {
+ return Status(kColumnFamilyDropped, msg);
+ }
+
+ static Status ColumnFamilyDropped(const Slice& msg,
+ const Slice& msg2 = Slice()) {
+ return Status(kColumnFamilyDropped, msg, msg2);
+ }
+
+ static Status NoSpace() { return Status(kIOError, kNoSpace); }
+ static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kNoSpace, msg, msg2);
+ }
+
+ static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); }
+ static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kAborted, kMemoryLimit, msg, msg2);
+ }
+
+ static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); }
+ static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kSpaceLimit, msg, msg2);
+ }
+
+ static Status PathNotFound() { return Status(kIOError, kPathNotFound); }
+ static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, kPathNotFound, msg, msg2);
+ }
+
+ // Returns true iff the status indicates success.
+ bool ok() const { return code() == kOk; }
+
+ // Returns true iff the status indicates a NotFound error.
+ bool IsNotFound() const { return code() == kNotFound; }
+
+ // Returns true iff the status indicates a Corruption error.
+ bool IsCorruption() const { return code() == kCorruption; }
+
+ // Returns true iff the status indicates a NotSupported error.
+ bool IsNotSupported() const { return code() == kNotSupported; }
+
+ // Returns true iff the status indicates an InvalidArgument error.
+ bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+ // Returns true iff the status indicates an IOError.
+ bool IsIOError() const { return code() == kIOError; }
+
+ // Returns true iff the status indicates a MergeInProgress error.
+ bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+ // Returns true iff the status indicates Incomplete
+ bool IsIncomplete() const { return code() == kIncomplete; }
+
+ // Returns true iff the status indicates Shutdown In progress
+ bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
+
+ bool IsTimedOut() const { return code() == kTimedOut; }
+
+ bool IsAborted() const { return code() == kAborted; }
+
+ bool IsLockLimit() const {
+ return code() == kAborted && subcode() == kLockLimit;
+ }
+
+ // Returns true iff the status indicates that a resource is Busy and
+ // temporarily could not be acquired.
+ bool IsBusy() const { return code() == kBusy; }
+
+ bool IsDeadlock() const { return code() == kBusy && subcode() == kDeadlock; }
+
+ // Returns true iff the status indicated that the operation has Expired.
+ bool IsExpired() const { return code() == kExpired; }
+
+ // Returns true iff the status indicates a TryAgain error.
+ // This usually means that the operation failed, but may succeed if
+ // re-attempted.
+ bool IsTryAgain() const { return code() == kTryAgain; }
+
+ // Returns true iff the status indicates the proposed compaction is too large
+ bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; }
+
+ // Returns true iff the status indicates Column Family Dropped
+ bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; }
+
+ // Returns true iff the status indicates a NoSpace error
+ // This is caused by an I/O error returning the specific "out of space"
+ // error condition. Stricto sensu, a NoSpace error is an I/O error
+ // with a specific subcode, enabling users to take the appropriate action
+ // if needed
+ bool IsNoSpace() const {
+ return (code() == kIOError) && (subcode() == kNoSpace);
+ }
+
+ // Returns true iff the status indicates a memory limit error. There may be
+ // cases where we limit the memory used in certain operations (eg. the size
+ // of a write batch) in order to avoid out of memory exceptions.
+ bool IsMemoryLimit() const {
+ return (code() == kAborted) && (subcode() == kMemoryLimit);
+ }
+
+ // Returns true iff the status indicates a PathNotFound error
+ // This is caused by an I/O error returning the specific "no such file or
+ // directory" error condition. A PathNotFound error is an I/O error with
+ // a specific subcode, enabling users to take appropriate action if necessary
+ bool IsPathNotFound() const {
+ return (code() == kIOError) && (subcode() == kPathNotFound);
+ }
+
+ // Returns true iff the status indicates manual compaction paused. This
+ // is caused by a call to PauseManualCompaction
+ bool IsManualCompactionPaused() const {
+ return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
+ }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ std::string ToString() const;
+
+ protected:
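+ // A nullptr state_ (which is always the case for OK) means the message
+ // is empty. Otherwise state_ points to an array holding the message,
+ // which the destructor releases with delete[].
+ // NOTE(review): getState() documents state_ as a C style string; the old
+ // layout "state_[0..3] == length, state_[4..] == message" looks stale.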
+ Code code_;
+ SubCode subcode_;
+ Severity sev_;
+ const char* state_;
+
+ explicit Status(Code _code, SubCode _subcode = kNone)
+ : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {}
+
+ Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2);
+ Status(Code _code, const Slice& msg, const Slice& msg2)
+ : Status(_code, kNone, msg, msg2) {}
+
+ static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s)
+ : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) {
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline Status::Status(const Status& s, Severity sev)
+ : code_(s.code_), subcode_(s.subcode_), sev_(sev) {
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline Status& Status::operator=(const Status& s) {
+ // Self-assignment guard: when this == &s nothing is copied, so state_ is
+ // never deleted and then read back from the same object.
+ if (this != &s) {
+ code_ = s.code_;
+ subcode_ = s.subcode_;
+ sev_ = s.sev_;
+ delete[] state_;
+ state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+ }
+ return *this;
+}
+
+inline Status::Status(Status&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+ : Status() {
+ *this = std::move(s);
+}
+
+inline Status& Status::operator=(Status&& s)
+#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900))
+ noexcept
+#endif
+{
+ if (this != &s) {
+ code_ = std::move(s.code_);
+ s.code_ = kOk;
+ subcode_ = std::move(s.subcode_);
+ s.subcode_ = kNone;
+ sev_ = std::move(s.sev_);
+ s.sev_ = kNoError;
+ delete[] state_;
+ state_ = nullptr;
+ std::swap(state_, s.state_);
+ }
+ return *this;
+}
+
+inline bool Status::operator==(const Status& rhs) const {
+ return (code_ == rhs.code_);
+}
+
+inline bool Status::operator!=(const Status& rhs) const {
+ return !(*this == rhs);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
new file mode 100644
index 000000000..fb5d67114
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -0,0 +1,607 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+// LevelDB, which was designed for storing data in hard disk or flash
+// device.
+// 2. Plain table: it is one of RocksDB's SST file formats, optimized
+// for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Block-based Table
+class FlushBlockPolicyFactory;
+class PersistentCache;
+class RandomAccessFile;
+struct TableReaderOptions;
+struct TableBuilderOptions;
+class TableBuilder;
+class TableReader;
+class WritableFileWriter;
+struct EnvOptions;
+struct Options;
+
+enum ChecksumType : char {
+ kNoChecksum = 0x0,
+ kCRC32c = 0x1,
+ kxxHash = 0x2,
+ kxxHash64 = 0x3,
+};
+
+// For advanced user only
+struct BlockBasedTableOptions {
+ // @flush_block_policy_factory creates the instances of flush block policy,
+ // which provides a configurable way to determine when to flush a block in
+ // the block based tables. If not set, table builder will use the default
+ // block flush policy, which cuts blocks by block size (please refer to
+ // `FlushBlockBySizePolicy`).
+ std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+ // TODO(kailiu) Temporarily disable this feature by making the default value
+ // to be false.
+ //
+ // TODO(ajkr) we need to update names of variables controlling meta-block
+ // caching as they should now apply to range tombstone and compression
+ // dictionary meta-blocks, in addition to index and filter meta-blocks.
+ //
+ // Indicating if we'd put index/filter blocks to the block cache.
+ // If not specified, each "table reader" object will pre-load index/filter
+ // block during table initialization.
+ bool cache_index_and_filter_blocks = false;
+
+ // If cache_index_and_filter_blocks is enabled, cache index and filter
+ // blocks with high priority. If set to true, depending on implementation of
+ // block cache, index and filter blocks may be less likely to be evicted
+ // than data blocks.
+ bool cache_index_and_filter_blocks_with_high_priority = true;
+
+ // if cache_index_and_filter_blocks is true and the below is true, then
+ // filter and index blocks are stored in the cache, but a reference is
+ // held in the "table reader" object so the blocks are pinned and only
+ // evicted from cache when the table reader is freed.
+ bool pin_l0_filter_and_index_blocks_in_cache = false;
+
+ // If cache_index_and_filter_blocks is true and the below is true, then
+ // the top-level index of partitioned filter and index blocks are stored in
+ // the cache, but a reference is held in the "table reader" object so the
+ // blocks are pinned and only evicted from cache when the table reader is
+ // freed. This is not limited to l0 in LSM tree.
+ bool pin_top_level_index_and_filter = true;
+
+ // The index type that will be used for this table.
+ enum IndexType : char {
+ // A space efficient index block that is optimized for
+ // binary-search-based index.
+ kBinarySearch = 0x00,
+
+ // The hash index, if enabled, will do the hash lookup when
+ // `Options.prefix_extractor` is provided.
+ kHashSearch = 0x01,
+
+ // A two-level index implementation. Both levels are binary search indexes.
+ kTwoLevelIndexSearch = 0x02,
+
+ // Like kBinarySearch, but index also contains first key of each block.
+ // This allows iterators to defer reading the block until it's actually
+ // needed. May significantly reduce read amplification of short range scans.
+ // Without it, iterator seek usually reads one block from each level-0 file
+ // and from each level, which may be expensive.
+ // Works best in combination with:
+ // - IndexShorteningMode::kNoShortening,
+ // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+ // e.g. when prefix changes.
+ // Makes the index significantly bigger (2x or more), especially when keys
+ // are long.
+ //
+ // IO errors are not handled correctly in this mode right now: if an error
+ // happens when lazily reading a block in value(), value() returns empty
+ // slice, and you need to call Valid()/status() afterwards.
+ // TODO(kolmike): Fix it.
+ kBinarySearchWithFirstKey = 0x03,
+ };
+
+ IndexType index_type = kBinarySearch;
+
+ // The index type that will be used for the data block.
+ enum DataBlockIndexType : char {
+ kDataBlockBinarySearch = 0, // traditional block type
+ kDataBlockBinaryAndHash = 1, // additional hash index
+ };
+
+ DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+ // #entries/#buckets. It is valid only when data_block_index_type is
+ // kDataBlockBinaryAndHash.
+ double data_block_hash_table_util_ratio = 0.75;
+
+ // This option is now deprecated. No matter what value it is set to,
+ // it will behave as if hash_index_allow_collision=true.
+ bool hash_index_allow_collision = true;
+
+ // Use the specified checksum type. Newly created table files will be
+ // protected with this checksum type. Old table files will still be readable,
+ // even though they have different checksum type.
+ ChecksumType checksum = kCRC32c;
+
+ // Disable block cache. If this is set to true,
+ // then no block cache should be used, and the block_cache should
+ // point to a nullptr object.
+ bool no_block_cache = false;
+
+ // If non-NULL use the specified cache for blocks.
+ // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+ std::shared_ptr<Cache> block_cache = nullptr;
+
+ // If non-NULL, use the specified cache for pages read from device.
+ // If NULL, no page cache is used.
+ std::shared_ptr<PersistentCache> persistent_cache = nullptr;
+
+ // If non-NULL use the specified cache for compressed blocks.
+ // If NULL, rocksdb will not use a compressed block cache.
+ // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
+ // same type of object there.
+ std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ size_t block_size = 4 * 1024;
+
+ // This is used to close a block before it reaches the configured
+ // 'block_size'. If the percentage of free space in the current block is less
+ // than this specified number and adding a new record to the block will
+ // exceed the configured block size, then this block will be closed and the
+ // new record will be written to the next block.
+ int block_size_deviation = 10;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone. The minimum value allowed is 1. Any smaller
+ // value will be silently overwritten with 1.
+ int block_restart_interval = 16;
+
+ // Same as block_restart_interval but used for the index block.
+ int index_block_restart_interval = 1;
+
+ // Block size for partitioned metadata. Currently applied to indexes when
+ // kTwoLevelIndexSearch is used and to filters when partition_filters is used.
+ // Note: Since in the current implementation the filters and index partitions
+ // are aligned, an index/filter block is created when either index or filter
+ // block size reaches the specified limit.
+ // Note: this limit is currently applied to only index blocks; a filter
+ // partition is cut right after an index block is cut
+ // TODO(myabandeh): remove the note above when filter partitions are cut
+ // separately
+ uint64_t metadata_block_size = 4096;
+
+ // Note: currently this option requires kTwoLevelIndexSearch to be set as
+ // well.
+ // TODO(myabandeh): remove the note above once the limitation is lifted
+ // Use partitioned full filters for each SST file. This option is
+ // incompatible with block-based filters.
+ bool partition_filters = false;
+
+ // Use delta encoding to compress keys in blocks.
+ // ReadOptions::pin_data requires this option to be disabled.
+ //
+ // Default: true
+ bool use_delta_encoding = true;
+
+ // If non-nullptr, use the specified filter policy to reduce disk reads.
+ // Many applications will benefit from passing the result of
+ // NewBloomFilterPolicy() here.
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+ // If true, place whole keys in the filter (not just prefixes).
+ // This must generally be true for gets to be efficient.
+ bool whole_key_filtering = true;
+
+ // Verify that decompressing the compressed block gives back the input. This
+ // is a verification mode that we use to detect bugs in compression
+ // algorithms.
+ bool verify_compression = false;
+
+ // If used, for every data block we load into memory, we will create a bitmap
+ // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
+ // will be used to figure out the percentage we actually read of the blocks.
+ //
+ // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
+ // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
+ // read amplification using this formula
+ // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+ //
+ // value => memory usage (percentage of loaded blocks memory)
+ // 1 => 12.50 %
+ // 2 => 06.25 %
+ // 4 => 03.12 %
+ // 8 => 01.56 %
+ // 16 => 00.78 %
+ //
+ // Note: This number must be a power of 2, if not it will be sanitized
+ // to be the next lowest power of 2, for example a value of 7 will be
+ // treated as 4, a value of 19 will be treated as 16.
+ //
+ // Default: 0 (disabled)
+ uint32_t read_amp_bytes_per_bit = 0;
+
+ // We currently have six versions:
+ // 0 -- This version is currently written out by all RocksDB's versions by
+ // default. Can be read by really old RocksDB's. Doesn't support changing
+ // checksum (default is CRC32).
+ // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
+ // checksum, like xxHash. It is written by RocksDB when
+ // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+ // 0 is silently upconverted)
+ // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
+ // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
+ // don't plan to run RocksDB before version 3.10, you should probably use
+ // this.
+ // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
+ // encode the keys in index blocks. If you don't plan to run RocksDB before
+ // version 5.15, you should probably use this.
+ // This option only affects newly written tables. When reading existing
+ // tables, the information about version is read from the footer.
+ // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we
+ // encode the values in index blocks. If you don't plan to run RocksDB before
+ // version 5.16 and you are using index_block_restart_interval > 1, you should
+ // probably use this as it would reduce the index size.
+ // This option only affects newly written tables. When reading existing
+ // tables, the information about version is read from the footer.
+ // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned
+ // filters use a generally faster and more accurate Bloom filter
+ // implementation, with a different schema.
+ uint32_t format_version = 2;
+
+ // Store index blocks on disk in compressed format. Changing this option to
+ // false will avoid the overhead of decompression if index blocks are evicted
+ // and read back
+ bool enable_index_compression = true;
+
+ // Align data blocks on lesser of page size and block size
+ bool block_align = false;
+
+ // This enum allows trading off increased index size for improved iterator
+ // seek performance in some situations, particularly when block cache is
+ // disabled (ReadOptions::fill_cache = false) and direct IO is
+ // enabled (DBOptions::use_direct_reads = true).
+ // The default mode is the best tradeoff for most use cases.
+ // This option only affects newly written tables.
+ //
+ // The index contains a key separating each pair of consecutive blocks.
+ // Let A be the highest key in one block, B the lowest key in the next block,
+ // and I the index entry separating these two blocks:
+ // [ ... A] I [B ...]
+ // I is allowed to be anywhere in [A, B).
+ // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
+ // first block, then immediately fall through to the second block.
+ // However, if I=A, this can't happen, and we'll read only the second block.
+ // In kNoShortening mode, we use I=A. In other modes, we use the shortest
+ // key in [A, B), which usually significantly reduces index size.
+ //
+ // There's a similar story for the last index entry, which is an upper bound
+ // of the highest key in the file. If it's shortened and therefore
+ // overestimated, iterator is likely to unnecessarily read the last data block
+ // from each file on each seek.
+ enum class IndexShorteningMode : char {
+ // Use full keys.
+ kNoShortening,
+ // Shorten index keys between blocks, but use full key for the last index
+ // key, which is the upper bound of the whole file.
+ kShortenSeparators,
+ // Shorten both keys between blocks and key after last block.
+ kShortenSeparatorsAndSuccessor,
+ };
+
+ IndexShorteningMode index_shortening =
+ IndexShorteningMode::kShortenSeparators;
+};
+
+// Table Properties that are specific to block-based table properties.
+struct BlockBasedTablePropertyNames {
+ // The value of this property is a fixed int32 number.
+ static const std::string kIndexType;
+ // value is "1" for true and "0" for false.
+ static const std::string kWholeKeyFiltering;
+ // value is "1" for true and "0" for false.
+ static const std::string kPrefixFiltering;
+};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+ const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+
+enum EncodingType : char {
+ // Always write full keys without any special encoding.
+ kPlain,
+ // Find opportunity to write the same prefix once for multiple rows.
+ // In some cases, when a key follows a previous key with the same prefix,
+ // instead of writing out the full key, it just writes out the size of the
+ // shared prefix, as well as other bytes, to save some bytes.
+ //
+ // When using this option, the user is required to use the same prefix
+ // extractor to make sure the same prefix will be extracted from the same key.
+ // The Name() value of the prefix extractor will be stored in the file. When
+ // reopening the file, the name of the options.prefix_extractor given will be
+ // bitwise compared to the prefix extractors stored in the file. An error
+ // will be returned if the two don't match.
+ kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+ static const std::string kEncodingType;
+ static const std::string kBloomVersion;
+ static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+ // @user_key_len: plain table has optimization for fix-sized keys, which can
+ // be specified via user_key_len. Alternatively, you can pass
+ // `kPlainTableVariableLength` if your keys have variable
+ // lengths.
+ uint32_t user_key_len = kPlainTableVariableLength;
+
+ // @bloom_bits_per_key: the number of bits used for bloom filter per prefix.
+ // You may disable it by passing a zero.
+ int bloom_bits_per_key = 10;
+
+ // @hash_table_ratio: the desired utilization of the hash table used for
+ // prefix hashing.
+ // hash_table_ratio = number of prefixes / #buckets in the
+ // hash table
+ double hash_table_ratio = 0.75;
+
+ // @index_sparseness: inside each prefix, need to build one index record for
+ // how many keys for binary search inside each hash bucket.
+ // For encoding type kPrefix, the value will be used when
+ // writing to determine an interval to rewrite the full
+ // key. It will also be used as a suggestion and satisfied
+ // when possible.
+ size_t index_sparseness = 16;
+
+ // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+ // Otherwise from huge page TLB. The user needs to
+ // reserve huge pages for it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ size_t huge_page_tlb_size = 0;
+
+ // @encoding_type: how to encode the keys. See enum EncodingType above for
+ // the choices. The value will determine how to encode keys
+ // when writing to a new SST file. This value will be stored
+ // inside the SST file which will be used when reading from
+ // the file, which makes it possible for users to choose
+ // different encoding type when reopening a DB. Files with
+ // different encoding types can co-exist in the same DB and
+ // can be read.
+ EncodingType encoding_type = kPlain;
+
+ // @full_scan_mode: mode for reading the whole file one record by one without
+ // using the index.
+ bool full_scan_mode = false;
+
+ // @store_index_in_file: compute plain table index and bloom filter during
+ // file building and store it in file. When reading
+ // file, index will be mmaped instead of recomputation.
+ bool store_index_in_file = false;
+};
+
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extractor properly to make
+// it work. Look-up will start with prefix hash lookup for key prefix. Inside
+// the hash bucket found, a binary search is executed for hash conflicts.
+// Finally, a linear search is used.
+
+extern TableFactory* NewPlainTableFactory(
+ const PlainTableOptions& options = PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+ // The key that is used to fill empty buckets.
+ static const std::string kEmptyKey;
+ // Fixed length of value.
+ static const std::string kValueLength;
+ // Number of hash functions used in Cuckoo Hash.
+ static const std::string kNumHashFunc;
+ // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+ // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+ // where starting bucket id is given by the hash function on the key. In case
+ // of a collision during inserting the key, the builder tries to insert the
+ // key in other locations of the cuckoo block before using the next hash
+ // function. This reduces cache misses during read operations in case of
+ // collision.
+ static const std::string kCuckooBlockSize;
+ // Size of the hash table. Use this number to compute the modulo of hash
+ // function. The actual number of buckets will be kMaxHashTableSize +
+ // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+ // accommodate the Cuckoo Block from end of hash table, due to cache friendly
+ // implementation.
+ static const std::string kHashTableSize;
+ // Denotes if the key sorted in the file is Internal Key (if false)
+ // or User Key only (if true).
+ static const std::string kIsLastLevel;
+ // Indicate if using identity function for the first hash function.
+ static const std::string kIdentityAsFirstHash;
+ // Indicate if using modulo or bitwise AND to calculate hash value
+ static const std::string kUseModuleHash;
+ // Fixed user key length
+ static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+ // Determines the utilization of hash tables. Smaller values
+ // result in larger hash tables with fewer collisions.
+ double hash_table_ratio = 0.9;
+ // A property used by builder to determine how deep to go
+ // when searching for a path to displace elements in case of
+ // collision. See Builder.MakeSpaceForKey method. Higher
+ // values result in more efficient hash tables with fewer
+ // lookups but take more time to build.
+ uint32_t max_search_depth = 100;
+ // In case of collision while inserting, the builder
+ // attempts to insert in the next cuckoo_block_size
+ // locations before skipping over to the next Cuckoo hash
+ // function. This makes lookups more cache friendly in case
+ // of collisions.
+ uint32_t cuckoo_block_size = 5;
+ // If this option is enabled, user key is treated as uint64_t and its value
+ // is used as hash value directly. This option changes builder's behavior.
+ // Readers ignore this option and behave according to what is specified in
+ // the table property.
+ bool identity_as_first_hash = false;
+ // If this option is set to true, modulo is used during hash calculation.
+ // This often yields better space efficiency at the cost of performance.
+ // If this option is set to false, # of entries in table is constrained to be
+ // power of two, and bitwise AND is used to calculate hash, which is faster in
+ // general.
+ bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
+extern TableFactory* NewCuckooTableFactory(
+ const CuckooTableOptions& table_options = CuckooTableOptions());
+
+#endif // ROCKSDB_LITE
+
+class RandomAccessFileReader;
+
+// A base class for table factories.
+class TableFactory {
+ public:
+ virtual ~TableFactory() {}
+
+ // The type of the table.
+ //
+ // The client of this package should switch to a new name whenever
+ // the table format implementation changes.
+ //
+ // Names starting with "rocksdb." are reserved and should not be used
+ // by any clients of this package.
+ virtual const char* Name() const = 0;
+
+ // Creates a table reader (output via table_reader) that can fetch data from
+ // the file specified in parameter file. It's the caller's responsibility to
+ // make sure file is in the correct format.
+ //
+ // NewTableReader() is called in three places:
+ // (1) TableCache::FindTable() calls the function on a table cache miss
+ // and caches the table object returned.
+ // (2) SstFileDumper (for SST Dump) opens the table and dumps the table
+ // contents using the iterator of the table.
+ // (3) DBImpl::IngestExternalFile() calls this function to read the contents
+ // of the sst file it's attempting to add
+ //
+ // table_reader_options is a TableReaderOptions which contains all the
+ // needed parameters and configuration to open the table.
+ // file is a file handler to handle the file for the table.
+ // file_size is the physical file size of the file.
+ // table_reader is the output table reader.
+ virtual Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache = true) const = 0;
+
+ // Return a table builder to write to a file for this table type.
+ //
+ // It is called in several places:
+ // (1) When flushing memtable to a level-0 output file, it creates a table
+ // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+ // (2) During compaction, it gets the builder for writing compaction output
+ // files in DBImpl::OpenCompactionOutputFile().
+ // (3) When recovering from transaction logs, it creates a table builder to
+ // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+ // by calling BuildTable())
+ // (4) When running Repairer, it creates a table builder to convert logs to
+ // SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+ //
+ // Multiple configurations can be accessed from there, including but not
+ // limited to compression options. file is a handle of a writable file.
+ // It is the caller's responsibility to keep the file open and close the file
+ // after closing the table builder. compression_type is the compression type
+ // to use in this table.
+ virtual TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ uint32_t column_family_id, WritableFileWriter* file) const = 0;
+
+ // Sanitizes the specified DB Options and ColumnFamilyOptions.
+ //
+ // If the function cannot find a way to sanitize the input DB Options,
+ // a non-ok Status will be returned.
+ virtual Status SanitizeOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const = 0;
+
+ // Return a string that contains printable format of table configurations.
+ // RocksDB prints configurations at DB Open().
+ virtual std::string GetPrintableTableOptions() const = 0;
+
+ virtual Status GetOptionString(std::string* /*opt_string*/,
+ const std::string& /*delimiter*/) const {
+ return Status::NotSupported(
+ "The table factory doesn't implement GetOptionString().");
+ }
+
+ // Returns the raw pointer of the table options that is used by this
+ // TableFactory, or nullptr if this function is not supported.
+ // Since the return value is a raw pointer, the TableFactory owns the
+ // pointer and the caller should not delete the pointer.
+ //
+ // In certain cases, it is desirable to alter the underlying options when the
+ // TableFactory is not used by any open DB by casting the returned pointer
+ // to the right class. For instance, if BlockBasedTableFactory is used,
+ // then the pointer can be cast to BlockBasedTableOptions.
+ //
+ // Note that changing the underlying TableFactory options while the
+ // TableFactory is currently used by any open DB is undefined behavior.
+ // Developers should use DB::SetOption() instead to dynamically change
+ // options while the DB is open.
+ virtual void* GetOptions() { return nullptr; }
+
+ // Returns whether delete range is supported; false unless overridden.
+ virtual bool IsDeleteRangeSupported() const { return false; }
+};
+
+#ifndef ROCKSDB_LITE
+// Create a special table factory that can open either of the supported
+// table formats, based on setting inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory: block based table factory to use. If NULL, use
+// a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
+// one.
+extern TableFactory* NewAdaptiveTableFactory(
+ std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+ std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+ std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+ std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
new file mode 100644
index 000000000..d0ac02310
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -0,0 +1,250 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <stdint.h>
+#include <map>
+#include <string>
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+// ++pos) {
+// ...
+// }
+typedef std::map<std::string, std::string> UserCollectedProperties;
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+ static const std::string kDataSize;
+ static const std::string kIndexSize;
+ static const std::string kIndexPartitions;
+ static const std::string kTopLevelIndexSize;
+ static const std::string kIndexKeyIsUserKey;
+ static const std::string kIndexValueIsDeltaEncoded;
+ static const std::string kFilterSize;
+ static const std::string kRawKeySize;
+ static const std::string kRawValueSize;
+ static const std::string kNumDataBlocks;
+ static const std::string kNumEntries;
+ static const std::string kDeletedKeys;
+ static const std::string kMergeOperands;
+ static const std::string kNumRangeDeletions;
+ static const std::string kFormatVersion;
+ static const std::string kFixedKeyLen;
+ static const std::string kFilterPolicy;
+ static const std::string kColumnFamilyName;
+ static const std::string kColumnFamilyId;
+ static const std::string kComparator;
+ static const std::string kMergeOperator;
+ static const std::string kPrefixExtractorName;
+ static const std::string kPropertyCollectors;
+ static const std::string kCompression;
+ static const std::string kCompressionOptions;
+ static const std::string kCreationTime;
+ static const std::string kOldestKeyTime;
+ static const std::string kFileCreationTime;
+};
+
+extern const std::string kPropertiesBlock;
+extern const std::string kCompressionDictBlock;
+extern const std::string kRangeDelBlock;
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The methods
+// don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially
+class TablePropertiesCollector {
+ public:
+ virtual ~TablePropertiesCollector() {}
+
+ // DEPRECATED: user-defined collectors should implement AddUserKey(), though
+ // this old function still works for backward compatibility reasons.
+ // Add() will be called when a new key/value pair is inserted into the table.
+ // @params key the user key that is inserted into the table.
+ // @params value the value that is inserted into the table.
+ virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
+ return Status::InvalidArgument(
+ "TablePropertiesCollector::Add() deprecated.");
+ }
+
+ // AddUserKey() will be called when a new key/value pair is inserted into the
+ // table.
+ // @params key the user key that is inserted into the table.
+ // @params value the value that is inserted into the table.
+ virtual Status AddUserKey(const Slice& key, const Slice& value,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) {
+ // For backwards-compatibility: the default forwards to the deprecated Add().
+ return Add(key, value);
+ }
+
+ // Called after each new block is cut
+ virtual void BlockAdd(uint64_t /* blockRawBytes */,
+ uint64_t /* blockCompressedBytesFast */,
+ uint64_t /* blockCompressedBytesSlow */) {
+ // Nothing to do here. Callback registers can override.
+ return;
+ }
+
+ // Finish() will be called when a table has already been built and is ready
+ // for writing the properties block.
+ // @params properties User will add their collected statistics to
+ // `properties`.
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ // Return the human-readable properties, where the key is property name and
+ // the value is the human-readable form of value.
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const = 0;
+
+ // EXPERIMENTAL Return whether the output file should be further compacted
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Constructs TablePropertiesCollector. Internals create a new
+// TablePropertiesCollector for each new table
+class TablePropertiesCollectorFactory {
+ public:
+ struct Context {
+ uint32_t column_family_id;
+ static const uint32_t kUnknownColumnFamily;
+ };
+
+ virtual ~TablePropertiesCollectorFactory() {}
+ // has to be thread-safe
+ virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const = 0;
+};
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+ // the total size of all data blocks.
+ uint64_t data_size = 0;
+ // the size of index block.
+ uint64_t index_size = 0;
+ // Total number of index partitions if kTwoLevelIndexSearch is used
+ uint64_t index_partitions = 0;
+ // Size of the top-level index if kTwoLevelIndexSearch is used
+ uint64_t top_level_index_size = 0;
+ // Whether the index key is user key. Otherwise it includes 8 bytes of
+ // sequence number added by internal key format.
+ uint64_t index_key_is_user_key = 0;
+ // Whether delta encoding is used to encode the index values.
+ uint64_t index_value_is_delta_encoded = 0;
+ // the size of filter block.
+ uint64_t filter_size = 0;
+ // total raw key size
+ uint64_t raw_key_size = 0;
+ // total raw value size
+ uint64_t raw_value_size = 0;
+ // the number of blocks in this table
+ uint64_t num_data_blocks = 0;
+ // the number of entries in this table
+ uint64_t num_entries = 0;
+ // the number of deletions in the table
+ uint64_t num_deletions = 0;
+ // the number of merge operands in the table
+ uint64_t num_merge_operands = 0;
+ // the number of range deletions in this table
+ uint64_t num_range_deletions = 0;
+ // format version, reserved for backward compatibility
+ uint64_t format_version = 0;
+ // If 0, key is variable length. Otherwise number of bytes for each key.
+ uint64_t fixed_key_len = 0;
+ // ID of column family for this SST file, corresponding to the CF identified
+ // by column_family_name.
+ uint64_t column_family_id = ROCKSDB_NAMESPACE::
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+ // Timestamp of the latest key. 0 means unknown.
+ // TODO(sagar0): Should be changed to latest_key_time ... but don't know the
+ // full implications of backward compatibility. Hence retaining for now.
+ uint64_t creation_time = 0;
+ // Timestamp of the earliest key. 0 means unknown.
+ uint64_t oldest_key_time = 0;
+ // Actual SST file creation time. 0 means unknown.
+ uint64_t file_creation_time = 0;
+
+ // Name of the column family with which this SST file is associated.
+ // If column family is unknown, `column_family_name` will be an empty string.
+ std::string column_family_name;
+
+ // The name of the filter policy used in this table.
+ // If no filter policy is used, `filter_policy_name` will be an empty string.
+ std::string filter_policy_name;
+
+ // The name of the comparator used in this table.
+ std::string comparator_name;
+
+ // The name of the merge operator used in this table.
+ // If no merge operator is used, `merge_operator_name` will be "nullptr".
+ std::string merge_operator_name;
+
+ // The name of the prefix extractor used in this table
+ // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr".
+ std::string prefix_extractor_name;
+
+ // The names of the property collectors factories used in this table
+ // separated by commas
+ // {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
+ std::string property_collectors_names;
+
+ // The compression algo used to compress the SST files.
+ std::string compression_name;
+
+ // Compression options used to compress the SST files.
+ std::string compression_options;
+
+ // user collected properties
+ UserCollectedProperties user_collected_properties;
+ UserCollectedProperties readable_properties;
+
+ // The offset of the value of each property in the file.
+ std::map<std::string, uint64_t> properties_offsets;
+
+ // convert this object to a human readable form
+ // @prop_delim: delimiter for each property.
+ std::string ToString(const std::string& prop_delim = "; ",
+ const std::string& kv_delim = "=") const;
+
+ // Aggregate the numerical member variables of the specified
+ // TableProperties.
+ void Add(const TableProperties& tp);
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by database
+// itself. Especially some properties regarding the internal keys (which
+// are unknown to `table`).
+//
+// DEPRECATED: these properties now belong as TableProperties members. Please
+// use TableProperties::num_deletions and TableProperties::num_merge_operands,
+// respectively.
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+extern uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
new file mode 100644
index 000000000..6b2f5c885
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for exposing run-time status of any
+// rocksdb-related thread. Such run-time status can be obtained via
+// GetThreadList() API.
+//
+// Note that all thread-status features are still under-development, and
+// thus APIs and class definitions might be subject to change at this point.
+// Will remove this comment once the APIs have been finalized.
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \
+ defined(ROCKSDB_SUPPORT_THREAD_LOCAL)
+#define ROCKSDB_USING_THREAD_STATUS
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(yhchiang): remove this function once c++14 is available
+// as std::max will be able to cover this.
+// Current MS compiler does not support constexpr
+template <int A, int B>
+struct constexpr_max {
+ static const int result = (A > B) ? A : B;
+};
+
+// A structure that describes the current status of a thread.
+// The status of active threads can be fetched using
+// ROCKSDB_NAMESPACE::GetThreadList().
+struct ThreadStatus {
+ // The type of a thread.
+ enum ThreadType : int {
+ HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool
+ LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool
+ USER, // User thread (Non-RocksDB BG thread)
+ BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool
+ NUM_THREAD_TYPES
+ };
+
+ // The type used to refer to a thread operation.
+ // A thread operation describes high-level action of a thread.
+ // Examples include compaction and flush.
+ enum OperationType : int {
+ OP_UNKNOWN = 0,
+ OP_COMPACTION,
+ OP_FLUSH,
+ NUM_OP_TYPES
+ };
+
+ enum OperationStage : int {
+ STAGE_UNKNOWN = 0,
+ STAGE_FLUSH_RUN,
+ STAGE_FLUSH_WRITE_L0,
+ STAGE_COMPACTION_PREPARE,
+ STAGE_COMPACTION_RUN,
+ STAGE_COMPACTION_PROCESS_KV,
+ STAGE_COMPACTION_INSTALL,
+ STAGE_COMPACTION_SYNC_FILE,
+ STAGE_PICK_MEMTABLES_TO_FLUSH,
+ STAGE_MEMTABLE_ROLLBACK,
+ STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+ NUM_OP_STAGES
+ };
+
+ enum CompactionPropertyType : int {
+ COMPACTION_JOB_ID = 0,
+ COMPACTION_INPUT_OUTPUT_LEVEL,
+ COMPACTION_PROP_FLAGS,
+ COMPACTION_TOTAL_INPUT_BYTES,
+ COMPACTION_BYTES_READ,
+ COMPACTION_BYTES_WRITTEN,
+ NUM_COMPACTION_PROPERTIES
+ };
+
+ enum FlushPropertyType : int {
+ FLUSH_JOB_ID = 0,
+ FLUSH_BYTES_MEMTABLES,
+ FLUSH_BYTES_WRITTEN,
+ NUM_FLUSH_PROPERTIES
+ };
+
+ // The maximum number of properties of an operation.
+ // This number should be set to the biggest NUM_XXX_PROPERTIES.
+ static const int kNumOperationProperties =
+ constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+
+ // The type used to refer to a thread state.
+ // A state describes lower-level action of a thread
+ // such as reading / writing a file or waiting for a mutex.
+ enum StateType : int {
+ STATE_UNKNOWN = 0,
+ STATE_MUTEX_WAIT = 1,
+ NUM_STATE_TYPES
+ };
+
+ ThreadStatus(const uint64_t _id, const ThreadType _thread_type,
+ const std::string& _db_name, const std::string& _cf_name,
+ const OperationType _operation_type,
+ const uint64_t _op_elapsed_micros,
+ const OperationStage _operation_stage,
+ const uint64_t _op_props[], const StateType _state_type)
+ : thread_id(_id),
+ thread_type(_thread_type),
+ db_name(_db_name),
+ cf_name(_cf_name),
+ operation_type(_operation_type),
+ op_elapsed_micros(_op_elapsed_micros),
+ operation_stage(_operation_stage),
+ state_type(_state_type) {
+ for (int i = 0; i < kNumOperationProperties; ++i) {
+ op_properties[i] = _op_props[i];
+ }
+ }
+
+ // A unique ID for the thread.
+ const uint64_t thread_id;
+
+ // The type of the thread, it could be HIGH_PRIORITY,
+ // LOW_PRIORITY, USER, or BOTTOM_PRIORITY.
+ const ThreadType thread_type;
+
+ // The name of the DB instance that the thread is currently
+ // involved with. It would be set to empty string if the thread
+ // is not involved in any DB operation.
+ const std::string db_name;
+
+ // The name of the column family that the thread is currently involved
+ // with. It would be set to empty string if the thread is not involved
+ // in any column family.
+ const std::string cf_name;
+
+ // The operation (high-level action) that the current thread is involved in.
+ const OperationType operation_type;
+
+ // The elapsed time of the current thread operation in microseconds.
+ const uint64_t op_elapsed_micros;
+
+ // An integer showing the current stage where the thread is involved
+ // in the current operation.
+ const OperationStage operation_stage;
+
+ // A list of properties that describe some details about the current
+ // operation. Same field in op_properties[] might have different
+ // meanings for different operations.
+ uint64_t op_properties[kNumOperationProperties];
+
+ // The state (lower-level action) that the current thread is involved in.
+ const StateType state_type;
+
+ // The following are a set of utility functions for interpreting
+ // the information of ThreadStatus
+
+ static std::string GetThreadTypeName(ThreadType thread_type);
+
+ // Obtain the name of an operation given its type.
+ static const std::string& GetOperationName(OperationType op_type);
+
+ static const std::string MicrosToString(uint64_t op_elapsed_time);
+
+ // Obtain a human-readable string describing the specified operation stage.
+ static const std::string& GetOperationStageName(OperationStage stage);
+
+ // Obtain the name of the "i"th operation property of the
+ // specified operation.
+ static const std::string& GetOperationPropertyName(OperationType op_type,
+ int i);
+
+ // Translate the "i"th property of the specified operation given
+ // a property value.
+ static std::map<std::string, uint64_t> InterpretOperationProperties(
+ OperationType op_type, const uint64_t* op_properties);
+
+ // Obtain the name of a state given its type.
+ static const std::string& GetStateName(StateType state_type);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h
new file mode 100644
index 000000000..b39321fe8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/threadpool.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * ThreadPool is a component that will spawn N background threads that will
+ * be used to execute scheduled work. The number of background threads could
+ * be modified by calling SetBackgroundThreads().
+ * */
+class ThreadPool {
+ public:
+ virtual ~ThreadPool() {}
+
+ // Wait for all threads to finish.
+ // Discard those threads that did not start
+ // executing
+ virtual void JoinAllThreads() = 0;
+
+ // Set the number of background threads that will be executing the
+ // scheduled jobs.
+ virtual void SetBackgroundThreads(int num) = 0;
+ virtual int GetBackgroundThreads() = 0;
+
+ // Get the number of jobs scheduled in the ThreadPool queue.
+ virtual unsigned int GetQueueLen() const = 0;
+
+ // Waits for all jobs to complete, both those
+ // that already started running and those that did not
+ // start yet. This ensures that everything that was thrown
+ // on the TP runs even though
+ // we may not have specified enough threads for the amount
+ // of jobs
+ virtual void WaitForJobsAndJoinAllThreads() = 0;
+
+ // Submit a fire-and-forget job.
+ // This allows submitting the same job multiple times
+ virtual void SubmitJob(const std::function<void()>&) = 0;
+ // This moves the function in for efficiency
+ virtual void SubmitJob(std::function<void()>&&) = 0;
+};
+
+// NewThreadPool() is a function that could be used to create a ThreadPool
+// with `num_threads` background threads.
+extern ThreadPool* NewThreadPool(int num_threads);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h
new file mode 100644
index 000000000..d58ed47b2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Allow custom implementations of TraceWriter and TraceReader.
+// By default, RocksDB provides a way to capture the traces to a file using the
+// factory NewFileTraceWriter(). But users could also choose to export traces to
+// any other system by providing custom implementations of TraceWriter and
+// TraceReader.
+
+// TraceWriter allows exporting RocksDB traces to any system, one operation at
+// a time.
+class TraceWriter {
+ public:
+ TraceWriter() {}
+ virtual ~TraceWriter() {}
+
+ virtual Status Write(const Slice& data) = 0;
+ virtual Status Close() = 0;
+ virtual uint64_t GetFileSize() = 0;
+};
+
+// TraceReader allows reading RocksDB traces from any system, one operation at
+// a time. A RocksDB Replayer could depend on this to replay operations.
+class TraceReader {
+ public:
+ TraceReader() {}
+ virtual ~TraceReader() {}
+
+ virtual Status Read(std::string* data) = 0;
+ virtual Status Close() = 0;
+};
+
+// Factory methods to read/write traces from/to a file.
+Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceWriter>* trace_writer);
+Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceReader>* trace_reader);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h
new file mode 100644
index 000000000..48d0e5c0b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/transaction_log.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFile;
+typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
+
+enum WalFileType {
+ // Indicates that the WAL file is in the archive directory.
+ // WAL files are moved from the main db directory to the archive directory
+ // once they are no longer live, and they stay there until cleaned up.
+ // Files are cleaned up depending on the archive size
+ // (Options::WAL_size_limit_MB) and the time since the last cleaning
+ // (Options::WAL_ttl_seconds).
+ kArchivedLogFile = 0,
+
+ // Indicates that the WAL file is live and resides in the main db directory.
+ kAliveLogFile = 1
+};
+
+class LogFile {
+ public:
+  LogFile() = default;
+  virtual ~LogFile() = default;
+
+  // Path of the log file, relative to the main db dir.
+  // E.g. for a live log file:      /000003.log
+  //      for an archived log file: /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+  // Primary identifier of the log file.
+  // It is directly proportional to the creation time of the log file.
+  virtual uint64_t LogNumber() const = 0;
+
+  // Whether the log file is alive or archived.
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of the write batch written in this log file.
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of the log file on disk, in bytes.
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+
+  // Rule of five: every special member is declared. The type is move-only,
+  // since the std::unique_ptr member does not copy; moving cannot throw, so
+  // both move operations are declared noexcept.
+  BatchResult() {}
+
+  ~BatchResult() {}
+
+  BatchResult(const BatchResult&) = delete;
+
+  BatchResult& operator=(const BatchResult&) = delete;
+
+  BatchResult(BatchResult&& bResult) noexcept
+      : sequence(bResult.sequence),
+        writeBatchPtr(std::move(bResult.writeBatchPtr)) {}
+
+  BatchResult& operator=(BatchResult&& bResult) noexcept {
+    sequence = bResult.sequence;  // plain integer: copied, not moved
+    writeBatchPtr = std::move(bResult.writeBatchPtr);
+    return *this;
+  }
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() = default;
+  virtual ~TransactionLogIterator() = default;
+
+  // Returns true if the iterator is valid. An iterator is either positioned
+  // at a WriteBatch or not valid; data can only be read from a valid
+  // iterator.
+  virtual bool Valid() = 0;
+
+  // Advances the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns OK if the iterator is valid.
+  // Returns the error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid, returns the current write batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+
+  // Read options for TransactionLogIterator.
+  struct ReadOptions {
+    // If true, all data read from underlying storage will be
+    // verified against corresponding checksums.
+    // Default: true
+    bool verify_checksums_ = true;
+
+    ReadOptions() = default;
+
+    explicit ReadOptions(bool verify_checksums)
+        : verify_checksums_(verify_checksums) {}
+  };
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h
new file mode 100644
index 000000000..4d004b69d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/types.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Define all public custom types here.
+
+// Represents a sequence number in a WAL file.
+using SequenceNumber = uint64_t;
+
+const SequenceNumber kMinUnCommittedSeq = 1;  // 0 is always committed
+
+// User-oriented representation of internal key types.
+enum EntryType {
+  kEntryPut = 0,
+  kEntryDelete = 1,
+  kEntrySingleDelete = 2,
+  kEntryMerge = 3,
+  kEntryRangeDeletion = 4,
+  kEntryBlobIndex = 5,
+  kEntryOther = 6,
+};
+
+// <user key, sequence number, and entry type> tuple.
+struct FullKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  EntryType type;
+  // Matches clear() for sequence and type, so neither is indeterminate.
+  FullKey() : sequence(0), type(EntryType::kEntryPut) {}
+  FullKey(const Slice& u, const SequenceNumber& seq, EntryType t)
+      : user_key(u), sequence(seq), type(t) {}
+  std::string DebugString(bool hex = false) const;
+
+  void clear() {
+    user_key.clear();
+    sequence = 0;
+    type = EntryType::kEntryPut;
+  }
+};
+
+// Parses a slice representing an internal key into *result.
+// The parsed FullKey is valid only for as long as the memory pointed to by
+// internal_key is alive.
+bool ParseFullKey(const Slice& internal_key, FullKey* result);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
new file mode 100644
index 000000000..e3aeee6ce
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <climits>
+#include <vector>
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Algorithm used to decide when a compaction request stops picking new files
+// into a single compaction run.
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize = 0,  // pick files of similar size
+  kCompactionStopStyleTotalSize = 1  // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set. Default: 1
+  unsigned int size_ratio = 1;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width = 2;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width = UINT_MAX;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. RocksDB uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent = 200;
+
+  // If this option is set to -1 (the default value), all the output files
+  // will follow the compression type specified.
+  //
+  // If this option is not negative, we will try to make sure the compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criterion for whether
+  // it needs to be compressed: assuming this is the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as the total size of C1...Ct as total_C; the compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  // Default: -1
+  int compression_size_percent = -1;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style = kCompactionStopStyleTotalSize;
+
+  // Option to optimize the universal multi level compaction by enabling
+  // trivial move for non overlapping files.
+  // Default: false
+  bool allow_trivial_move = false;
+
+  // Default set of parameters. Every member takes the value given by its
+  // default member initializer above:
+  //   size_ratio = 1, min_merge_width = 2, max_merge_width = UINT_MAX,
+  //   max_size_amplification_percent = 200, compression_size_percent = -1,
+  //   stop_style = kCompactionStopStyleTotalSize,
+  //   allow_trivial_move = false.
+  // The constructor stays user-provided so the class does not become an
+  // aggregate.
+  CompactionOptionsUniversal() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/backupable_db.h b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
new file mode 100644
index 000000000..f281ed133
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/stackable_db.h"
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BackupableDBOptions {
+ // Where to keep the backup files. Has to be different from dbname_.
+ // Best to set this to dbname_ + "/backups"
+ // Required
+ std::string backup_dir;
+
+ // Backup Env object. It will be used for backup file I/O. If it's
+ // nullptr, backups will be written out using the DB's Env. If it's
+ // non-nullptr, backup's I/O will be performed using this object.
+ // If you want to have backups on HDFS, use HDFS Env here!
+ // Default: nullptr
+ Env* backup_env;
+
+ // If share_table_files == true, backup will assume that table files with
+ // same name have the same contents. This enables incremental backups and
+ // avoids unnecessary data copies.
+ // If share_table_files == false, each backup will be on its own and will
+ // not share any data with other backups.
+ // Default: true
+ bool share_table_files;
+
+ // Backup info and error messages will be written to info_log
+ // if non-nullptr.
+ // Default: nullptr
+ Logger* info_log;
+
+ // If sync == true, we can guarantee you'll get consistent backup even
+ // on a machine crash/reboot. Backup process is slower with sync enabled.
+ // If sync == false, we don't guarantee anything on machine reboot. However,
+ // chances are some of the backups are consistent.
+ // Default: true
+ bool sync;
+
+ // If true, it will delete whatever backups there are already
+ // Default: false
+ bool destroy_old_data;
+
+ // If false, we won't backup log files. This option can be useful for backing
+ // up in-memory databases where log files are persisted, but table files are
+ // in memory.
+ // Default: true
+ bool backup_log_files;
+
+ // Max bytes that can be transferred in a second during backup.
+ // If 0, go as fast as you can
+ // Default: 0
+ uint64_t backup_rate_limit;
+
+ // Backup rate limiter. Used to control transfer speed for backup. If this is
+ // not null, backup_rate_limit is ignored.
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> backup_rate_limiter{nullptr};
+
+ // Max bytes that can be transferred in a second during restore.
+ // If 0, go as fast as you can
+ // Default: 0
+ uint64_t restore_rate_limit;
+
+ // Restore rate limiter. Used to control transfer speed during restore. If
+ // this is not null, restore_rate_limit is ignored.
+ // Default: nullptr
+ std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr};
+
+ // Only used if share_table_files is set to true. If true, will consider that
+ // backups can come from different databases, hence an sst is not uniquely
+ // identified by its name, but by the triple (file name, crc32, file length)
+ // Default: false
+ // Note: this is an experimental option, and you'll need to set it manually
+ // *turn it on only if you know what you're doing*
+ bool share_files_with_checksum;
+
+ // Up to this many background threads will copy files for CreateNewBackup()
+ // and RestoreDBFromBackup()
+ // Default: 1
+ int max_background_operations;
+
+ // During backup, the user can get a callback every time the next
+ // callback_trigger_interval_size bytes have been copied.
+ // Default: 4194304
+ uint64_t callback_trigger_interval_size;
+
+ // For BackupEngineReadOnly, Open() will open at most this many of the
+ // latest non-corrupted backups.
+ //
+ // Note: this setting is ignored (behaves like INT_MAX) for any kind of
+ // writable BackupEngine because it would inhibit accounting for shared
+ // files for proper backup deletion, including purging any incompletely
+ // created backups on creation of a new backup.
+ //
+ // Default: INT_MAX
+ int max_valid_backups_to_open;
+
+ void Dump(Logger* logger) const;
+ // share_files_with_checksum is not a parameter here; it always starts false.
+ explicit BackupableDBOptions(
+ const std::string& _backup_dir, Env* _backup_env = nullptr,
+ bool _share_table_files = true, Logger* _info_log = nullptr,
+ bool _sync = true, bool _destroy_old_data = false,
+ bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+ uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+ uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024,
+ int _max_valid_backups_to_open = INT_MAX)
+ : backup_dir(_backup_dir),
+ backup_env(_backup_env),
+ share_table_files(_share_table_files),
+ info_log(_info_log),
+ sync(_sync),
+ destroy_old_data(_destroy_old_data),
+ backup_log_files(_backup_log_files),
+ backup_rate_limit(_backup_rate_limit),
+ restore_rate_limit(_restore_rate_limit),
+ share_files_with_checksum(false),
+ max_background_operations(_max_background_operations),
+ callback_trigger_interval_size(_callback_trigger_interval_size),
+ max_valid_backups_to_open(_max_valid_backups_to_open) {
+ assert(share_table_files || !share_files_with_checksum);
+ }
+};
+
+struct RestoreOptions {
+ // If true, restore won't overwrite the existing log files in wal_dir. It will
+ // also move all log files from the archive directory to wal_dir. Use this
+ // option in combination with BackupableDBOptions::backup_log_files = false
+ // for persisting in-memory databases.
+ // Default: false
+ bool keep_log_files;
+ // Explicit, so a bare bool does not convert implicitly to RestoreOptions.
+ explicit RestoreOptions(bool _keep_log_files = false)
+ : keep_log_files(_keep_log_files) {}
+};
+
+using BackupID = uint32_t;
+
+struct BackupInfo {
+  BackupID backup_id = 0;
+  int64_t timestamp = 0;
+  uint64_t size = 0;
+
+  uint32_t number_files = 0;
+  std::string app_metadata;
+  // Default construction yields zeroed scalars and an empty app_metadata.
+  BackupInfo() {}
+
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+             uint32_t _number_files, const std::string& _app_metadata)
+      : backup_id(_backup_id),
+        timestamp(_timestamp),
+        size(_size),
+        number_files(_number_files),
+        app_metadata(_app_metadata) {}
+};
+
+class BackupStatistics {
+ public:
+  // Starts with both counters at zero.
+  BackupStatistics()
+      : number_success_backup(0),
+        number_fail_backup(0) {}
+
+  BackupStatistics(uint32_t _number_success_backup,
+                   uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() = default;
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
+};
+
+// A backup engine for accessing information about backups and restoring from
+// them.
+class BackupEngineReadOnly {
+ public:
+ virtual ~BackupEngineReadOnly() {}
+
+ static Status Open(Env* db_env, const BackupableDBOptions& options,
+ BackupEngineReadOnly** backup_engine_ptr);
+
+ // Returns info about backups in backup_info.
+ // You can call GetBackupInfo safely, even with another BackupEngine
+ // performing backups on the same directory.
+ virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+
+ // Returns the IDs of corrupt backups in corrupt_backup_ids.
+ virtual void GetCorruptedBackups(
+ std::vector<BackupID>* corrupt_backup_ids) = 0;
+
+ // Restoring DB from backup is NOT safe when there is another BackupEngine
+ // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
+ // responsibility to synchronize the operation, i.e. don't delete the backup
+ // when you're restoring from it.
+ // See also the corresponding doc in BackupEngine.
+ virtual Status RestoreDBFromBackup(
+ BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+ // See the corresponding doc in BackupEngine.
+ virtual Status RestoreDBFromLatestBackup(
+ const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+ // Checks that each file exists and that the size of the file matches our
+ // expectations. It does not check file checksums.
+ //
+ // If this BackupEngine created the backup, it compares the files' current
+ // sizes against the number of bytes written to them during creation.
+ // Otherwise, it compares the files' current sizes against their sizes when
+ // the BackupEngine was opened.
+ //
+ // Returns Status::OK() if all checks are good.
+ virtual Status VerifyBackup(BackupID backup_id) = 0;
+};
+
+// A backup engine for creating new backups.
+class BackupEngine {
+ public:
+ virtual ~BackupEngine() {}
+
+ // BackupableDBOptions have to be the same as the ones used in previous
+ // BackupEngines for the same backup directory.
+ static Status Open(Env* db_env, const BackupableDBOptions& options,
+ BackupEngine** backup_engine_ptr);
+
+ // Same as CreateNewBackup, but stores extra application metadata.
+ // Flush will always trigger if 2PC is enabled.
+ // If write-ahead logs are disabled, set flush_before_backup=true to
+ // avoid losing unflushed key/value pairs from the memtable.
+ virtual Status CreateNewBackupWithMetadata(
+ DB* db, const std::string& app_metadata, bool flush_before_backup = false,
+ std::function<void()> progress_callback = []() {}) = 0;
+
+ // Captures the state of the database in the latest backup.
+ // NOT a thread safe call.
+ // Flush will always trigger if 2PC is enabled.
+ // If write-ahead logs are disabled, set flush_before_backup=true to
+ // avoid losing unflushed key/value pairs from the memtable.
+ virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false,
+ std::function<void()> progress_callback =
+ []() {}) {
+ return CreateNewBackupWithMetadata(db, "", flush_before_backup,
+ progress_callback);
+ }
+
+ // Deletes old backups, keeping latest num_backups_to_keep alive.
+ // See also DeleteBackup.
+ virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+
+ // Deletes a specific backup. If this operation (or PurgeOldBackups)
+ // is not completed due to crash, power failure, etc. the state
+ // will be cleaned up the next time you call DeleteBackup,
+ // PurgeOldBackups, or GarbageCollect.
+ virtual Status DeleteBackup(BackupID backup_id) = 0;
+
+ // Call this from another thread if you want to stop the backup
+ // that is currently happening. It will return immediately and will
+ // not wait for the backup to stop.
+ // The backup will stop ASAP and the call to CreateNewBackup will
+ // return Status::Incomplete(). It will not clean up after itself, but
+ // the state will remain consistent. The state will be cleaned up the
+ // next time you call CreateNewBackup or GarbageCollect.
+ virtual void StopBackup() = 0;
+
+ // Returns info about backups in backup_info.
+ virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+
+ // Returns the IDs of corrupt backups in corrupt_backup_ids.
+ virtual void GetCorruptedBackups(
+ std::vector<BackupID>* corrupt_backup_ids) = 0;
+
+ // Restores from the backup with backup_id.
+ // IMPORTANT -- if options_.share_table_files == true,
+ // options_.share_files_with_checksum == false, you restore DB from some
+ // backup that is not the latest, and you start creating new backups from the
+ // new DB, they will probably fail.
+ //
+ // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+ // If you add new data to the DB and try creating a new backup now, the
+ // database will diverge from backups 4 and 5 and the new backup will fail.
+ // If you want to create a new backup, you will first have to delete backups
+ // 4 and 5.
+ virtual Status RestoreDBFromBackup(
+ BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+ // Restores from the latest backup.
+ virtual Status RestoreDBFromLatestBackup(
+ const std::string& db_dir, const std::string& wal_dir,
+ const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+ // Checks that each file exists and that the size of the file matches our
+ // expectations. It does not check file checksums.
+ // Returns Status::OK() if all checks are good.
+ virtual Status VerifyBackup(BackupID backup_id) = 0;
+
+ // Will delete any files left over from incomplete creation or deletion of
+ // a backup. This is not normally needed as those operations also clean up
+ // after prior incomplete calls to the same kind of operation (create or
+ // delete).
+ // NOTE: This is not designed to delete arbitrary files added to the backup
+ // directory outside of BackupEngine, and clean-up is always subject to
+ // permissions on and availability of the underlying filesystem.
+ virtual Status GarbageCollect() = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 000000000..c7f93b4cf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;
+
+class Checkpoint {
+ public:
+ // Creates a Checkpoint object to be used for creating openable snapshots.
+ static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+ // Builds an openable snapshot of RocksDB on the same disk, which
+ // accepts an output directory on the same disk, and under the directory
+ // (1) hard-linked SST files pointing to existing live SST files
+ // SST files will be copied if output directory is on a different filesystem
+ // (2) copied manifest files and other files
+ // The directory should not already exist and will be created by this API.
+ // The directory will be an absolute path.
+ // log_size_for_flush: if the total log file size is equal to or larger than
+ // this value, then a flush is triggered for all the column families. The
+ // default value is 0, which means flush is always triggered. If you move
+ // away from the default, the checkpoint may not contain up-to-date data
+ // if WAL writing is not always enabled.
+ // Flush will always trigger if it is 2PC.
+ virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
+ uint64_t log_size_for_flush = 0);
+
+ // Exports all live SST files of a specified Column Family onto export_dir,
+ // returning SST file information in metadata.
+ // - SST files will be created as hard links when the directory specified
+ // is in the same partition as the db directory, copied otherwise.
+ // - export_dir should not already exist and will be created by this API.
+ // - Always triggers a flush.
+ virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
+ const std::string& export_dir,
+ ExportImportFilesMetaData** metadata);
+
+ virtual ~Checkpoint() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
new file mode 100644
index 000000000..f61afd69e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// This file was moved to "rocksdb/convenience.h".
+
+#include "rocksdb/convenience.h"
diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 000000000..dd83cb24b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+// meant to be removed from the db in a non-strict 'ttl' amount of time
+// Therefore, this guarantees that key-values inserted will remain in the
+// db for >= ttl amount of time and the db will make efforts to remove the
+// key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+// triggered(neither manual nor automatic), so no expired entries removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+// corrupt values(timestamp suffixed) and no ttl effect will be there
+// during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+// whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+ virtual Status CreateColumnFamilyWithTtl(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ ColumnFamilyHandle** handle, int ttl) = 0;
+ // ttl defaults to 0; a non-positive TTL behaves like TTL = infinity.
+ static Status Open(const Options& options, const std::string& dbname,
+ DBWithTTL** dbptr, int32_t ttl = 0,
+ bool read_only = false);
+ // Overload taking a vector of TTLs (presumably one per column family).
+ static Status Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ DBWithTTL** dbptr, std::vector<int32_t> ttls,
+ bool read_only = false);
+
+ virtual void SetTtl(int32_t ttl) = 0;
+
+ virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
+
+ protected:
+ explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h
new file mode 100644
index 000000000..a2b6adcb0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/debug.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Data associated with a particular version of a key. A database may
+// internally store multiple versions of the same user key due to snapshots,
+// compaction not having happened yet, etc.
+struct KeyVersion {
+  KeyVersion() : sequence(0), type(0) {}  // strings start out empty
+
+  KeyVersion(const std::string& _user_key, const std::string& _value,
+             SequenceNumber _sequence, int _type)
+      : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
+
+  std::string user_key;
+  std::string value;
+  SequenceNumber sequence;
+  // TODO(ajkr): we should provide a helper function that converts the int to a
+  // string describing the type for easier debugging.
+  int type;
+};
+
+// Returns a listing of all versions of keys in the provided user key range.
+// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`]; the listing
+// stops early once `max_num_ikeys` has been reached. Since all returned keys
+// are copied to memory, memory usage may be huge if the range covers too many
+// keys; `max_num_ikeys` can be used to cap it.
+// The result is inserted into the provided vector, `key_versions`.
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+ size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions);
+// Overload taking an explicit column family handle, `cfh`.
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+ Slice end_key, size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions);
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/env_librados.h b/src/rocksdb/include/rocksdb/utilities/env_librados.h
new file mode 100644
index 000000000..361217c62
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/env_librados.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/env_mirror.h"
+
+#include <rados/librados.hpp>
+
+namespace ROCKSDB_NAMESPACE {
+class LibradosWritableFile;
+
+// EnvWrapper subclass that declares the Env file and directory operations
+// below and holds a librados client (`_rados`) together with IoCtx handles for
+// a db pool and a WAL pool. The method bodies are not part of this header.
+class EnvLibrados : public EnvWrapper {
+ public:
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ Status NewSequentialFile(const std::string& fname,
+ std::unique_ptr<SequentialFile>* result,
+ const EnvOptions& options) override;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& options) override;
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override;
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override;
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ Status FileExists(const std::string& fname) override;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *results are dropped.
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) override;
+
+ // Delete the named file.
+ Status DeleteFile(const std::string& fname) override;
+
+ // Create the specified directory. Returns error if directory exists.
+ Status CreateDir(const std::string& dirname) override;
+
+ // Creates directory if missing. Return Ok if it exists, or successful in
+ // Creating.
+ Status CreateDirIfMissing(const std::string& dirname) override;
+
+ // Delete the specified directory.
+ Status DeleteDir(const std::string& dirname) override;
+
+ // Store the size of fname in *file_size.
+ Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
+
+ // Store the last modification time of fname in *file_mtime.
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override;
+ // Rename file src to target.
+ Status RenameFile(const std::string& src, const std::string& target) override;
+ // Hard Link file src to target.
+ Status LinkFile(const std::string& src, const std::string& target) override;
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ Status LockFile(const std::string& fname, FileLock** lock) override;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ Status UnlockFile(FileLock* lock) override;
+
+ // Get full directory name for this db.
+ // NOTE(review): not marked `override`; add it if the base class declares a
+ // virtual with this exact signature.
+ Status GetAbsolutePath(const std::string& db_path, std::string* output_path);
+
+ // Generate unique id
+ // NOTE(review): not marked `override`; add it if the base class declares a
+ // virtual with this exact signature.
+ std::string GenerateUniqueId();
+
+ // Get default EnvLibrados
+ static EnvLibrados* Default();
+
+ explicit EnvLibrados(const std::string& db_name,
+ const std::string& config_path,
+ const std::string& db_pool);
+
+ explicit EnvLibrados(
+ const std::string& client_name, // first 3 parameters are
+ // for RADOS client init
+ const std::string& cluster_name, const uint64_t flags,
+ const std::string& db_name, const std::string& config_path,
+ const std::string& db_pool, const std::string& wal_dir,
+ const std::string& wal_pool, const uint64_t write_buffer_size);
+ // Shuts down the RADOS client held by this Env.
+ ~EnvLibrados() { _rados.shutdown(); }
+
+ private:
+ std::string _client_name;
+ std::string _cluster_name;
+ uint64_t _flags;
+ std::string _db_name; // get from user, readable string; Also used as db_id
+ // for db metadata
+ std::string _config_path;
+ librados::Rados _rados; // RADOS client
+ std::string _db_pool_name;
+ librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool
+ std::string _wal_dir; // WAL dir path
+ std::string _wal_pool_name;
+ librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool
+ uint64_t _write_buffer_size; // WritableFile buffer max size
+
+ /* private function to communicate with rados */
+ std::string _CreateFid();
+ Status _GetFid(const std::string& fname, std::string& fid);
+ Status _GetFid(const std::string& fname, std::string& fid, int fid_len);
+ Status _RenameFid(const std::string& old_fname, const std::string& new_fname);
+ Status _AddFid(const std::string& fname, const std::string& fid);
+ Status _DelFid(const std::string& fname);
+ Status _GetSubFnames(const std::string& dirname,
+ std::vector<std::string>* result);
+ librados::IoCtx* _GetIoctx(const std::string& prefix);
+ friend class LibradosWritableFile;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
new file mode 100644
index 000000000..8e96ac410
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
@@ -0,0 +1,180 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// EnvMirror is an Env implementation that mirrors all file-related
+// operations to two backing Env's (provided at construction time).
+// Writes are mirrored. For read operations, we do the read from both
+// backends and assert that the results match.
+//
+// This is useful when implementing a new Env and ensuring that the
+// semantics and behavior are correct (in that they match that of an
+// existing, stable Env, like the default POSIX one).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SequentialFileMirror;
+class RandomAccessFileMirror;
+class WritableFileMirror;
+
+// Env that issues each file operation below against two backing Envs, `a_`
+// and `b_`, asserts that both report the same Status (and, where applicable,
+// the same result), and hands `a_`'s result back to the caller.
+//
+// The comparisons are plain assert()s, so they are compiled out when NDEBUG
+// is defined; in that configuration a mismatch between the backends goes
+// unnoticed and `a_`'s answer is returned as-is.
+class EnvMirror : public EnvWrapper {
+ Env *a_, *b_;
+ bool free_a_, free_b_;
+
+ public:
+ // `free_a` / `free_b` select whether the destructor deletes `a` / `b`.
+ EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false)
+ : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {}
+ ~EnvMirror() {
+ if (free_a_) delete a_;
+ if (free_b_) delete b_;
+ }
+
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) override;
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) override;
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+ // Only `a_`'s Directory is returned; `b_`'s is opened for the status
+ // comparison and dropped when `br` goes out of scope.
+ virtual Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ std::unique_ptr<Directory> br;
+ Status as = a_->NewDirectory(name, result);
+ Status bs = b_->NewDirectory(name, &br);
+ assert(as == bs);
+ return as;
+ }
+ Status FileExists(const std::string& f) override {
+ Status as = a_->FileExists(f);
+ Status bs = b_->FileExists(f);
+ assert(as == bs);
+ return as;
+ }
+#if defined(_MSC_VER)
+#pragma warning(push)
+// logical operation on address of string constant
+#pragma warning(disable : 4130)
+#endif
+ // Listings are sorted before being compared, so the two backends may return
+ // their children in different orders; `*r` receives `a_`'s sorted listing.
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* r) override {
+ std::vector<std::string> ar, br;
+ Status as = a_->GetChildren(dir, &ar);
+ Status bs = b_->GetChildren(dir, &br);
+ assert(as == bs);
+ std::sort(ar.begin(), ar.end());
+ std::sort(br.begin(), br.end());
+ // Compare the listings only when the call succeeded: an identical failure
+ // on both backends is a matching result (as in the other methods here),
+ // not a mismatch.
+ if (as.ok() && ar != br) {
+ assert(0 == "getchildren results don't match");
+ }
+ *r = ar;
+ return as;
+ }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+ Status DeleteFile(const std::string& f) override {
+ Status as = a_->DeleteFile(f);
+ Status bs = b_->DeleteFile(f);
+ assert(as == bs);
+ return as;
+ }
+ Status CreateDir(const std::string& d) override {
+ Status as = a_->CreateDir(d);
+ Status bs = b_->CreateDir(d);
+ assert(as == bs);
+ return as;
+ }
+ Status CreateDirIfMissing(const std::string& d) override {
+ Status as = a_->CreateDirIfMissing(d);
+ Status bs = b_->CreateDirIfMissing(d);
+ assert(as == bs);
+ return as;
+ }
+ Status DeleteDir(const std::string& d) override {
+ Status as = a_->DeleteDir(d);
+ Status bs = b_->DeleteDir(d);
+ assert(as == bs);
+ return as;
+ }
+ Status GetFileSize(const std::string& f, uint64_t* s) override {
+ // Zero-initialized so that `*s` never receives an indeterminate value when
+ // a backend fails without writing its out-parameter.
+ uint64_t asize = 0, bsize = 0;
+ Status as = a_->GetFileSize(f, &asize);
+ Status bs = b_->GetFileSize(f, &bsize);
+ assert(as == bs);
+ assert(!as.ok() || asize == bsize);
+ *s = asize;
+ return as;
+ }
+
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override {
+ // Zero-initialized for the same reason as in GetFileSize().
+ uint64_t amtime = 0, bmtime = 0;
+ Status as = a_->GetFileModificationTime(fname, &amtime);
+ Status bs = b_->GetFileModificationTime(fname, &bmtime);
+ assert(as == bs);
+ // Unsigned arithmetic: the subtraction in the "wrong" direction wraps to a
+ // huge value, so the disjunction holds exactly when the absolute
+ // difference between the two timestamps is below 10000.
+ assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000);
+ *file_mtime = amtime;
+ return as;
+ }
+
+ Status RenameFile(const std::string& s, const std::string& t) override {
+ Status as = a_->RenameFile(s, t);
+ Status bs = b_->RenameFile(s, t);
+ assert(as == bs);
+ return as;
+ }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ Status as = a_->LinkFile(s, t);
+ Status bs = b_->LinkFile(s, t);
+ assert(as == bs);
+ return as;
+ }
+
+ // Pairs the locks obtained from the two backends so that UnlockFile() can
+ // release both.
+ class FileLockMirror : public FileLock {
+ public:
+ FileLock *a_, *b_;
+ FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {}
+ };
+
+ // On success stores a heap-allocated FileLockMirror in `*l`; on failure
+ // stores nullptr, so the caller never sees a stale or indeterminate pointer.
+ Status LockFile(const std::string& f, FileLock** l) override {
+ FileLock* al = nullptr;
+ FileLock* bl = nullptr;
+ Status as = a_->LockFile(f, &al);
+ Status bs = b_->LockFile(f, &bl);
+ assert(as == bs);
+ if (as.ok()) {
+ *l = new FileLockMirror(al, bl);
+ } else {
+ *l = nullptr;
+ }
+ return as;
+ }
+
+ // `l` must be a FileLockMirror produced by LockFile() above; it is deleted
+ // here after both underlying locks have been released.
+ Status UnlockFile(FileLock* l) override {
+ FileLockMirror* ml = static_cast<FileLockMirror*>(l);
+ Status as = a_->UnlockFile(ml->a_);
+ Status bs = b_->UnlockFile(ml->b_);
+ assert(as == bs);
+ delete ml;
+ return as;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
new file mode 100644
index 000000000..824f8a3df
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lists the information (info) logs of a database, given the `db` pointer.
+// The result is delivered through `info_log_list`; the function returns a
+// Status.
+// NOTE(review): whether the entries are bare file names or full paths is not
+// visible from this declaration -- confirm against the implementation.
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
new file mode 100644
index 000000000..94548b538
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
@@ -0,0 +1,277 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/ldb_cmd_execute_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Abstract base class for an ldb command. It keeps the parsed command-line
+// options and flags, the Options / column families / LDBOptions supplied by
+// the caller, the DB handles, and the execution result. Concrete commands
+// implement DoCommand().
+class LDBCommand {
+ public:
+ // Command-line arguments
+ static const std::string ARG_ENV_URI;
+ static const std::string ARG_DB;
+ static const std::string ARG_PATH;
+ static const std::string ARG_SECONDARY_PATH;
+ static const std::string ARG_HEX;
+ static const std::string ARG_KEY_HEX;
+ static const std::string ARG_VALUE_HEX;
+ static const std::string ARG_CF_NAME;
+ static const std::string ARG_TTL;
+ static const std::string ARG_TTL_START;
+ static const std::string ARG_TTL_END;
+ static const std::string ARG_TIMESTAMP;
+ static const std::string ARG_TRY_LOAD_OPTIONS;
+ static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
+ static const std::string ARG_FROM;
+ static const std::string ARG_TO;
+ static const std::string ARG_MAX_KEYS;
+ static const std::string ARG_BLOOM_BITS;
+ static const std::string ARG_FIX_PREFIX_LEN;
+ static const std::string ARG_COMPRESSION_TYPE;
+ static const std::string ARG_COMPRESSION_MAX_DICT_BYTES;
+ static const std::string ARG_BLOCK_SIZE;
+ static const std::string ARG_AUTO_COMPACTION;
+ static const std::string ARG_DB_WRITE_BUFFER_SIZE;
+ static const std::string ARG_WRITE_BUFFER_SIZE;
+ static const std::string ARG_FILE_SIZE;
+ static const std::string ARG_CREATE_IF_MISSING;
+ static const std::string ARG_NO_VALUE;
+
+ // A command line split into the command name, its parameters, the
+ // option map, and the flags.
+ struct ParsedParams {
+ std::string cmd;
+ std::vector<std::string> cmd_params;
+ std::map<std::string, std::string> option_map;
+ std::vector<std::string> flags;
+ };
+
+ // Default `selector` used by InitFromCmdLineArgs() below.
+ static LDBCommand* SelectCommand(const ParsedParams& parsed_parms);
+
+ // NOTE(review): presumably parses `args`, uses `selector` to pick the
+ // command and returns it (or nullptr on failure); ownership of the returned
+ // raw pointer is not visible from this header -- confirm.
+ static LDBCommand* InitFromCmdLineArgs(
+ const std::vector<std::string>& args, const Options& options,
+ const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families,
+ const std::function<LDBCommand*(const ParsedParams&)>& selector =
+ SelectCommand);
+
+ // Same as above, but takes the arguments in argc/argv form.
+ static LDBCommand* InitFromCmdLineArgs(
+ int argc, char** argv, const Options& options,
+ const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families);
+
+ bool ValidateCmdLineOptions();
+
+ virtual Options PrepareOptionsForOpenDB();
+
+ virtual void SetDBOptions(Options options) { options_ = options; }
+
+ // Copies `*column_families` into this command; a null pointer clears the
+ // stored list.
+ virtual void SetColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>* column_families) {
+ if (column_families != nullptr) {
+ column_families_ = *column_families;
+ } else {
+ column_families_.clear();
+ }
+ }
+
+ void SetLDBOptions(const LDBOptions& ldb_options) {
+ ldb_options_ = ldb_options;
+ }
+
+ const std::map<std::string, std::string>& TEST_GetOptionMap() {
+ return option_map_;
+ }
+
+ const std::vector<std::string>& TEST_GetFlags() { return flags_; }
+
+ // Returns false by default; subclasses may override.
+ virtual bool NoDBOpen() { return false; }
+
+ // Calls CloseDB() on destruction.
+ virtual ~LDBCommand() { CloseDB(); }
+
+ /* Run the command, and return the execute result. */
+ void Run();
+
+ // The command-specific work; implemented by each concrete command.
+ virtual void DoCommand() = 0;
+
+ // Returns a copy of the current execution result.
+ LDBCommandExecuteResult GetExecuteState() { return exec_state_; }
+
+ // Resets the execution result via LDBCommandExecuteResult::Reset().
+ void ClearPreviousRunState() { exec_state_.Reset(); }
+
+ // Consider using Slice::DecodeHex directly instead if you don't need the
+ // 0x prefix
+ static std::string HexToString(const std::string& str);
+
+ // Consider using Slice::ToString(true) directly instead if
+ // you don't need the 0x prefix
+ static std::string StringToHex(const std::string& str);
+
+ static const char* DELIM;
+
+ protected:
+ LDBCommandExecuteResult exec_state_;
+ std::string env_uri_;
+ std::string db_path_;
+ // If empty, open DB as primary. If non-empty, open the DB as secondary
+ // with this secondary path. When running against a database opened by
+ // another process, ldb will leave the source directory completely intact.
+ std::string secondary_path_;
+ std::string column_family_name_;
+ DB* db_;
+ DBWithTTL* db_ttl_;
+ std::map<std::string, ColumnFamilyHandle*> cf_handles_;
+
+ /**
+ * true implies that this command can work if the db is opened in read-only
+ * mode.
+ */
+ bool is_read_only_;
+
+ /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+ bool is_key_hex_;
+
+ /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+ bool is_value_hex_;
+
+ /** If true, the value is treated as timestamp suffixed */
+ bool is_db_ttl_;
+
+ // If true, the kvs are output with their insert/modify timestamp in a ttl db
+ bool timestamp_;
+
+ // If true, try to construct options from DB's option files.
+ bool try_load_options_;
+
+ bool ignore_unknown_options_;
+
+ bool create_if_missing_;
+
+ /**
+ * Map of options passed on the command-line.
+ */
+ const std::map<std::string, std::string> option_map_;
+
+ /**
+ * Flags passed on the command-line.
+ */
+ const std::vector<std::string> flags_;
+
+ /** List of command-line options valid for this command */
+ const std::vector<std::string> valid_cmd_line_options_;
+
+ /** Shared pointer to underlying environment if applicable **/
+ std::shared_ptr<Env> env_guard_;
+
+ bool ParseKeyValue(const std::string& line, std::string* key,
+ std::string* value, bool is_key_hex, bool is_value_hex);
+
+ // Protected: instances are created through subclasses.
+ LDBCommand(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags, bool is_read_only,
+ const std::vector<std::string>& valid_cmd_line_options);
+
+ void OpenDB();
+
+ void CloseDB();
+
+ ColumnFamilyHandle* GetCfHandle();
+
+ static std::string PrintKeyValue(const std::string& key,
+ const std::string& value, bool is_key_hex,
+ bool is_value_hex);
+
+ static std::string PrintKeyValue(const std::string& key,
+ const std::string& value, bool is_hex);
+
+ /**
+ * Return true if the specified flag is present in the specified flags vector
+ */
+ static bool IsFlagPresent(const std::vector<std::string>& flags,
+ const std::string& flag) {
+ return (std::find(flags.begin(), flags.end(), flag) != flags.end());
+ }
+
+ static std::string HelpRangeCmdArgs();
+
+ /**
+ * A helper function that returns a list of command line options
+ * used by this command. It includes the common options and the ones
+ * passed in.
+ */
+ static std::vector<std::string> BuildCmdLineOptions(
+ std::vector<std::string> options);
+
+ bool ParseIntOption(const std::map<std::string, std::string>& options,
+ const std::string& option, int& value,
+ LDBCommandExecuteResult& exec_state);
+
+ bool ParseStringOption(const std::map<std::string, std::string>& options,
+ const std::string& option, std::string* value);
+
+ /**
+ * Returns the value of the specified option as a boolean.
+ * default_val is used if the option is not found in options.
+ * Throws an exception if the value of the option is not
+ * "true" or "false" (case insensitive).
+ */
+ bool ParseBooleanOption(const std::map<std::string, std::string>& options,
+ const std::string& option, bool default_val);
+
+ Options options_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ LDBOptions ldb_options_;
+
+ private:
+ /**
+ * Interpret command line options and flags to determine if the key
+ * should be input/output in hex.
+ */
+ bool IsKeyHex(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ /**
+ * Interpret command line options and flags to determine if the value
+ * should be input/output in hex.
+ */
+ bool IsValueHex(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ /**
+ * Converts val to a boolean.
+ * val must be either true or false (case insensitive).
+ * Otherwise an exception is thrown.
+ */
+ bool StringToBool(std::string val);
+};
+
+// Static entry points for driving ldb from a command line.
+class LDBCommandRunner {
+ public:
+ // NOTE(review): presumably prints usage text for the program named
+ // `exec_name` -- the implementation is not visible from this header.
+ static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name);
+
+ // Returns the status code to return. 0 is no error.
+ static int RunCommand(
+ int argc, char** argv, Options options, const LDBOptions& ldb_options,
+ const std::vector<ColumnFamilyDescriptor>* column_families);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
new file mode 100644
index 000000000..c837b47f7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifdef FAILED
+#undef FAILED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Outcome of running an ldb command: a state (not started / succeeded /
+// failed) plus an optional human-readable message.
+class LDBCommandExecuteResult {
+ public:
+ enum State {
+ EXEC_NOT_STARTED = 0,
+ EXEC_SUCCEED = 1,
+ EXEC_FAILED = 2,
+ };
+
+ // Starts in EXEC_NOT_STARTED with an empty message.
+ LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {}
+
+ // Takes the message by const reference so that temporaries and const
+ // strings are accepted in addition to the non-const lvalues allowed before.
+ LDBCommandExecuteResult(State state, const std::string& msg)
+ : state_(state), message_(msg) {}
+
+ // Returns the message, prefixed with "Failed: " or "Not started: " for the
+ // corresponding states; a successful result yields the bare message.
+ std::string ToString() const {
+ std::string ret;
+ switch (state_) {
+ case EXEC_SUCCEED:
+ break;
+ case EXEC_FAILED:
+ ret.append("Failed: ");
+ break;
+ case EXEC_NOT_STARTED:
+ ret.append("Not started: ");
+ break;
+ }
+ if (!message_.empty()) {
+ ret.append(message_);
+ }
+ return ret;
+ }
+
+ // Returns the object to its default-constructed state.
+ void Reset() {
+ state_ = EXEC_NOT_STARTED;
+ message_ = "";
+ }
+
+ // State queries; const so they can be called on const results.
+ bool IsSucceed() const { return state_ == EXEC_SUCCEED; }
+
+ bool IsNotStarted() const { return state_ == EXEC_NOT_STARTED; }
+
+ bool IsFailed() const { return state_ == EXEC_FAILED; }
+
+ // Builds an EXEC_SUCCEED result carrying `msg`.
+ static LDBCommandExecuteResult Succeed(std::string msg) {
+ return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
+ }
+
+ // Builds an EXEC_FAILED result carrying `msg`.
+ static LDBCommandExecuteResult Failed(std::string msg) {
+ return LDBCommandExecuteResult(EXEC_FAILED, msg);
+ }
+
+ private:
+ State state_;
+ std::string message_;
+
+ // Comparison is intentionally unsupported. These were previously declared
+ // private and left undefined; deleting them states the intent directly and
+ // turns any use into a compile-time error instead of a link-time one.
+ bool operator==(const LDBCommandExecuteResult&) = delete;
+ bool operator!=(const LDBCommandExecuteResult&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
new file mode 100644
index 000000000..e9fef9609
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+struct Options;
+class Snapshot;
+
+enum CompressionType : unsigned char;
+
+// Options to control the behavior of a database (passed to
+// DB::Open). A LevelDBOptions object can be initialized as though
+// it were a LevelDB Options object, and then it can be converted into
+// a RocksDB Options object.
+//
+// NOTE(review): several fields are raw pointers (comparator, env, info_log,
+// block_cache, filter_policy); who owns the pointed-to objects is not visible
+// from this header -- confirm before relying on any lifetime assumption.
+struct LevelDBOptions {
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator;
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists;
+
+ // If true, the implementation will do aggressive checking of the
+ // data it is processing and will stop early if it detects any
+ // errors. This may have unforeseen ramifications: for example, a
+ // corruption of one DB entry may cause a large number of entries to
+ // become unreadable or for the entire DB to become unopenable.
+ // Default: false
+ bool paranoid_checks;
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc.
+ // Default: Env::Default()
+ Env* env;
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-NULL, or to a file stored
+ // in the same directory as the DB contents if info_log is NULL.
+ // Default: NULL
+ Logger* info_log;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
+ //
+ // Larger values increase performance, especially during bulk loads.
+ // Up to two write buffers may be held in memory at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
+ // Also, a larger write buffer will result in a longer recovery time
+ // the next time the database is opened.
+ //
+ // Default: 4MB
+ size_t write_buffer_size;
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set (budget
+ // one open file per 2MB of working set).
+ //
+ // Default: 1000
+ int max_open_files;
+
+ // Control over blocks (user data is stored in a set of blocks, and
+ // a block is the unit of reading from disk).
+
+ // If non-NULL, use the specified cache for blocks.
+ // If NULL, leveldb will automatically create and use an 8MB internal cache.
+ // Default: NULL
+ Cache* block_cache;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ //
+ // Default: 4K
+ size_t block_size;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone.
+ //
+ // Default: 16
+ int block_restart_interval;
+
+ // Compress blocks using the specified compression algorithm. This
+ // parameter can be changed dynamically.
+ //
+ // Default: kSnappyCompression, which gives lightweight but fast
+ // compression.
+ //
+ // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kSnappyCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ CompressionType compression;
+
+ // If non-NULL, use the specified filter policy to reduce disk reads.
+ // Many applications will benefit from passing the result of
+ // NewBloomFilterPolicy() here.
+ //
+ // Default: NULL
+ const FilterPolicy* filter_policy;
+
+ // Create a LevelDBOptions object with default values for all fields.
+ // The defaults are the ones listed per field above; the constructor is
+ // defined outside this header.
+ LevelDBOptions();
+};
+
+// Converts a LevelDBOptions object into a RocksDB Options object. The input
+// is taken by const reference and the resulting Options is returned by value.
+Options ConvertOptions(const LevelDBOptions& leveldb_options);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
new file mode 100644
index 000000000..f617da02b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifdef LUA
+
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+// Interface for defining a custom C library that is callable from a Lua
+// script.
+class RocksLuaCustomLibrary {
+ public:
+ virtual ~RocksLuaCustomLibrary() = default;
+
+ // Name of the C library. The same name is used for the Lua table
+ // (namespace) that contains the library.
+ virtual const char* Name() const = 0;
+
+ // Returns a "static const struct luaL_Reg[]" listing the library's C
+ // functions. As required by Lua, the last entry of that static array must
+ // be {nullptr, nullptr}.
+ //
+ // More details about how to implement Lua C libraries can be found
+ // in the official Lua document http://www.lua.org/pil/26.2.html
+ virtual const struct luaL_Reg* Lib() const = 0;
+
+ // Hook called right after the library has been created and pushed on the
+ // top of the lua_State, so that implementations can put additional tables
+ // or constant values inside the same table / namespace. Does nothing by
+ // default.
+ virtual void CustomSetup(lua_State* /*L*/) const {}
+};
+} // namespace lua
+} // namespace ROCKSDB_NAMESPACE
+#endif // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
new file mode 100644
index 000000000..3427b65ef
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifdef LUA
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+// Owns a lua_State: loads libraries, runs the script, closes it on destruction.
+class LuaStateWrapper {
+ public:
+  explicit LuaStateWrapper(const std::string& lua_script)
+      : LuaStateWrapper(lua_script, {}) {}
+  LuaStateWrapper(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries)
+      : lua_state_(luaL_newstate()) {
+    Init(lua_script, libraries);
+  }
+  // Non-copyable: a copy would lua_close() the same state a second time.
+  LuaStateWrapper(const LuaStateWrapper&) = delete;
+  LuaStateWrapper& operator=(const LuaStateWrapper&) = delete;
+  ~LuaStateWrapper() { if (lua_state_ != nullptr) lua_close(lua_state_); }
+  lua_State* GetLuaState() const { return lua_state_; }  // may be null
+
+ private:
+  void Init(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+    if (lua_state_ == nullptr) return;  // luaL_newstate() could not allocate
+    luaL_openlibs(lua_state_);
+    for (const auto& library : libraries) {
+      luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
+      library->CustomSetup(lua_state_);
+    }
+    luaL_dostring(lua_state_, lua_script.c_str());  // result is not checked
+  }
+  lua_State* lua_state_;
+};
+} // namespace lua
+} // namespace ROCKSDB_NAMESPACE
+#endif // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h
new file mode 100644
index 000000000..4f1606b51
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility for querying the approximate memory usage of DB instances and caches.
+class MemoryUtil {
+ public:
+ enum UsageType : int {
+ // Memory usage of all the mem-tables.
+ kMemTableTotal = 0,
+ // Memory usage of those un-flushed mem-tables.
+ kMemTableUnFlushed = 1,
+ // Memory usage of all the table readers.
+ kTableReadersTotal = 2,
+ // Memory usage by Cache.
+ kCacheTotal = 3,
+ kNumUsageTypes = 4  // Number of usage types above; not a usage type itself.
+ };
+
+ // Computes the approximate memory usage, broken down by UsageType, of the
+ // input DBs and the input set of caches. The result is written to the output
+ // map usage_by_type; e.g. usage_by_type[kMemTableTotal] holds the memory
+ // usage of all the mem-tables from all the input rocksdb instances.
+ //
+ // Note that for memory usage inside the Cache class, only the usage of
+ // the input "cache_set" is reported; Cache usage inside the input list
+ // "dbs" of DBs is not included.
+ // NOTE(review): cache_set is taken by value, so the set is copied per call.
+ static Status GetApproximateMemoryUsageByType(
+ const std::vector<DB*>& dbs,
+ const std::unordered_set<const Cache*> cache_set,
+ std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h
new file mode 100644
index 000000000..74a49d400
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h
@@ -0,0 +1,205 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <memory>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+// Factory callback returning a T* for a target string. Fills the unique_ptr if
+// the caller is granted ownership; the std::string* receives a message (errmsg).
+template <typename T>
+using FactoryFunc =
+ std::function<T*(const std::string&, std::unique_ptr<T>*, std::string*)>;
+
+// Holds the factory entries that were registered for each type name.
+class ObjectLibrary {
+ public:
+  // Base class for an Entry in the Registry.
+  class Entry {
+   public:
+    Entry(const std::string& name) : name_(name) {}
+    virtual ~Entry() = default;
+    // Returns true when the target is exactly this entry's name.
+    virtual bool matches(const std::string& target) const {
+      return name_ == target;
+    }
+    const std::string& Name() const { return name_; }
+
+   private:
+    const std::string name_;  // The name of the Entry
+  };  // End class Entry
+
+  // An Entry with a FactoryFunc; its name is a regex matched against targets.
+  template <typename T>
+  class FactoryEntry : public Entry {
+   public:
+    FactoryEntry(const std::string& name, FactoryFunc<T> f)
+        : Entry(name), pattern_(name), factory_(std::move(f)) {}
+    ~FactoryEntry() override = default;
+    bool matches(const std::string& target) const override {
+      return std::regex_match(target, pattern_);
+    }
+    // Invokes the factory to create a new T object for the target.
+    T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard,
+                        std::string* msg) const {
+      return factory_(target, guard, msg);
+    }
+
+   private:
+    std::regex pattern_;  // The pattern for this entry
+    FactoryFunc<T> factory_;
+  };  // End class FactoryEntry
+
+  // Finds the entry matching the input name and type
+  const Entry* FindEntry(const std::string& type,
+                         const std::string& name) const;
+  void Dump(Logger* logger) const;
+
+  // Wraps the factory in a FactoryEntry for the pattern and adds it under
+  // T::Type(). Returns the factory that was passed in.
+  template <typename T>
+  const FactoryFunc<T>& Register(const std::string& pattern,
+                                 const FactoryFunc<T>& factory) {
+    std::unique_ptr<Entry> new_entry(new FactoryEntry<T>(pattern, factory));
+    AddEntry(T::Type(), new_entry);
+    return factory;
+  }
+  // Returns the default ObjectLibrary
+  static std::shared_ptr<ObjectLibrary>& Default();
+
+ private:
+  // Adds the input entry to the list for the given type
+  void AddEntry(const std::string& type, std::unique_ptr<Entry>& entry);
+
+  // ** FactoryFunctions for this loader, organized by type
+  std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>> entries_;
+};
+
+// The ObjectRegistry creates objects at run-time from a name/pattern, for
+// cases where the specific implementation of the object may not be known in
+// advance. Factories are looked up in the libraries added to the registry.
+class ObjectRegistry {
+ public:
+  static std::shared_ptr<ObjectRegistry> NewInstance();
+
+  ObjectRegistry();
+
+  void AddLibrary(const std::shared_ptr<ObjectLibrary>& library) {
+    libraries_.emplace_back(library);
+  }
+
+  // Creates a new T using the factory function that was registered with a
+  // pattern that matches the provided "target" string according to
+  // std::regex_match.
+  //
+  // If no registered functions match, sets *errmsg and returns nullptr. If
+  // multiple functions match, the factory function used is unspecified.
+  //
+  // Populates guard with the result pointer if caller is granted ownership.
+  template <typename T>
+  T* NewObject(const std::string& target, std::unique_ptr<T>* guard,
+               std::string* errmsg) {
+    guard->reset();
+    const auto* entry = FindEntry(T::Type(), target);
+    if (entry == nullptr) {
+      *errmsg = std::string("Could not load ") + T::Type();
+      return nullptr;
+    }
+    // Entries found under T::Type() are treated as FactoryEntry<T> instances.
+    const auto* factory_entry =
+        static_cast<const ObjectLibrary::FactoryEntry<T>*>(entry);
+    return factory_entry->NewFactoryObject(target, guard, errmsg);
+  }
+
+  // Creates a new unique T using the registered factory functions.
+  // Returns OK if a new unique T was successfully created
+  // Returns NotFound if the type/target could not be created
+  // Returns InvalidArgument if the factory returns an unguarded object
+  // (meaning it cannot be managed by a unique ptr)
+  template <typename T>
+  Status NewUniqueObject(const std::string& target,
+                         std::unique_ptr<T>* result) {
+    std::string errmsg;
+    T* created = NewObject(target, result, &errmsg);
+    if (created == nullptr) {
+      return Status::NotFound(errmsg, target);
+    }
+    if (*result) {
+      return Status::OK();
+    }
+    return Status::InvalidArgument(std::string("Cannot make a unique ") +
+                                       T::Type() + " from unguarded one ",
+                                   target);
+  }
+
+  // Creates a new shared T using the registered factory functions.
+  // Returns OK if a new shared T was successfully created
+  // Returns NotFound if the type/target could not be created
+  // Returns InvalidArgument if the factory returns an unguarded object
+  // (meaning it cannot be managed by a shared ptr)
+  template <typename T>
+  Status NewSharedObject(const std::string& target,
+                         std::shared_ptr<T>* result) {
+    std::string errmsg;
+    std::unique_ptr<T> guard;
+    T* created = NewObject(target, &guard, &errmsg);
+    if (created == nullptr) {
+      return Status::NotFound(errmsg, target);
+    }
+    if (guard) {
+      result->reset(guard.release());
+      return Status::OK();
+    }
+    return Status::InvalidArgument(std::string("Cannot make a shared ") +
+                                       T::Type() + " from unguarded one ",
+                                   target);
+  }
+
+  // Creates a new static T using the registered factory functions.
+  // Returns OK if a new static T was successfully created
+  // Returns NotFound if the type/target could not be created
+  // Returns InvalidArgument if the factory returns a guarded object
+  // (meaning it is managed by a unique ptr)
+  template <typename T>
+  Status NewStaticObject(const std::string& target, T** result) {
+    std::string errmsg;
+    std::unique_ptr<T> guard;
+    T* created = NewObject(target, &guard, &errmsg);
+    if (created == nullptr) {
+      return Status::NotFound(errmsg, target);
+    }
+    if (guard.get()) {
+      return Status::InvalidArgument(std::string("Cannot make a static ") +
+                                         T::Type() + " from a guarded one ",
+                                     target);
+    }
+    *result = created;
+    return Status::OK();
+  }
+
+  // Dump the contents of the registry to the logger
+  void Dump(Logger* logger) const;
+
+ private:
+  const ObjectLibrary::Entry* FindEntry(const std::string& type,
+                                        const std::string& name) const;
+
+  // The set of libraries to search for factories for this registry.
+  // The libraries are searched in reverse order (back to front) when
+  // searching for entries.
+  std::vector<std::shared_ptr<ObjectLibrary>> libraries_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
new file mode 100644
index 000000000..5356df71f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Transaction;
+
+// Database with Transaction support.
+//
+// See optimistic_transaction.h and examples/transaction_example.cc
+
+// Options to use when starting an optimistic transaction.
+struct OptimisticTransactionOptions {
+ // Setting set_snapshot=true is the same as calling SetSnapshot().
+ bool set_snapshot = false;
+
+ // Set this if the DB uses a non-default comparator (default: bytewise).
+ // See the comment on the WriteBatchWithIndex constructor.
+ const Comparator* cmp = BytewiseComparator();
+};
+
+enum class OccValidationPolicy {
+ // Validate serially at the commit stage, AFTER entering the write-group.
+ // Isolation validation is processed single-threaded (since it happens in
+ // the write-group).
+ // May suffer from high mutex contention, as per this link:
+ // https://github.com/facebook/rocksdb/issues/4402
+ kValidateSerial = 0,
+ // Validate in parallel before the commit stage, BEFORE entering the
+ // write-group, to reduce mutex contention. Each txn acquires locks for its
+ // write-set records in some well-defined order.
+ kValidateParallel = 1
+};
+
+struct OptimisticTransactionDBOptions {
+ OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel;
+
+ // Used only if validate_policy == OccValidationPolicy::kValidateParallel.
+ uint32_t occ_lock_buckets = (1 << 20);  // default: 2^20
+};
+
+// StackableDB that adds optimistic transactions; create one with Open().
+class OptimisticTransactionDB : public StackableDB {
+ public:
+  // Open an OptimisticTransactionDB similar to DB::Open().
+  static Status Open(const Options& options, const std::string& dbname,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const OptimisticTransactionDBOptions& occ_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  virtual ~OptimisticTransactionDB() = default;
+
+  // Starts a new Transaction.
+  //
+  // The caller owns the returned transaction and must delete it when it is
+  // no longer needed.
+  //
+  // If old_txn is not null, BeginTransaction reuses that Transaction handle
+  // instead of allocating a new one, to avoid repeated allocations.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const OptimisticTransactionOptions& txn_options =
+          OptimisticTransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  OptimisticTransactionDB(const OptimisticTransactionDB&) = delete;
+  void operator=(const OptimisticTransactionDB&) = delete;
+
+ protected:
+  // To create an OptimisticTransactionDB, call Open()
+  explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
new file mode 100644
index 000000000..cb1d0d117
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Tries to migrate a DB created with old_opts so it can be used with new_opts.
+// Multiple column families are not supported.
+// This is best-effort: there is no guarantee that it succeeds.
+// A full compaction may be executed.
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+ const Options& new_opts);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h
new file mode 100644
index 000000000..1a29464a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_util.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file contains utility functions for RocksDB Options.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Constructs the DBOptions and ColumnFamilyDescriptors by loading the
+// latest RocksDB options file stored in the specified rocksdb database.
+//
+// Note that all the pointer options (except table_factory, which will
+// be described in more detail below) will be initialized with the default
+// values. Developers can further initialize them after this function call.
+// Below is an example list of pointer options which will be initialized
+//
+// * env
+// * memtable_factory
+// * compaction_filter_factory
+// * prefix_extractor
+// * comparator
+// * merge_operator
+// * compaction_filter
+//
+// User can also choose to load customized comparator, env, and/or
+// merge_operator through object registry:
+// * comparator needs to be registered through Registrar<const Comparator>
+// * env needs to be registered through Registrar<Env>
+// * merge operator needs to be registered through
+// Registrar<std::shared_ptr<MergeOperator>>.
+//
+// For table_factory, this function further supports deserializing
+// BlockBasedTableFactory and its BlockBasedTableOptions except the
+// pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+// block_cache, and block_cache_compressed), which will be initialized with
+// default values. Developers can further specify these three options by
+// casting the return value of TableFactory::GetOptions() to
+// BlockBasedTableOptions and making necessary changes.
+//
+// ignore_unknown_options can be set to true if you want to ignore options
+// that are from a newer version of the db, essentially for forward
+// compatibility.
+//
+// examples/options_file_example.cc demonstrates how to use this function
+// to open a RocksDB instance.
+//
+// @return the function returns an OK status when it went successfully. If
+// the specified "dbpath" does not contain any option file, then a
+// Status::NotFound will be returned. A return value other than
+// Status::OK or Status::NotFound indicates an error related
+// to the options file itself.
+//
+// @see LoadOptionsFromFile
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options = false,
+ std::shared_ptr<Cache>* cache = {});
+
+// Like LoadLatestOptions, but constructs the DBOptions and
+// ColumnFamilyDescriptors from the specified RocksDB options file.
+//
+// @see LoadLatestOptions
+Status LoadOptionsFromFile(const std::string& options_file_name, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options = false,
+ std::shared_ptr<Cache>* cache = {});
+
+// Stores in *options_file_name the latest options file name under dbpath.
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+ std::string* options_file_name);
+
+// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors
+// are compatible with the latest options stored in the specified DB path.
+//
+// If the returned status is not OK, the specified RocksDB instance might
+// not open correctly with the input set of options. Currently, changing
+// any one of the following options will fail the compatibility check:
+//
+// * comparator
+// * prefix_extractor
+// * table_factory
+// * merge_operator
+Status CheckOptionsCompatibility(
+ const std::string& dbpath, Env* env, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& cf_descs,
+ bool ignore_unknown_options = false);
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
new file mode 100644
index 000000000..ba6f1d748
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCache;
+
+// For instrumentation purposes, use the NewSimCache API instead of NewLRUCache.
+// NewSimCache is a wrapper function returning a SimCache instance, which
+// provides the additional interface of the SimCache class on top of the Cache
+// interface to predict block cache hit rate without actually allocating the
+// memory. It can help users tune their current block cache size and determine
+// how efficiently they are using the memory.
+//
+// Since GetSimCapacity() returns the capacity for simulation, it differs from
+// actual memory usage, which can be estimated as:
+// sim_capacity * entry_size / (entry_size + block_size),
+// where 76 <= entry_size <= 104 and
+// BlockBasedTableOptions.block_size = 4096 by default but is configurable.
+// Therefore, generally the actual memory overhead of SimCache is less than
+// sim_capacity * 2%.
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+ size_t sim_capacity,
+ int num_shard_bits);
+
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+ std::shared_ptr<Cache> cache,
+ int num_shard_bits);
+
+// Cache interface extended with simulation statistics and activity logging.
+class SimCache : public Cache {
+ public:
+  SimCache() = default;
+  ~SimCache() override = default;
+
+  const char* Name() const override { return "SimCache"; }
+
+  // Returns the maximum configured capacity of the simcache for simulation.
+  virtual size_t GetSimCapacity() const = 0;
+
+  // Returns the memory size of the entries residing in the simcache.
+  // The simcache does not hand out internal handle references to users, so
+  // PinnedUsage is always 0 and the behavior is not exactly consistent with
+  // that of the real cache.
+  virtual size_t GetSimUsage() const = 0;
+
+  // Sets the maximum configured capacity of the simcache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than the new capacity, the implementation will purge old
+  // entries to fit the new capacity.
+  virtual void SetSimCapacity(size_t capacity) = 0;
+
+  // Returns the number of misses counted by the simcache.
+  virtual uint64_t get_miss_counter() const = 0;
+  // Returns the number of hits counted by the simcache.
+  virtual uint64_t get_hit_counter() const = 0;
+  // Resets the miss and hit counters.
+  virtual void reset_counter() = 0;
+  // String representation of the statistics of the simcache.
+  virtual std::string ToString() const = 0;
+
+  // Starts logging the cache activity (Add/Lookup) to the file located at
+  // activity_log_file. max_logging_size can be used to stop logging to the
+  // file automatically after it reaches a specific size in bytes; a value
+  // of 0 disables this limit.
+  virtual Status StartActivityLogging(const std::string& activity_log_file,
+                                      Env* env,
+                                      uint64_t max_logging_size = 0) = 0;
+
+  // Stops cache activity logging, if any.
+  virtual void StopActivityLogging() = 0;
+
+  // Status of the cache logging happening in the background.
+  virtual Status GetActivityLoggingStatus() = 0;
+
+ private:
+  SimCache(const SimCache&) = delete;
+  SimCache& operator=(const SimCache&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 000000000..9888fa22d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,465 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include "rocksdb/db.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class contains APIs to stack rocksdb wrappers, e.g. TTL over a base DB.
+class StackableDB : public DB {
+ public:
+ // StackableDB take sole ownership of the underlying db.
+ explicit StackableDB(DB* db) : db_(db) {}
+
+ // StackableDB take shared ownership of the underlying db.
+ explicit StackableDB(std::shared_ptr<DB> db)
+ : db_(db.get()), shared_db_ptr_(db) {}
+
+ ~StackableDB() {
+ if (shared_db_ptr_ == nullptr) {
+ delete db_;
+ } else {
+ assert(shared_db_ptr_.get() == db_);
+ }
+ db_ = nullptr;
+ }
+
+ virtual Status Close() override { return db_->Close(); }
+
+ virtual DB* GetBaseDB() { return db_; }
+
+ virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) override {
+ return db_->CreateColumnFamily(options, column_family_name, handle);
+ }
+
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override {
+ return db_->CreateColumnFamilies(options, column_family_names, handles);
+ }
+
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override {
+ return db_->CreateColumnFamilies(column_families, handles);
+ }
+
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+ return db_->DropColumnFamily(column_family);
+ }
+
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override {
+ return db_->DropColumnFamilies(column_families);
+ }
+
+ virtual Status DestroyColumnFamilyHandle(
+ ColumnFamilyHandle* column_family) override {
+ return db_->DestroyColumnFamilyHandle(column_family);
+ }
+
+ using DB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& val) override {
+ return db_->Put(options, column_family, key, val);
+ }
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override {
+ return db_->Get(options, column_family, key, value);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* slice,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ return db_->GetMergeOperands(options, column_family, key, slice,
+ get_merge_operands_options,
+ number_of_operands);
+ }
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override {
+ return db_->MultiGet(options, column_family, keys, values);
+ }
+
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override {
+ return db_->MultiGet(options, column_family, num_keys, keys,
+ values, statuses, sorted_input);
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& options) override {
+ return db_->IngestExternalFile(column_family, external_files, options);
+ }
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override {
+ return db_->IngestExternalFiles(args);
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override {
+ return db_->CreateColumnFamilyWithImport(options, column_family_name,
+ import_options, metadata, handle);
+ }
+
+ virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
+
+ virtual Status VerifyChecksum(const ReadOptions& options) override {
+ return db_->VerifyChecksum(options);
+ }
+
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override {
+ return db_->KeyMayExist(options, column_family, key, value, value_found);
+ }
+
+ using DB::Delete;
+ virtual Status Delete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override {
+ return db_->Delete(wopts, column_family, key);
+ }
+
+ using DB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override {
+ return db_->SingleDelete(wopts, column_family, key);
+ }
+
+ using DB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override {
+ return db_->Merge(options, column_family, key, value);
+ }
+
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override {
+ return db_->Write(opts, updates);
+ }
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& opts,
+ ColumnFamilyHandle* column_family) override {
+ return db_->NewIterator(opts, column_family);
+ }
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override {
+ return db_->NewIterators(options, column_families, iterators);
+ }
+
+ virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+ return db_->ReleaseSnapshot(snapshot);
+ }
+
+ using DB::GetMapProperty;
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override {
+ return db_->GetProperty(column_family, property, value);
+ }
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override {
+ return db_->GetMapProperty(column_family, property, value);
+ }
+
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override {
+ return db_->GetIntProperty(column_family, property, value);
+ }
+
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* value) override {
+ return db_->GetAggregatedIntProperty(property, value);
+ }
+
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* r, int n,
+ uint64_t* sizes) override {
+ return db_->GetApproximateSizes(options, column_family, r, n, sizes);
+ }
+
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ return db_->GetApproximateMemTableStats(column_family, range, count, size);
+ }
+
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override {
+ return db_->CompactRange(options, column_family, begin, end);
+ }
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override {
+ return db_->CompactFiles(compact_options, column_family, input_file_names,
+ output_level, output_path_id, output_file_names,
+ compaction_job_info);
+ }
+
+ virtual Status PauseBackgroundWork() override {
+ return db_->PauseBackgroundWork();
+ }
+ virtual Status ContinueBackgroundWork() override {
+ return db_->ContinueBackgroundWork();
+ }
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override {
+ return db_->EnableAutoCompaction(column_family_handles);
+ }
+
+ virtual void EnableManualCompaction() override {
+ return db_->EnableManualCompaction();
+ }
+ virtual void DisableManualCompaction() override {
+ return db_->DisableManualCompaction();
+ }
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+ return db_->NumberLevels(column_family);
+ }
+
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(
+ ColumnFamilyHandle* column_family) override {
+ return db_->MaxMemCompactionLevel(column_family);
+ }
+
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override {
+ return db_->Level0StopWriteTrigger(column_family);
+ }
+
+ virtual const std::string& GetName() const override { return db_->GetName(); }
+
+ virtual Env* GetEnv() const override { return db_->GetEnv(); }
+
+ virtual FileSystem* GetFileSystem() const override {
+ return db_->GetFileSystem();
+ }
+
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override {
+ return db_->GetOptions(column_family);
+ }
+
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override {
+ return db_->GetDBOptions();
+ }
+
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& fopts,
+ ColumnFamilyHandle* column_family) override {
+ return db_->Flush(fopts, column_family);
+ }
+ virtual Status Flush(
+ const FlushOptions& fopts,
+ const std::vector<ColumnFamilyHandle*>& column_families) override {
+ return db_->Flush(fopts, column_families);
+ }
+
+ virtual Status SyncWAL() override { return db_->SyncWAL(); }
+
+ virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); }
+
+ virtual Status LockWAL() override { return db_->LockWAL(); }
+
+ virtual Status UnlockWAL() override { return db_->UnlockWAL(); }
+
+#ifndef ROCKSDB_LITE
+
+ virtual Status DisableFileDeletions() override {
+ return db_->DisableFileDeletions();
+ }
+
+ virtual Status EnableFileDeletions(bool force) override {
+ return db_->EnableFileDeletions(force);
+ }
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override {
+ db_->GetLiveFilesMetaData(metadata);
+ }
+
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) override {
+ db_->GetColumnFamilyMetaData(column_family, cf_meta);
+ }
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override {
+ return db_->StartBlockCacheTrace(options, std::move(trace_writer));
+ }
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); }
+
+#endif // ROCKSDB_LITE
+
+ virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+ bool flush_memtable = true) override {
+ return db_->GetLiveFiles(vec, mfs, flush_memtable);
+ }
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override {
+ return db_->GetLatestSequenceNumber();
+ }
+
+ virtual bool SetPreserveDeletesSequenceNumber(
+ SequenceNumber seqnum) override {
+ return db_->SetPreserveDeletesSequenceNumber(seqnum);
+ }
+
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+ return db_->GetSortedWalFiles(files);
+ }
+
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override {
+ return db_->GetCurrentWalFile(current_log_file);
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override {
+ return db_->GetCreationTimeOfOldestFile(creation_time);
+ }
+
+ virtual Status DeleteFile(std::string name) override {
+ return db_->DeleteFile(name);
+ }
+
+ virtual Status GetDbIdentity(std::string& identity) const override {
+ return db_->GetDbIdentity(identity);
+ }
+
+ using DB::SetOptions;
+ virtual Status SetOptions(ColumnFamilyHandle* column_family_handle,
+ const std::unordered_map<std::string, std::string>&
+ new_options) override {
+ return db_->SetOptions(column_family_handle, new_options);
+ }
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& new_options)
+ override {
+ return db_->SetDBOptions(new_options);
+ }
+
+ using DB::ResetStats;
+ virtual Status ResetStats() override { return db_->ResetStats(); }
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override {
+ return db_->GetPropertiesOfAllTables(column_family, props);
+ }
+
+ using DB::GetPropertiesOfTablesInRange;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override {
+ return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
+ }
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) override {
+ return db_->GetUpdatesSince(seq_number, iter, read_options);
+ }
+
+ virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin,
+ const Slice* end) override {
+ return db_->SuggestCompactRange(column_family, begin, end);
+ }
+
+ virtual Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override {
+ return db_->PromoteL0(column_family, target_level);
+ }
+
+ virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+ return db_->DefaultColumnFamily();
+ }
+
+#ifndef ROCKSDB_LITE
+ Status TryCatchUpWithPrimary() override {
+ return db_->TryCatchUpWithPrimary();
+ }
+#endif // ROCKSDB_LITE
+
+ protected:
+ DB* db_;
+ std::shared_ptr<DB> shared_db_ptr_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
new file mode 100644
index 000000000..b7ee88bc3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A factory of a table property collector that marks an SST
+// file as need-compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries.
+class CompactOnDeletionCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ virtual ~CompactOnDeletionCollectorFactory() {}
+
+ virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override;
+
+ // Atomically stores a new value for the sliding window size "N".
+ // Setting it to 0 disables the delete-triggered compaction.
+ void SetWindowSize(size_t sliding_window_size) {
+ sliding_window_size_.store(sliding_window_size);
+ }
+
+ // Atomically stores a new value for the deletion trigger "D".
+ void SetDeletionTrigger(size_t deletion_trigger) {
+ deletion_trigger_.store(deletion_trigger);
+ }
+
+ virtual const char* Name() const override {
+ return "CompactOnDeletionCollector";
+ }
+
+ private:
+ friend std::shared_ptr<CompactOnDeletionCollectorFactory>
+ NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger);
+ // Constructs a factory of a table property collector that marks an SST
+ // file as need-compaction when it observes at least "D" deletion
+ // entries in any "N" consecutive entries.
+ //
+ // @param sliding_window_size "N"
+ // @param deletion_trigger "D"
+ CompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger)
+ : sliding_window_size_(sliding_window_size),
+ deletion_trigger_(deletion_trigger) {}
+
+ std::atomic<size_t> sliding_window_size_;
+ std::atomic<size_t> deletion_trigger_;
+};
+
+// Creates a factory of a table property collector that marks an SST
+// file as need-compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries.
+//
+// @param sliding_window_size "N". Note that this number will be
+// rounded up to the smallest multiple of 128 that is no less
+// than the specified size.
+// @param deletion_trigger "D". Note that even when "N" is changed,
+// the specified number for "D" will not be changed.
+extern std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger);
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h
new file mode 100644
index 000000000..d6c6722c8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction.h
@@ -0,0 +1,540 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+class TransactionDB;
+class WriteBatchWithIndex;
+
+using TransactionName = std::string;
+
+using TransactionID = uint64_t;
+
+// Provides notification to the caller of SetSnapshotOnNextOperation when
+// the actual snapshot gets created
+class TransactionNotifier {
+ public:
+ virtual ~TransactionNotifier() {}
+
+ // Implement this method to be notified once the snapshot requested via
+ // SetSnapshotOnNextOperation() has actually been created.
+ virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
+};
+
+// Provides BEGIN/COMMIT/ROLLBACK transactions.
+//
+// To use transactions, you must first create either an OptimisticTransactionDB
+// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
+// more information.
+//
+// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
+//
+// It is up to the caller to synchronize access to this object.
+//
+// See examples/transaction_example.cc for some simple examples.
+//
+// TODO(agiardullo): Not yet implemented
+// -PerfContext statistics
+// -Support for using Transactions with DBWithTTL
+class Transaction {
+ public:
+ // No copying allowed
+ Transaction(const Transaction&) = delete;
+ void operator=(const Transaction&) = delete;
+
+ virtual ~Transaction() {}
+
+ // If a transaction has a snapshot set, the transaction will ensure that
+ // any keys successfully written (or fetched via GetForUpdate()) have not
+ // been modified outside of this transaction since the time the snapshot was
+ // set.
+ // If a snapshot has not been set, the transaction guarantees that keys have
+ // not been modified since the time each key was first written (or fetched via
+ // GetForUpdate()).
+ //
+ // Using SetSnapshot() will provide stricter isolation guarantees at the
+ // expense of potentially more transaction failures due to conflicts with
+ // other writes.
+ //
+ // Calling SetSnapshot() has no effect on keys written before this function
+ // has been called.
+ //
+ // SetSnapshot() may be called multiple times if you would like to change
+ // the snapshot used for different operations in this transaction.
+ //
+ // Calling SetSnapshot will not affect the version of Data returned by Get()
+ // methods. See Transaction::Get() for more details.
+ virtual void SetSnapshot() = 0;
+
+ // Similar to SetSnapshot(), but will not change the current snapshot
+ // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
+ // By calling this function, the transaction will essentially call
+ // SetSnapshot() for you right before performing the next write/GetForUpdate.
+ //
+ // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
+ // returned by GetSnapshot() until the next write/GetForUpdate is executed.
+ //
+ // When the snapshot is created the notifier's SnapshotCreated method will
+ // be called so that the caller can get access to the snapshot.
+ //
+ // This is an optimization to reduce the likelihood of conflicts that
+ // could occur in between the time SetSnapshot() is called and the first
+ // write/GetForUpdate operation. Eg, this prevents the following
+ // race-condition:
+ //
+ // txn1->SetSnapshot();
+ // txn2->Put("A", ...);
+ // txn2->Commit();
+ // txn1->GetForUpdate(opts, "A", ...); // FAIL!
+ virtual void SetSnapshotOnNextOperation(
+ std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
+
+ // Returns the Snapshot created by the last call to SetSnapshot().
+ //
+ // REQUIRED: The returned Snapshot is only valid up until the next time
+ // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
+ // is called, or the Transaction is deleted.
+ virtual const Snapshot* GetSnapshot() const = 0;
+
+ // Clears the current snapshot (i.e. no snapshot will be 'set')
+ //
+ // This removes any snapshot that currently exists or is set to be created
+ // on the next update operation (SetSnapshotOnNextOperation).
+ //
+ // Calling ClearSnapshot() has no effect on keys written before this function
+ // has been called.
+ //
+ // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
+ // longer be valid and should be discarded after a call to ClearSnapshot().
+ virtual void ClearSnapshot() = 0;
+
+ // Prepare the current transaction for 2PC
+ virtual Status Prepare() = 0;
+
+ // Write all batched keys to the db atomically.
+ //
+ // Returns OK on success.
+ //
+ // May return any error status that could be returned by DB::Write().
+ //
+ // If this transaction was created by an OptimisticTransactionDB(),
+ // Status::Busy() may be returned if the transaction could not guarantee
+ // that there are no write conflicts. Status::TryAgain() may be returned
+ // if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain).
+ //
+ // If this transaction was created by a TransactionDB(), Status::Expired()
+ // may be returned if this transaction has lived for longer than
+ // TransactionOptions.expiration.
+ virtual Status Commit() = 0;
+
+ // Discard all batched writes in this transaction.
+ virtual Status Rollback() = 0;
+
+ // Records the state of the transaction for future calls to
+ // RollbackToSavePoint(). May be called multiple times to set multiple save
+ // points.
+ virtual void SetSavePoint() = 0;
+
+ // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
+ // since the most recent call to SetSavePoint() and removes the most recent
+ // SetSavePoint().
+ // If there is no previous call to SetSavePoint(), returns Status::NotFound()
+ virtual Status RollbackToSavePoint() = 0;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ virtual Status PopSavePoint() = 0;
+
+ // This function is similar to DB::Get() except it will also read pending
+ // changes in this transaction. Currently, this function will return
+ // Status::MergeInProgress if the most recent write to the queried key in
+ // this batch is a Merge.
+ //
+ // If read_options.snapshot is not set, the current version of the key will
+ // be read. Calling SetSnapshot() does not affect the version of the data
+ // returned.
+ //
+ // Note that setting read_options.snapshot will affect what is read from the
+ // DB but will NOT change which keys are read from this transaction (the keys
+ // in this transaction do not yet belong to any snapshot and will be fetched
+ // regardless).
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value) = 0;
+
+ // An overload of the above method that receives a PinnableSlice
+ // For backward compatibility a default implementation is provided
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ auto s = Get(options, column_family, key, pinnable_val->GetSelf());
+ pinnable_val->PinSelf();
+ return s;
+ }
+
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) = 0;
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ auto s = Get(options, key, pinnable_val->GetSelf());
+ pinnable_val->PinSelf();
+ return s;
+ }
+
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+ virtual std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) = 0;
+
+ // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
+ // expected to override this with an implementation that calls
+ // DBImpl::MultiGet()
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool /*sorted_input*/ = false) {
+ for (size_t i = 0; i < num_keys; ++i) {
+ statuses[i] = Get(options, column_family, keys[i], &values[i]);
+ }
+ }
+
+ // Read this key and ensure that this transaction will only
+ // be able to be committed if this key is not written outside this
+ // transaction after it has first been read (or after the snapshot if a
+ // snapshot is set in this transaction and do_validate is true). If
+ // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
+ // that GetForUpdate returns the latest committed value. The transaction
+ // behavior is the same regardless of whether the key exists or not.
+ //
+ // Note: Currently, this function will return Status::MergeInProgress
+ // if the most recent write to the queried key in this batch is a Merge.
+ //
+ // The values returned by this function are similar to Transaction::Get().
+ // If value==nullptr, then this function will not read any data, but will
+ // still ensure that this key cannot be written to by outside of this
+ // transaction.
+ //
+ // If this transaction was created by an OptimisticTransactionDB,
+ // GetForUpdate() could cause Commit() to fail. Otherwise, it could return
+ // any error that could be returned by DB::Get().
+ //
+ // If this transaction was created by a TransactionDB, it can return
+ // Status::OK() on success,
+ // Status::Busy() if there is a write conflict,
+ // Status::TimedOut() if a lock could not be acquired,
+ // Status::TryAgain() if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain)
+ // Status::MergeInProgress() if merge operations cannot be resolved.
+ // or other errors if this key could not be read.
+ virtual Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value,
+ bool exclusive = true,
+ const bool do_validate = true) = 0;
+
+ // An overload of the above method that receives a PinnableSlice
+ // For backward compatibility a default implementation is provided
+ virtual Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val,
+ bool exclusive = true,
+ const bool do_validate = true) {
+ if (pinnable_val == nullptr) {
+ std::string* null_str = nullptr;
+ return GetForUpdate(options, column_family, key, null_str, exclusive,
+ do_validate);
+ } else {
+ auto s = GetForUpdate(options, column_family, key,
+ pinnable_val->GetSelf(), exclusive, do_validate);
+ pinnable_val->PinSelf();
+ return s;
+ }
+ }
+
+ virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
+ std::string* value, bool exclusive = true,
+ const bool do_validate = true) = 0;
+
+ virtual std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+ virtual std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options, const std::vector<Slice>& keys,
+ std::vector<std::string>* values) = 0;
+
+ // Returns an iterator that will iterate on all keys in the default
+ // column family including both keys in the DB and uncommitted keys in this
+ // transaction.
+ //
+ // Setting read_options.snapshot will affect what is read from the
+ // DB but will NOT change which keys are read from this transaction (the keys
+ // in this transaction do not yet belong to any snapshot and will be fetched
+ // regardless).
+ //
+ // Caller is responsible for deleting the returned Iterator.
+ //
+ // The returned iterator is only valid until Commit(), Rollback(), or
+ // RollbackToSavePoint() is called.
+ virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
+
+ virtual Iterator* GetIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) = 0;
+
+ // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
+ // functions in WriteBatch, but will also do conflict checking on the
+ // keys being written.
+ //
+ // assume_tracked=true expects the key to be already tracked. More
+ // specifically, it means the key was previously tracked in the same
+ // savepoint, with the same exclusive flag, and at a lower sequence number.
+ // If valid then it skips ValidateSnapshot. Returns error otherwise.
+ //
+ // If this Transaction was created on an OptimisticTransactionDB, these
+ // functions should always return Status::OK().
+ //
+ // If this Transaction was created on a TransactionDB, the status returned
+ // can be:
+ // Status::OK() on success,
+ // Status::Busy() if there is a write conflict,
+ // Status::TimedOut() if a lock could not be acquired,
+ // Status::TryAgain() if the memtable history size is not large enough
+ // (See max_write_buffer_size_to_maintain)
+ // or other errors on unexpected failures.
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) = 0;
+ virtual Status Put(const Slice& key, const Slice& value) = 0;
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value,
+ const bool assume_tracked = false) = 0;
+ virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
+
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value,
+ const bool assume_tracked = false) = 0;
+ virtual Status Merge(const Slice& key, const Slice& value) = 0;
+
+ virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status Delete(const Slice& key) = 0;
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status Delete(const SliceParts& key) = 0;
+
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status SingleDelete(const Slice& key) = 0;
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) = 0;
+ virtual Status SingleDelete(const SliceParts& key) = 0;
+
+ // PutUntracked() will write a Put to the batch of operations to be committed
+ // in this transaction. This write will only happen if this transaction
+ // gets committed successfully. But unlike Transaction::Put(),
+ // no conflict checking will be done for this key.
+ //
+ // If this Transaction was created on a PessimisticTransactionDB, this
+ // function will still acquire locks necessary to make sure this write doesn't
+ // cause conflicts in other transactions and may return Status::Busy().
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) = 0;
+ virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
+ virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const SliceParts& value) = 0;
+ virtual Status PutUntracked(const SliceParts& key,
+ const SliceParts& value) = 0;
+
+ virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) = 0;
+ virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
+
+ virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ virtual Status DeleteUntracked(const Slice& key) = 0;
+ virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) = 0;
+ virtual Status DeleteUntracked(const SliceParts& key) = 0;
+ virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ virtual Status SingleDeleteUntracked(const Slice& key) = 0;
+
+ // Similar to WriteBatch::PutLogData
+ virtual void PutLogData(const Slice& blob) = 0;
+
+ // By default, all Put/Merge/Delete operations will be indexed in the
+ // transaction so that Get/GetForUpdate/GetIterator can search for these
+ // keys.
+ //
+ // If the caller does not want to fetch the keys about to be written,
+ // they may want to avoid indexing as a performance optimization.
+ // Calling DisableIndexing() will turn off indexing for all future
+ // Put/Merge/Delete operations until EnableIndexing() is called.
+ //
+ // If a key is Put/Merge/Deleted after DisableIndexing is called and then
+ // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
+ // undefined.
+ virtual void DisableIndexing() = 0;
+ virtual void EnableIndexing() = 0;
+
+ // Returns the number of distinct Keys being tracked by this transaction.
+ // If this transaction was created by a TransactionDB, this is the number of
+ // keys that are currently locked by this transaction.
+ // If this transaction was created by an OptimisticTransactionDB, this is the
+ // number of keys that need to be checked for conflicts at commit time.
+ virtual uint64_t GetNumKeys() const = 0;
+
+ // Returns the number of Puts/Deletes/Merges that have been applied to this
+ // transaction so far.
+ virtual uint64_t GetNumPuts() const = 0;
+ virtual uint64_t GetNumDeletes() const = 0;
+ virtual uint64_t GetNumMerges() const = 0;
+
+ // Returns the elapsed time in milliseconds since this Transaction began.
+ virtual uint64_t GetElapsedTime() const = 0;
+
+ // Fetch the underlying write batch that contains all pending changes to be
+ // committed.
+ //
+ // Note: You should not write or delete anything from the batch directly and
+ // should only use the functions in the Transaction class to
+ // write to this transaction.
+ virtual WriteBatchWithIndex* GetWriteBatch() = 0;
+
+ // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
+ // this transaction.
+ // Has no effect on OptimisticTransactions.
+ virtual void SetLockTimeout(int64_t timeout) = 0;
+
+ // Return the WriteOptions that will be used during Commit()
+ virtual WriteOptions* GetWriteOptions() = 0;
+
+ // Reset the WriteOptions that will be used during Commit().
+ virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
+
+ // If this key was previously fetched in this transaction using
+ // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate will tell
+ // the transaction that it no longer needs to do any conflict checking
+ // for this key.
+ //
+ // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
+ // then UndoGetForUpdate will only have an effect if it is also called N
+ // times. If this key has been written to in this transaction,
+ // UndoGetForUpdate() will have no effect.
+ //
+ // If SetSavePoint() has been called after the GetForUpdate(),
+ // UndoGetForUpdate() will not have any effect.
+ //
+ // If this Transaction was created by an OptimisticTransactionDB,
+ // calling UndoGetForUpdate can affect whether this key is conflict checked
+ // at commit time.
+ // If this Transaction was created by a TransactionDB,
+ // calling UndoGetForUpdate may release any held locks for this key.
+ virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual void UndoGetForUpdate(const Slice& key) = 0;
+
+ virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
+
+ virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
+
+ virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
+
+ virtual uint64_t GetLogNumber() const { return log_number_; }
+
+ virtual Status SetName(const TransactionName& name) = 0;
+
+ virtual TransactionName GetName() const { return name_; }
+
+ virtual TransactionID GetID() const { return 0; }
+
+ virtual bool IsDeadlockDetect() const { return false; }
+
+ virtual std::vector<TransactionID> GetWaitingTxns(
+ uint32_t* /*column_family_id*/, std::string* /*key*/) const {
+ assert(false);
+ return std::vector<TransactionID>();
+ }
+
+ enum TransactionState {
+ STARTED = 0,
+ AWAITING_PREPARE = 1,
+ PREPARED = 2,
+ AWAITING_COMMIT = 3,
+ COMMITED = 4,
+ AWAITING_ROLLBACK = 5,
+ ROLLEDBACK = 6,
+ LOCKS_STOLEN = 7,
+ };
+
+ TransactionState GetState() const { return txn_state_; }
+ void SetState(TransactionState state) { txn_state_ = state; }
+
+ // NOTE: Experimental feature
+ // The globally unique id with which the transaction is identified. This id
+ // might or might not be set depending on the implementation. Similarly the
+ // implementation decides the point in lifetime of a transaction at which it
+ // assigns the id. Although currently it is the case, the id is not guaranteed
+ // to remain the same across restarts.
+ uint64_t GetId() const { return id_; }  // read-only; usable on const objects
+
+ protected:
+ explicit Transaction(const TransactionDB* /*db*/) : Transaction() {}
+ Transaction() : log_number_(0), txn_state_(STARTED) {}  // shared init path
+
+ // the log in which the prepared section for this txn resides
+ // (for two phase commit)
+ uint64_t log_number_;
+ TransactionName name_;
+
+ // Execution status of the transaction.
+ std::atomic<TransactionState> txn_state_;
+
+ uint64_t id_ = 0;
+ virtual void SetId(uint64_t id) {
+ assert(id_ == 0);
+ id_ = id;
+ }
+
+ virtual uint64_t GetLastLogNumber() const { return log_number_; }
+
+ private:
+ friend class PessimisticTransactionDB;
+ friend class WriteUnpreparedTxnDB;
+ friend class TransactionTest_TwoPhaseLogRollingTest_Test;
+ friend class TransactionTest_TwoPhaseLogRollingTest2_Test;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
new file mode 100644
index 000000000..73b7416a0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
@@ -0,0 +1,309 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+
+// Database with Transaction support.
+//
+// See transaction.h and examples/transaction_example.cc
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutexFactory;
+
+// Selects at which point of a transaction its data is written into the DB.
+// Chosen through TransactionDBOptions::write_policy, whose default is
+// WRITE_COMMITTED.
+enum TxnDBWritePolicy {
+ WRITE_COMMITTED = 0, // write only the committed data
+ WRITE_PREPARED, // write data after the prepare phase of 2pc
+ WRITE_UNPREPARED // write data before the prepare phase of 2pc
+};
+
+// Default value of TransactionDBOptions::max_num_deadlocks.
+const uint32_t kInitialMaxDeadlocks = 5;
+
+struct TransactionDBOptions {
+ // Specifies the maximum number of keys that can be locked at the same time
+ // per column family.
+ // If the number of locked keys is greater than max_num_locks, transaction
+ // writes (or GetForUpdate) will return an error.
+ // If this value is not positive, no limit will be enforced.
+ int64_t max_num_locks = -1;
+
+ // Stores the number of latest deadlocks to track
+ uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
+
+ // Increasing this value will increase the concurrency by dividing the lock
+ // table (per column family) into more sub-tables, each with its own
+ // separate mutex.
+ size_t num_stripes = 16;
+
+ // If positive, specifies the default wait timeout in milliseconds when
+ // a transaction attempts to lock a key if not specified by
+ // TransactionOptions::lock_timeout.
+ //
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, there is no timeout. Not using a timeout is not recommended
+ // as it can lead to deadlocks. Unless TransactionOptions::deadlock_detect is
+ // enabled, there is no deadlock detection to recover from a deadlock.
+ int64_t transaction_lock_timeout = 1000; // 1 second
+
+ // If positive, specifies the wait timeout in milliseconds when writing a key
+ // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
+ // directly).
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, there is no timeout and will block indefinitely when acquiring
+ // a lock.
+ //
+ // Not using a timeout can lead to deadlocks. Currently, there
+ // is no deadlock-detection to recover from a deadlock. While DB writes
+ // cannot deadlock with other DB writes, they can deadlock with a transaction.
+ // A negative timeout should only be used if all transactions have a small
+ // expiration set.
+ int64_t default_lock_timeout = 1000; // 1 second
+
+ // If set, the TransactionDB will use this implementation of a mutex and
+ // condition variable for all transaction locking instead of the default
+ // mutex/condvar implementation.
+ std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
+
+ // The policy for when to write the data into the DB. The default policy is to
+ // write only the committed data (WRITE_COMMITTED). The data could be written
+ // before the commit phase. The DB then needs to provide the mechanisms to
+ // tell apart committed from uncommitted data.
+ TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+
+ // TODO(myabandeh): remove this option
+ // Note: this is a temporary option as a hot fix in rollback of writeprepared
+ // txns in myrocks. MyRocks uses merge operands for autoinc column id without
+ // however obtaining locks. This breaks the assumption behind the rollback
+ // logic in myrocks. This hack of simply not rolling back merge operands works
+ // for the special way that myrocks uses these operands.
+ bool rollback_merge_operands = false;
+
+ // If true, the TransactionDB implementation might skip concurrency control
+ // unless it is overridden by TransactionOptions or
+ // TransactionDBWriteOptimizations. This can be used in conjunction with
+ // DBOptions::unordered_write when the TransactionDB is used solely for write
+ // ordering rather than concurrency control.
+ bool skip_concurrency_control = false;
+
+ // This option is only valid for write unprepared. If a write batch exceeds
+ // this threshold, then the transaction will implicitly flush the currently
+ // pending writes into the database. A value of 0 or less means no limit.
+ int64_t default_write_batch_flush_threshold = 0;
+
+ private:
+ // 128 entries
+ size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
+ // 8m entry, 64MB size
+ size_t wp_commit_cache_bits = static_cast<size_t>(23);
+
+ // For testing, whether transaction name should be auto-generated or not. This
+ // is useful for write unprepared which requires named transactions.
+ bool autogenerate_name = false;
+
+ friend class WritePreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+ friend class WritePreparedTransactionTestBase;
+ friend class TransactionTestBase;
+ friend class MySQLStyleTransactionTest;
+};
+
+struct TransactionOptions {
+ // Setting set_snapshot=true is the same as calling
+ // Transaction::SetSnapshot().
+ bool set_snapshot = false;
+
+ // Setting to true means that before acquiring locks, this transaction will
+ // check if doing so will cause a deadlock. If so, it will return with
+ // Status::Busy. The user should retry their transaction.
+ bool deadlock_detect = false;
+
+ // If set, it states that the CommitTimeWriteBatch represents the latest state
+ // of the application, has only one sub-batch, i.e., no duplicate keys, and
+ // meant to be used later during recovery. It enables an optimization to
+ // postpone updating the memtable with CommitTimeWriteBatch to only
+ // SwitchMemtable or recovery.
+ bool use_only_the_last_commit_time_batch_for_recovery = false;
+
+ // TODO(agiardullo): TransactionDB does not yet support comparators that allow
+ // two non-equal keys to be equivalent. I.e., cmp->Compare(a,b) should only
+ // return 0 if a.compare(b) returns 0.
+
+ // If positive, specifies the wait timeout in milliseconds when
+ // a transaction attempts to lock a key.
+ //
+ // If 0, no waiting is done if a lock cannot instantly be acquired.
+ // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
+ int64_t lock_timeout = -1;
+
+ // Expiration duration in milliseconds. If non-negative, transactions that
+ // last longer than this many milliseconds will fail to commit. If not set,
+ // a forgotten transaction that is never committed, rolled back, or deleted
+ // will never relinquish any locks it holds. This could prevent keys from
+ // being written by other writers.
+ int64_t expiration = -1;
+
+ // The number of traversals to make during deadlock detection.
+ int64_t deadlock_detect_depth = 50;
+
+ // The maximum number of bytes used for the write batch. 0 means no limit.
+ size_t max_write_batch_size = 0;
+
+ // Skip Concurrency Control. This could be used as an optimization if the
+ // application knows that the transaction would not have any conflict with
+ // concurrent transactions. It could also be used during recovery if (i) the
+ // application guarantees no conflict between prepared transactions in the
+ // WAL and (ii) the application guarantees that recovered transactions will
+ // be rolled back/committed before new transactions start.
+ // Default: false
+ bool skip_concurrency_control = false;
+
+ // See TransactionDBOptions::default_write_batch_flush_threshold for
+ // description. If a negative value is specified, then the default value from
+ // TransactionDBOptions is used.
+ int64_t write_batch_flush_threshold = -1;
+};
+
+// Per-write optimization hints for writes that do not go through a
+// transaction. A TransactionDB implementation might or might not make use of
+// the specified optimizations. Both hints default to false.
+struct TransactionDBWriteOptimizations {
+ // If true, the application guarantees that the key-set in the write batch
+ // does not conflict with any concurrent transaction, and hence the
+ // concurrency control mechanism could be skipped for this write.
+ bool skip_concurrency_control = false;
+ // If true, the application guarantees that there is no duplicate <column
+ // family, key> in the write batch, and any mechanism employed to handle
+ // duplicate keys could be skipped.
+ bool skip_duplicate_key_check = false;
+};
+
+// Value type of the map returned by TransactionDB::GetLockStatusData(), which
+// maps a column family id to a KeyLockInfo.
+struct KeyLockInfo {
+ std::string key;
+ // NOTE(review): presumably the ids of the transactions holding the lock on
+ // `key` - confirm against the lock manager.
+ std::vector<TransactionID> ids;
+ // NOTE(review): presumably true for an exclusive lock and false for a shared
+ // one - confirm. Not default-initialized; whoever fills in the struct must
+ // set it.
+ bool exclusive;
+};
+
+// Element type of DeadlockPath::path.
+// NOTE(review): presumably one waiting transaction in a deadlock cycle
+// (transaction id, column family id, lock mode and the key it waits for) -
+// confirm against the code that fills it in. No member has a default value.
+struct DeadlockInfo {
+ TransactionID m_txn_id;
+ uint32_t m_cf_id;
+ bool m_exclusive;
+ std::string m_waiting_key;
+};
+
+// Element type of the buffer returned by TransactionDB::GetDeadlockInfoBuffer().
+// NOTE(review): presumably describes one detected deadlock - confirm.
+struct DeadlockPath {
+ std::vector<DeadlockInfo> path;
+ // Only the second constructor can set this to true (through `limit`).
+ bool limit_exceeded;
+ int64_t deadlock_time;
+
+ // Builds a path from the given entries; limit_exceeded is false.
+ // path_entry is taken by value and moved into place, so the entries are not
+ // copied a second time.
+ explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
+ const int64_t& dl_time)
+ : path(std::move(path_entry)),
+ limit_exceeded(false),
+ deadlock_time(dl_time) {}
+
+ // empty path, limit exceeded constructor and default constructor
+ explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+ : path(), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+ // True only when there are no path entries and the limit was not exceeded.
+ // Const-qualified so it can be called on const DeadlockPath objects.
+ bool empty() const { return path.empty() && !limit_exceeded; }
+};
+
+class TransactionDB : public StackableDB {
+ public:
+ // Variant of ::Write that additionally accepts per-write optimization hints
+ // such as skip_concurrency_control.
+ using StackableDB::Write;
+ virtual Status Write(const WriteOptions& opts,
+ const TransactionDBWriteOptimizations& /*optimizations*/,
+ WriteBatch* updates) {
+ // This default implementation does not act on the hints; it forwards to
+ // the un-optimized version of ::Write.
+ return Write(opts, updates);
+ }
+ // Open a TransactionDB similar to DB::Open().
+ // Internally call PrepareWrap() and WrapDB()
+ // If the return status is not ok, then dbptr is set to nullptr.
+ static Status Open(const Options& options,
+ const TransactionDBOptions& txn_db_options,
+ const std::string& dbname, TransactionDB** dbptr);
+
+ static Status Open(const DBOptions& db_options,
+ const TransactionDBOptions& txn_db_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ TransactionDB** dbptr);
+ // Note: PrepareWrap() may change parameters, make copies before the
+ // invocation if needed.
+ static void PrepareWrap(DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* column_families,
+ std::vector<size_t>* compaction_enabled_cf_indices);
+ // If the return status is not ok, then dbptr will be set to nullptr. The
+ // input db parameter might or might not be deleted as a result of the
+ // failure. If it is properly deleted it will be set to nullptr. If the return
+ // status is ok, the ownership of db is transferred to dbptr.
+ static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles,
+ TransactionDB** dbptr);
+ // If the return status is not ok, then dbptr will be set to nullptr. The
+ // input db parameter might or might not be deleted as a result of the
+ // failure. If it is properly deleted it will be set to nullptr. If the return
+ // status is ok, the ownership of db is transferred to dbptr.
+ static Status WrapStackableDB(
+ StackableDB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
+ // Since the destructor in StackableDB is virtual, this destructor is virtual
+ // too. The root db will be deleted by the base's destructor.
+ ~TransactionDB() override {}
+
+ // Starts a new Transaction.
+ //
+ // Caller is responsible for deleting the returned transaction when no
+ // longer needed.
+ //
+ // If old_txn is not null, BeginTransaction will reuse this Transaction
+ // handle instead of allocating a new one. This is an optimization to avoid
+ // extra allocations when repeatedly creating transactions.
+ virtual Transaction* BeginTransaction(
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options = TransactionOptions(),
+ Transaction* old_txn = nullptr) = 0;
+
+ virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
+ virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
+
+ // Returns set of all locks held.
+ //
+ // The mapping is column family id -> KeyLockInfo
+ virtual std::unordered_multimap<uint32_t, KeyLockInfo>
+ GetLockStatusData() = 0;
+ virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+ virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+ protected:
+ // To create a TransactionDB, call Open()
+ // The ownership of db is transferred to the base StackableDB
+ explicit TransactionDB(DB* db) : StackableDB(db) {}
+ // No copying allowed
+ TransactionDB(const TransactionDB&) = delete;
+ void operator=(const TransactionDB&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 000000000..96a42adf8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TransactionDBMutex and TransactionDBCondVar APIs allows applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+
+// Abstract mutex that an application can implement to replace the default
+// mutex used by a TransactionDB when locking keys. Instances are handed out by
+// TransactionDBMutexFactory::AllocateMutex().
+class TransactionDBMutex {
+ public:
+ virtual ~TransactionDBMutex() {}
+
+ // Attempt to acquire lock. Return OK on success, or other Status on failure.
+ // If returned status is OK, TransactionDB will eventually call UnLock().
+ virtual Status Lock() = 0;
+
+ // Attempt to acquire lock. If timeout_time is non-negative, the operation
+ // may fail after this many microseconds.
+ // Returns OK on success,
+ // TimedOut if timed out,
+ // or other Status on failure.
+ // If returned status is OK, TransactionDB will eventually call UnLock().
+ virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+ // Unlock Mutex that was successfully locked by Lock() or TryLockFor()
+ virtual void UnLock() = 0;
+};
+
+// Abstract condition variable that an application can implement to replace
+// the default one used by a TransactionDB when locking keys. Instances are
+// handed out by TransactionDBMutexFactory::AllocateCondVar().
+class TransactionDBCondVar {
+ public:
+ virtual ~TransactionDBCondVar() {}
+
+ // Block current thread until condition variable is notified by a call to
+ // Notify() or NotifyAll(). Wait() will be called with mutex locked.
+ // Returns OK if notified.
+ // Returns non-OK if TransactionDB should stop waiting and fail the operation.
+ // May return OK spuriously even if not notified.
+ virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+ // Block current thread until condition variable is notified by a call to
+ // Notify() or NotifyAll(), or if the timeout is reached.
+ // WaitFor() will be called with mutex locked.
+ //
+ // If timeout_time is non-negative, the operation should fail after this many
+ // microseconds.
+ // If implementing a custom version of this class, the implementation may
+ // choose to ignore the timeout.
+ //
+ // Returns OK if notified.
+ // Returns TimedOut if timeout is reached.
+ // Returns other status if TransactionDB should otherwise stop waiting and
+ // fail the operation.
+ // May return OK spuriously even if not notified.
+ virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+ int64_t timeout_time) = 0;
+
+ // If any threads are waiting on *this, unblock at least one of the
+ // waiting threads.
+ virtual void Notify() = 0;
+
+ // Unblocks all threads waiting on *this.
+ virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+// An instance is installed through TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutexFactory {
+ public:
+ // Create a TransactionDBMutex object. Ownership is shared through the
+ // returned std::shared_ptr.
+ virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+ // Create a TransactionDBCondVar object. Ownership is shared through the
+ // returned std::shared_ptr.
+ virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+ virtual ~TransactionDBMutexFactory() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/utility_db.h b/src/rocksdb/include/rocksdb/utilities/utility_db.h
new file mode 100644
index 000000000..cf2e5811c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/utility_db.h
@@ -0,0 +1,34 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Please don't use this class. It's deprecated.
+class UtilityDB {
+ public:
+ // This function is here only for backwards compatibility. Please use the
+ // functions defined in DBWithTTL (rocksdb/utilities/db_ttl.h)
+ // (deprecated)
+ // The declaration carries the compiler-specific "deprecated" attribute:
+ // __attribute__ for GCC/Clang, __declspec when _WIN32 is set.
+#if defined(__GNUC__) || defined(__clang__)
+ __attribute__((deprecated))
+#elif _WIN32
+ __declspec(deprecated)
+#endif
+ static Status
+ OpenTtlDB(const Options& options, const std::string& name,
+ StackableDB** dbptr, int32_t ttl = 0, bool read_only = false);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 000000000..424aa1582
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,278 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+class ReadCallback;
+struct ReadOptions;
+struct DBOptions;
+
+// Kind of record held by a WriteEntry (see WriteEntry::type). The enumerators
+// take the implicit values 0..6 in declaration order.
+enum WriteType {
+ kPutRecord,
+ kMergeRecord,
+ kDeleteRecord,
+ kSingleDeleteRecord,
+ kDeleteRangeRecord,
+ kLogDataRecord,
+ kXIDRecord,
+};
+
+// A Put, Merge, Delete, or SingleDelete entry of a write batch.
+// Returned by WBWIIterator::Entry().
+struct WriteEntry {
+ WriteType type;
+ Slice key;
+ Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+// Pure interface; instances are obtained from
+// WriteBatchWithIndex::NewIterator().
+class WBWIIterator {
+ public:
+ virtual ~WBWIIterator() {}
+
+ virtual bool Valid() const = 0;
+
+ virtual void SeekToFirst() = 0;
+
+ virtual void SeekToLast() = 0;
+
+ virtual void Seek(const Slice& key) = 0;
+
+ virtual void SeekForPrev(const Slice& key) = 0;
+
+ virtual void Next() = 0;
+
+ virtual void Prev() = 0;
+
+ // The returned WriteEntry is only valid until the next mutation of the
+ // WriteBatchWithIndex.
+ virtual WriteEntry Entry() const = 0;
+
+ virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge(), Delete(), or SingleDelete(), the same function of the
+// wrapped WriteBatch will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+ // backup_index_comparator: the backup comparator used to compare keys
+ // within the same column family, if column family is not given in the
+ // interface, or we can't find a column family from the column family handle
+ // passed in, backup_index_comparator will be used for the column family.
+ // reserved_bytes: reserved bytes in underlying WriteBatch
+ // max_bytes: maximum size of underlying WriteBatch in bytes
+ // overwrite_key: if true, overwrite the key in the index when inserting
+ // the same key as previously, so iterator will never
+ // show two entries with the same key.
+ explicit WriteBatchWithIndex(
+ const Comparator* backup_index_comparator = BytewiseComparator(),
+ size_t reserved_bytes = 0, bool overwrite_key = false,
+ size_t max_bytes = 0);
+
+ ~WriteBatchWithIndex() override;
+ WriteBatchWithIndex(WriteBatchWithIndex&&);
+ WriteBatchWithIndex& operator=(WriteBatchWithIndex&&);
+
+ using WriteBatchBase::Put;
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ Status Put(const Slice& key, const Slice& value) override;
+
+ using WriteBatchBase::Merge;
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ Status Merge(const Slice& key, const Slice& value) override;
+
+ using WriteBatchBase::Delete;
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+ Status Delete(const Slice& key) override;
+
+ using WriteBatchBase::SingleDelete;
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const Slice& key) override;
+
+ using WriteBatchBase::DeleteRange;
+ // Range deletion is not supported by WriteBatchWithIndex: both overloads
+ // ignore their arguments and report Status::NotSupported.
+ Status DeleteRange(ColumnFamilyHandle* /* column_family */,
+ const Slice& /* begin_key */,
+ const Slice& /* end_key */) override {
+ const char* const reason = "DeleteRange unsupported in WriteBatchWithIndex";
+ return Status::NotSupported(reason);
+ }
+ Status DeleteRange(const Slice& /* begin_key */,
+ const Slice& /* end_key */) override {
+ const char* const reason = "DeleteRange unsupported in WriteBatchWithIndex";
+ return Status::NotSupported(reason);
+ }
+
+ using WriteBatchBase::PutLogData;
+ Status PutLogData(const Slice& blob) override;
+
+ using WriteBatchBase::Clear;
+ void Clear() override;
+
+ using WriteBatchBase::GetWriteBatch;
+ WriteBatch* GetWriteBatch() override;
+
+ // Create an iterator of a column family. User can call iterator.Seek() to
+ // search to the next entry of or after a key. Keys will be iterated in the
+ // order given by index_comparator. For multiple updates on the same key,
+ // each update will be returned as a separate entry, in the order of update
+ // time.
+ //
+ // The returned iterator should be deleted by the caller.
+ WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+ // Create an iterator of the default column family.
+ WBWIIterator* NewIterator();
+
+ // Will create a new Iterator that will use WBWIIterator as a delta and
+ // base_iterator as base.
+ //
+ // This function is only supported if the WriteBatchWithIndex was
+ // constructed with overwrite_key=true.
+ //
+ // The returned iterator should be deleted by the caller.
+ // The base_iterator is now 'owned' by the returned iterator. Deleting the
+ // returned iterator will also delete the base_iterator.
+ //
+ // Updating write batch with the current key of the iterator is not safe.
+ // We strongly recommend users not to do it. It will invalidate the current
+ // key() and value() of the iterator. This invalidation happens even before
+ // the write batch update finishes. The state may recover after Next() is
+ // called.
+ Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
+ Iterator* base_iterator,
+ const ReadOptions* opts = nullptr);
+ // default column family
+ Iterator* NewIteratorWithBase(Iterator* base_iterator);
+
+ // Similar to DB::Get() but will only read the key from this batch.
+ // If the batch does not have enough data to resolve Merge operations,
+ // MergeInProgress status may be returned.
+ Status GetFromBatch(ColumnFamilyHandle* column_family,
+ const DBOptions& options, const Slice& key,
+ std::string* value);
+
+ // Convenience overload of the previous function for callers that have no
+ // column family handle: it forwards with a null handle.
+ // Note: An InvalidArgument status will be returned if there are any Merge
+ // operators for this key. Use previous method instead.
+ Status GetFromBatch(const DBOptions& options, const Slice& key,
+ std::string* value) {
+ ColumnFamilyHandle* const no_column_family = nullptr;
+ return GetFromBatch(no_column_family, options, key, value);
+ }
+
+ // Similar to DB::Get() but will also read writes from this batch.
+ //
+ // This function will query both this batch and the DB and then merge
+ // the results using the DB's merge operator (if the batch contains any
+ // merge requests).
+ //
+ // Setting read_options.snapshot will affect what is read from the DB
+ // but will NOT change which keys are read from the batch (the keys in
+ // this batch do not yet belong to any snapshot and will be fetched
+ // regardless).
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ const Slice& key, std::string* value);
+
+ // An overload of the above method that receives a PinnableSlice
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ const Slice& key, PinnableSlice* value);
+
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value);
+
+ // An overload of the above method that receives a PinnableSlice
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value);
+
+ void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ bool sorted_input);
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ void SetSavePoint() override;
+
+ // Remove all entries in this batch (Put, Merge, Delete, SingleDelete,
+ // PutLogData) since the most recent call to SetSavePoint() and removes the
+ // most recent save point.
+ // If there is no previous call to SetSavePoint(), behaves the same as
+ // Clear().
+ //
+ // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+ //
+ // Returns Status::OK() on success,
+ // Status::NotFound() if no previous call to SetSavePoint(),
+ // or other Status on corruption.
+ Status RollbackToSavePoint() override;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status PopSavePoint() override;
+
+ void SetMaxBytes(size_t max_bytes) override;
+ size_t GetDataSize() const;
+
+ private:
+ friend class PessimisticTransactionDB;
+ friend class WritePreparedTxn;
+ friend class WriteUnpreparedTxn;
+ friend class WriteBatchWithIndex_SubBatchCnt_Test;
+ // Returns the number of sub-batches inside the write batch. A sub-batch
+ // starts right before inserting a key that is a duplicate of a key in the
+ // last sub-batch.
+ size_t SubBatchCnt();
+
+ Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, ReadCallback* callback);
+ void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ bool sorted_input, ReadCallback* callback);
+ struct Rep;
+ std::unique_ptr<Rep> rep;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
new file mode 100644
index 000000000..d364b7f95
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#define ROCKSDB_MAJOR 6
+#define ROCKSDB_MINOR 8
+#define ROCKSDB_PATCH 1
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate these
+// at some point
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h
new file mode 100644
index 000000000..98eddc2e2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wal_filter.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBatch;
+
+// WALFilter allows an application to inspect write-ahead-log (WAL)
+// records or modify their processing on recovery.
+// Please see the details below.
+class WalFilter {
+ public:
+ enum class WalProcessingOption {
+ // Continue processing as usual
+ kContinueProcessing = 0,
+ // Ignore the current record but continue processing of log(s)
+ kIgnoreCurrentRecord = 1,
+ // Stop replay of logs and discard logs
+ // Logs won't be replayed on subsequent recovery
+ kStopReplay = 2,
+ // Corrupted record detected by filter
+ kCorruptedRecord = 3,
+ // Marker for enum count
+ kWalProcessingOptionMax = 4
+ };
+
+ virtual ~WalFilter() {}
+
+ // Provide ColumnFamily->LogNumber map to filter
+ // so that filter can determine whether a log number applies to a given
+ // column family (i.e. that log hasn't been flushed to SST already for the
+ // column family).
+ // We also pass in name->id map as only name is known during
+ // recovery (as handles are opened post-recovery),
+ // while write batch callbacks happen in terms of column family id.
+ //
+ // @params cf_lognumber_map column_family_id to lognumber map
+ // @params cf_name_id_map column_family_name to column_family_id map
+
+ virtual void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/,
+ const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {}
+
+ // LogRecordFound is invoked for each log record encountered for all the
+ // logs during replay of logs on recovery. This method can be used to:
+ // * inspect the record (using the batch parameter)
+ // * ignore the current record
+ // (by returning WalProcessingOption::kIgnoreCurrentRecord)
+ // * report a corrupted record
+ // (by returning WalProcessingOption::kCorruptedRecord)
+ // * stop log replay
+ // (by returning WalProcessingOption::kStopReplay) - please note that this
+ // implies discarding the logs from the current record onwards.
+ //
+ // @params log_number log_number of the current log.
+ // Filter might use this to determine if the log
+ // record is applicable to a certain column family.
+ // @params log_file_name log file name - only for informational purposes
+ // @params batch batch encountered in the log during recovery
+ // @params new_batch new_batch to populate if filter wants to change
+ // the batch (for example to filter some records out,
+ // or alter some records).
+ // Please note that the new batch MUST NOT contain
+ // more records than original, else recovery would
+ // fail.
+ // @params batch_changed Whether batch was changed by the filter.
+ // It must be set to true if new_batch was populated,
+ // else new_batch has no effect.
+ // @returns Processing option for the current record.
+ // Please see WalProcessingOption enum above for
+ // details.
+ virtual WalProcessingOption LogRecordFound(
+ unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
+ const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+ // The default implementation defers to the older LogRecord() overload for
+ // compatibility; log_number and log_file_name are not passed on.
+ const WalProcessingOption decision =
+ LogRecord(batch, new_batch, batch_changed);
+ return decision;
+ }
+
+ // Please see the comments for LogRecordFound above. This function is for
+ // compatibility only and takes a subset of its parameters.
+ // New code should use the function above.
+ // The default implementation ignores all arguments and returns
+ // WalProcessingOption::kContinueProcessing.
+ virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const {
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ // Returns a name that identifies this WAL filter.
+ // The name will be printed to LOG file on start up for diagnosis.
+ virtual const char* Name() const = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h
new file mode 100644
index 000000000..51fd4d8ac
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch.h
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch. For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+// batch.Put("key", "v1");
+// batch.Delete("key");
+// batch.Put("key", "v2");
+// batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SavePoints;
+struct SliceParts;
+
+struct SavePoint {
+ size_t size; // size of rep_ (in bytes) at the save point
+ int count; // count of elements in rep_ at the save point
+ uint32_t content_flags;  // presumably a copy of WriteBatch::content_flags_ — confirm in write_batch.cc
+
+ SavePoint() : size(0), count(0), content_flags(0) {}  // starts in the cleared state
+
+ SavePoint(size_t _size, int _count, uint32_t _flags)
+ : size(_size), count(_count), content_flags(_flags) {}
+
+ void clear() {  // resets every field to zero
+ size = 0;
+ count = 0;
+ content_flags = 0;
+ }
+
+ bool is_cleared() const { return (size | count | content_flags) == 0; }  // true only when all fields are zero
+};
+
+class WriteBatch : public WriteBatchBase {
+ public:
+ explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0);
+ explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz);
+ ~WriteBatch() override;
+
+ using WriteBatchBase::Put;
+ // Store the mapping "key->value" in the database.
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status Put(const Slice& key, const Slice& value) override {
+ return Put(nullptr, key, value);
+ }
+
+ // Variant of Put() that gathers output like writev(2). The key and value
+ // that will be written to the database are concatenations of arrays of
+ // slices.
+ Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+ Status Put(const SliceParts& key, const SliceParts& value) override {
+ return Put(nullptr, key, value);
+ }
+
+ using WriteBatchBase::Delete;
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+ Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+
+ // variant that takes SliceParts
+ Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+ Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+ using WriteBatchBase::SingleDelete;
+ // WriteBatch implementation of DB::SingleDelete(). See db.h.
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const Slice& key) override {
+ return SingleDelete(nullptr, key);
+ }
+
+ // variant that takes SliceParts
+ Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+ Status SingleDelete(const SliceParts& key) override {
+ return SingleDelete(nullptr, key);
+ }
+
+ using WriteBatchBase::DeleteRange;
+ // WriteBatch implementation of DB::DeleteRange(). See db.h.
+ Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key) override;
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key) override {
+ return DeleteRange(nullptr, begin_key, end_key);
+ }
+
+ // variant that takes SliceParts
+ Status DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) override;
+ Status DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) override {
+ return DeleteRange(nullptr, begin_key, end_key);
+ }
+
+ using WriteBatchBase::Merge;
+ // Merge "value" with the existing value of "key" in the database.
+ // "key->merge(existing, value)"
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status Merge(const Slice& key, const Slice& value) override {
+ return Merge(nullptr, key, value);
+ }
+
+ // variant that takes SliceParts
+ Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+ Status Merge(const SliceParts& key, const SliceParts& value) override {
+ return Merge(nullptr, key, value);
+ }
+
+ using WriteBatchBase::PutLogData;
+ // Append a blob of arbitrary size to the records in this batch. The blob will
+ // be stored in the transaction log but not in any other file. In particular,
+ // it will not be persisted to the SST files. When iterating over this
+ // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+ // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+ // encountered in the same order in which they were inserted. The blob will
+ // NOT consume sequence number(s) and will NOT increase the count of the batch.
+ //
+ // Example application: add timestamps to the transaction log for use in
+ // replication.
+ Status PutLogData(const Slice& blob) override;
+
+ using WriteBatchBase::Clear;
+ // Clear all updates buffered in this batch.
+ void Clear() override;
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ void SetSavePoint() override;
+
+ // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
+ // most recent call to SetSavePoint() and removes the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status RollbackToSavePoint() override;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ Status PopSavePoint() override;
+
+ // Support for iterating over the contents of a batch.
+ class Handler {
+ public:
+ virtual ~Handler();
+ // All handler functions in this class provide default implementations so
+ // we won't break existing clients of Handler on a source code level when
+ // adding a new member function.
+
+ // default implementation will just call Put without column family for
+ // backwards compatibility. If the column family is not default,
+ // the function returns Status::InvalidArgument.
+ virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) {
+ // Put() historically doesn't return status. We didn't want to be
+ // backwards incompatible so we didn't change the return status
+ // (this is a public API). We simply call Put() and return Status::OK()
+ Put(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and PutCF not implemented");
+ }
+ virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
+
+ virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+ if (column_family_id == 0) {
+ Delete(key);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and DeleteCF not implemented");
+ }
+ virtual void Delete(const Slice& /*key*/) {}
+
+ virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+ if (column_family_id == 0) {
+ SingleDelete(key);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and SingleDeleteCF not implemented");
+ }
+ virtual void SingleDelete(const Slice& /*key*/) {}
+
+ virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
+ const Slice& /*begin_key*/,
+ const Slice& /*end_key*/) {
+ return Status::InvalidArgument("DeleteRangeCF not implemented");
+ }
+
+ virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) {
+ Merge(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and MergeCF not implemented");
+ }
+ virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
+
+ virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/,
+ const Slice& /*value*/) {
+ return Status::InvalidArgument("PutBlobIndexCF not implemented");
+ }
+
+ // The default implementation of LogData does nothing.
+ virtual void LogData(const Slice& blob);
+
+ virtual Status MarkBeginPrepare(bool = false) {
+ return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
+ }
+
+ virtual Status MarkEndPrepare(const Slice& /*xid*/) {
+ return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
+ }
+
+ virtual Status MarkNoop(bool /*empty_batch*/) {
+ return Status::InvalidArgument("MarkNoop() handler not defined.");
+ }
+
+ virtual Status MarkRollback(const Slice& /*xid*/) {
+ return Status::InvalidArgument(
+ "MarkRollbackPrepare() handler not defined.");  // NOTE(review): message names MarkRollbackPrepare(), not MarkRollback()
+ }
+
+ virtual Status MarkCommit(const Slice& /*xid*/) {
+ return Status::InvalidArgument("MarkCommit() handler not defined.");
+ }
+
+ // Continue is called by WriteBatch::Iterate. If it returns false,
+ // iteration is halted. Otherwise, it continues iterating. The default
+ // implementation always returns true.
+ virtual bool Continue();
+
+ protected:
+ friend class WriteBatchInternal;
+ virtual bool WriteAfterCommit() const { return true; }
+ virtual bool WriteBeforePrepare() const { return false; }
+ };
+ Status Iterate(Handler* handler) const;
+
+ // Retrieve the serialized version of this batch.
+ const std::string& Data() const { return rep_; }
+
+ // Retrieve data size of the batch.
+ size_t GetDataSize() const { return rep_.size(); }
+
+ // Returns the number of updates in the batch
+ uint32_t Count() const;
+
+ // Returns true if PutCF will be called during Iterate
+ bool HasPut() const;
+
+ // Returns true if DeleteCF will be called during Iterate
+ bool HasDelete() const;
+
+ // Returns true if SingleDeleteCF will be called during Iterate
+ bool HasSingleDelete() const;
+
+ // Returns true if DeleteRangeCF will be called during Iterate
+ bool HasDeleteRange() const;
+
+ // Returns true if MergeCF will be called during Iterate
+ bool HasMerge() const;
+
+ // Returns true if MarkBeginPrepare will be called during Iterate
+ bool HasBeginPrepare() const;
+
+ // Returns true if MarkEndPrepare will be called during Iterate
+ bool HasEndPrepare() const;
+
+ // Returns true if MarkCommit will be called during Iterate
+ bool HasCommit() const;
+
+ // Returns true if MarkRollback will be called during Iterate
+ bool HasRollback() const;
+
+ // Assign timestamp to write batch
+ Status AssignTimestamp(const Slice& ts);
+
+ // Assign timestamps to write batch
+ Status AssignTimestamps(const std::vector<Slice>& ts_list);
+
+ using WriteBatchBase::GetWriteBatch;
+ WriteBatch* GetWriteBatch() override { return this; }
+
+ // Constructor with a serialized string object
+ explicit WriteBatch(const std::string& rep);
+ explicit WriteBatch(std::string&& rep);
+
+ WriteBatch(const WriteBatch& src);
+ WriteBatch(WriteBatch&& src) noexcept;
+ WriteBatch& operator=(const WriteBatch& src);
+ WriteBatch& operator=(WriteBatch&& src);  // NOTE(review): not noexcept, unlike the move constructor
+
+ // Marks this point in the WriteBatch as the last record to
+ // be inserted into the WAL, provided the WAL is enabled.
+ void MarkWalTerminationPoint();
+ const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
+
+ void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
+
+ private:
+ friend class WriteBatchInternal;
+ friend class LocalSavePoint;
+ // TODO(myabandeh): this is needed for a hack to collapse the write batch and
+ // remove duplicate keys. Remove it when the hack is replaced with a proper
+ // solution.
+ friend class WriteBatchWithIndex;
+ std::unique_ptr<SavePoints> save_points_;
+
+ // When sending a WriteBatch through WriteImpl we might want to
+ // specify that only the first x records of the batch be written to
+ // the WAL.
+ SavePoint wal_term_point_;
+
+ // For HasXYZ. Mutable to allow lazy computation of results
+ mutable std::atomic<uint32_t> content_flags_;
+
+ // Performs deferred computation of content_flags if necessary
+ uint32_t ComputeContentFlags() const;
+
+ // Maximum size of rep_.
+ size_t max_bytes_;
+
+ // Is the content of the batch the application's latest state that is meant
+ // only to be used for recovery? Refer to
+ // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
+ // more details.
+ bool is_latest_persistent_state_ = false;
+
+ protected:
+ std::string rep_; // See comment in write_batch.cc for the format of rep_
+ const size_t timestamp_size_;
+
+ // Intentionally copyable (copy constructor and copy assignment are declared above)
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
new file mode 100644
index 000000000..19ff877e7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WriteBatchWithIndex for an
+// indexed implementation.
+class WriteBatchBase {
+ public:
+ virtual ~WriteBatchBase() {}
+
+ // Store the mapping "key->value" in the database.
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Put(const Slice& key, const Slice& value) = 0;
+
+ // Variant of Put() that gathers output like writev(2). The key and value
+ // that will be written to the database are concatenations of arrays of
+ // slices. Not pure virtual, so overriding is optional.
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value);
+ virtual Status Put(const SliceParts& key, const SliceParts& value);
+
+ // Merge "value" with the existing value of "key" in the database.
+ // "key->merge(existing, value)"
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+ virtual Status Merge(const Slice& key, const Slice& value) = 0;
+
+ // Variant that takes SliceParts (not pure virtual, so overriding is optional)
+ virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value);
+ virtual Status Merge(const SliceParts& key, const SliceParts& value);
+
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status Delete(const Slice& key) = 0;
+
+ // Variant that takes SliceParts (not pure virtual, so overriding is optional)
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key);
+ virtual Status Delete(const SliceParts& key);
+
+ // If the database contains a mapping for "key", erase it. Expects that the
+ // key was not overwritten. Else do nothing.
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+ virtual Status SingleDelete(const Slice& key) = 0;
+
+ // Variant that takes SliceParts (not pure virtual, so overriding is optional)
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key);
+ virtual Status SingleDelete(const SliceParts& key);
+
+ // If the database contains mappings in the range ["begin_key", "end_key"),
+ // erase them. Else do nothing.
+ virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) = 0;
+ virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0;
+
+ // Variant that takes SliceParts (not pure virtual, so overriding is optional)
+ virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+ virtual Status DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ // Append a blob of arbitrary size to the records in this batch. The blob will
+ // be stored in the transaction log but not in any other file. In particular,
+ // it will not be persisted to the SST files. When iterating over this
+ // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+ // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+ // encountered in the same order in which they were inserted. The blob will
+ // NOT consume sequence number(s) and will NOT increase the count of the batch.
+ //
+ // Example application: add timestamps to the transaction log for use in
+ // replication.
+ virtual Status PutLogData(const Slice& blob) = 0;
+
+ // Clear all updates buffered in this batch.
+ virtual void Clear() = 0;
+
+ // Convert this batch into a WriteBatch. This is an abstracted way of
+ // converting any WriteBatchBase (e.g. WriteBatchWithIndex) into a basic
+ // WriteBatch.
+ virtual WriteBatch* GetWriteBatch() = 0;
+
+ // Records the state of the batch for future calls to RollbackToSavePoint().
+ // May be called multiple times to set multiple save points.
+ virtual void SetSavePoint() = 0;
+
+ // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
+ // most recent call to SetSavePoint() and removes the most recent save point.
+ // If there is no previous call to SetSavePoint(), behaves the same as
+ // Clear(). NOTE(review): WriteBatch documents Status::NotFound() for this case instead — confirm and reconcile.
+ virtual Status RollbackToSavePoint() = 0;
+
+ // Pop the most recent save point.
+ // If there is no previous call to SetSavePoint(), Status::NotFound()
+ // will be returned.
+ // Otherwise returns Status::OK().
+ virtual Status PopSavePoint() = 0;
+
+ // Sets the maximum size of the write batch in bytes. 0 means no limit.
+ virtual void SetMaxBytes(size_t max_bytes) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h
new file mode 100644
index 000000000..ae1c98caf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBufferManager is for managing memory allocation for one or more
+// MemTables.
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBufferManager {
+ public:
+ // _buffer_size = 0 indicates no limit. Memory won't be capped.
+ // memory_usage() won't be valid and ShouldFlush() will always return false.
+ // If `cache` is provided, we'll put dummy entries in the cache and cost
+ // the memory allocated to the cache. It can be used even if _buffer_size = 0.
+ explicit WriteBufferManager(size_t _buffer_size,
+ std::shared_ptr<Cache> cache = {});
+ // No copying allowed
+ WriteBufferManager(const WriteBufferManager&) = delete;
+ WriteBufferManager& operator=(const WriteBufferManager&) = delete;
+
+ ~WriteBufferManager();
+
+ bool enabled() const { return buffer_size_ != 0; }
+
+ bool cost_to_cache() const { return cache_rep_ != nullptr; }
+
+ // Only valid if enabled()
+ size_t memory_usage() const {
+ return memory_used_.load(std::memory_order_relaxed);
+ }
+ size_t mutable_memtable_memory_usage() const {  // reads memory_active_
+ return memory_active_.load(std::memory_order_relaxed);
+ }
+ size_t buffer_size() const { return buffer_size_; }
+
+ // Should only be called from write thread. Always returns false if !enabled().
+ bool ShouldFlush() const {
+ if (enabled()) {
+ if (mutable_memtable_memory_usage() > mutable_limit_) {  // active memory is over mutable_limit_
+ return true;
+ }
+ if (memory_usage() >= buffer_size_ &&
+ mutable_memtable_memory_usage() >= buffer_size_ / 2) {
+ // If the memory exceeds the buffer size, we trigger more aggressive
+ // flush. But if already more than half memory is being flushed,
+ // triggering more flush may not help. We will hold it instead.
+ return true;
+ }
+ }
+ return false;  // disabled, or neither threshold was reached
+ }
+
+ void ReserveMem(size_t mem) {  // adds `mem` to the used total and, if enabled(), to the active total
+ if (cache_rep_ != nullptr) {
+ ReserveMemWithCache(mem);  // used-memory accounting is delegated when a cache is attached
+ } else if (enabled()) {
+ memory_used_.fetch_add(mem, std::memory_order_relaxed);
+ }
+ if (enabled()) {
+ memory_active_.fetch_add(mem, std::memory_order_relaxed);
+ }
+ }
+ // We are in the process of freeing `mem` bytes, so it is not considered
+ // when checking the soft limit.
+ void ScheduleFreeMem(size_t mem) {
+ if (enabled()) {
+ memory_active_.fetch_sub(mem, std::memory_order_relaxed);
+ }
+ }
+ void FreeMem(size_t mem) {  // subtracts `mem` from the used total; memory_active_ is not touched here
+ if (cache_rep_ != nullptr) {
+ FreeMemWithCache(mem);
+ } else if (enabled()) {
+ memory_used_.fetch_sub(mem, std::memory_order_relaxed);
+ }
+ }
+
+ private:
+ const size_t buffer_size_;
+ const size_t mutable_limit_;
+ std::atomic<size_t> memory_used_;
+ // Memory that hasn't been scheduled to free.
+ std::atomic<size_t> memory_active_;
+ struct CacheRep;
+ std::unique_ptr<CacheRep> cache_rep_;
+
+ void ReserveMemWithCache(size_t mem);
+ void FreeMemWithCache(size_t mem);
+};
+} // namespace ROCKSDB_NAMESPACE