Diffstat
102 files changed, 31477 insertions, 0 deletions
diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h
new file mode 100644
index 000000000..258cf82a1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/advanced_options.h
@@ -0,0 +1,1098 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+class TablePropertiesCollectorFactory;
+class TableFactory;
+struct Options;
+
+enum CompactionStyle : char {
+  // Level-based compaction style
+  kCompactionStyleLevel = 0x0,
+  // Universal compaction style
+  // Not supported in ROCKSDB_LITE.
+  kCompactionStyleUniversal = 0x1,
+  // FIFO compaction style
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleFIFO = 0x2,
+  // Disable background compaction. Compaction jobs are submitted
+  // via CompactFiles().
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleNone = 0x3,
+};
+
+// In level-based compaction, determines which file from a level is picked to
+// merge to the next level. We suggest trying kMinOverlappingRatio first when
+// you tune your database.
+enum CompactionPri : char {
+  // Slightly prioritize larger files by size compensated by #deletes
+  kByCompensatedSize = 0x0,
+  // First compact files whose data's latest update time is oldest.
+  // Try this if you only update some hot keys in small ranges.
+  kOldestLargestSeqFirst = 0x1,
+  // First compact files whose range hasn't been compacted to the next level
+  // for the longest. If your updates are random across the key space,
+  // write amplification is slightly better with this option.
+  kOldestSmallestSeqFirst = 0x2,
+  // First compact files whose ratio between overlapping size in next level
+  // and its size is the smallest. In many cases it can optimize write
+  // amplification.
+  kMinOverlappingRatio = 0x3,
+  // Keeps cursor(s) at the successor of the file (key range) that was
+  // compacted before, and always picks the next file (key range) in that
+  // level. The file picking process will cycle through all the files in a
+  // round-robin manner.
+  kRoundRobin = 0x4,
+};
+
+struct CompactionOptionsFIFO {
+  // Once the total sum of table files reaches this, we will delete the oldest
+  // table file
+  // Default: 1GB
+  uint64_t max_table_files_size;
+
+  // If true, try to do compaction to compact smaller files into larger ones.
+  // The minimum number of files to compact follows
+  // options.level0_file_num_compaction_trigger, and compaction won't trigger
+  // if the average compacted bytes per deleted file exceed
+  // options.write_buffer_size. This is to protect large files from being
+  // compacted again.
+  // Default: false
+  bool allow_compaction = false;
+
+  // When not 0, if the data in the file is older than this threshold, RocksDB
+  // will soon move the file to warm temperature.
+  uint64_t age_for_warm = 0;
+
+  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
+      : max_table_files_size(_max_table_files_size),
+        allow_compaction(_allow_compaction) {}
+};
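For orientation (not part of the diff): a minimal sketch of how the FIFO options above might be wired up through the public Options API. The database path and size threshold are placeholders; `compaction_options_fifo` is the AdvancedColumnFamilyOptions member declared later in this header.

    #include <rocksdb/db.h>
    #include <rocksdb/options.h>

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.compaction_style = rocksdb::kCompactionStyleFIFO;

      rocksdb::CompactionOptionsFIFO fifo;
      fifo.max_table_files_size = 10ULL << 30;  // keep at most ~10GB of SSTs
      fifo.allow_compaction = true;             // merge small L0 files together
      options.compaction_options_fifo = fifo;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/fifo_db", &db);
      if (!s.ok()) return 1;
      delete db;
      return 0;
    }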
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  // RocksDB's generic default compression level. Internally it'll be
+  // translated to the default compression level specific to the library being
+  // used (see comment above `ColumnFamilyOptions::compression`).
+  //
+  // The default value is the max 16-bit int as it'll be written out in the
+  // OPTIONS file, which should be portable.
+  const static int kDefaultCompressionLevel = 32767;
+
+  int window_bits;
+  int level;
+  int strategy;
+
+  // Maximum size of dictionaries used to prime the compression library.
+  // Enabling dictionary can improve compression ratios when there are
+  // repetitions across data blocks.
+  //
+  // The dictionary is created by sampling the SST file data. If
+  // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
+  // dictionary generator (see comments for option `use_zstd_dict_trainer` for
+  // detail on the dictionary generator). If `zstd_max_train_bytes` is zero,
+  // the random samples are used directly as the dictionary.
+  //
+  // When compression dictionary is disabled, we compress and write each block
+  // before buffering data for the next one. When compression dictionary is
+  // enabled, we buffer SST file data in-memory so we can sample it, as data
+  // can only be compressed and written after the dictionary has been
+  // finalized.
+  //
+  // The amount of data buffered can be limited by `max_dict_buffer_bytes`.
+  // This buffered memory is charged to the block cache when there is a block
+  // cache. If block cache insertion fails with `Status::MemoryLimit` (i.e.,
+  // it is full), we finalize the dictionary with whatever data we have and
+  // then stop buffering.
+  //
+  // Default: 0.
+  uint32_t max_dict_bytes;
+
+  // Maximum size of training data passed to zstd's dictionary trainer. Using
+  // zstd's dictionary trainer can achieve even better compression ratio
+  // improvements than using `max_dict_bytes` alone.
+  //
+  // The training data will be used to generate a dictionary of max_dict_bytes.
+  //
+  // Default: 0.
+  uint32_t zstd_max_train_bytes;
+
+  // Number of threads for parallel compression.
+  // Parallel compression is enabled only if threads > 1.
+  // THE FEATURE IS STILL EXPERIMENTAL
+  //
+  // This option is valid only when BlockBasedTable is used.
+  //
+  // When parallel compression is enabled, SST file sizes might be more
+  // inflated compared to the target size, because more data of unknown
+  // compressed size is in flight when compression is parallelized. To be
+  // reasonably accurate, this inflation is also estimated by using historical
+  // compression ratio and current bytes in flight.
+  //
+  // Default: 1.
+  uint32_t parallel_threads;
+
+  // Set to "true" when the compression options are set by the user.
+  // For bottommost_compression_opts, the user must set enabled=true to enable
+  // it; otherwise, bottommost compression will use compression_opts as its
+  // default compression options.
+  //
+  // For compression_opts itself, even if compression_opts.enabled=false, it
+  // is still used as the compression options for the compression process.
+  //
+  // Default: false.
+  bool enabled;
+
+  // Limit on data buffering when gathering samples to build a dictionary.
+  // Zero means no limit. When dictionary is disabled (`max_dict_bytes == 0`),
+  // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect.
+  //
+  // In compaction, the buffering is limited to the target file size (see
+  // `target_file_size_base` and `target_file_size_multiplier`) even if this
+  // setting permits more buffering. Since we cannot determine where the file
+  // should be cut until data blocks are compressed with dictionary, buffering
+  // more than the target file size could lead to selecting samples that
+  // belong to a later output SST.
+  //
+  // Limiting too strictly may harm dictionary effectiveness since it forces
+  // RocksDB to pick samples from the initial portion of the output SST, which
+  // may not be representative of the whole file. Configuring this limit below
+  // `zstd_max_train_bytes` (when enabled) can restrict how many samples we
+  // can pass to the dictionary trainer. Configuring it below `max_dict_bytes`
+  // can restrict the size of the final dictionary.
+  //
+  // Default: 0 (unlimited)
+  uint64_t max_dict_buffer_bytes;
+
+  // Use the zstd trainer to generate dictionaries. When this option is set to
+  // true, zstd_max_train_bytes of training data sampled from
+  // max_dict_buffer_bytes buffered data will be passed to the zstd dictionary
+  // trainer to generate a dictionary of size max_dict_bytes.
+  //
+  // When this option is false, zstd's API ZDICT_finalizeDictionary() will be
+  // called to generate dictionaries. zstd_max_train_bytes of sampled training
+  // data will be passed to this API. Using this API should save CPU time on
+  // dictionary training, but the compression ratio may not be as good as
+  // using a dictionary trainer.
+  //
+  // Default: true
+  bool use_zstd_dict_trainer;
+
+  CompressionOptions()
+      : window_bits(-14),
+        level(kDefaultCompressionLevel),
+        strategy(0),
+        max_dict_bytes(0),
+        zstd_max_train_bytes(0),
+        parallel_threads(1),
+        enabled(false),
+        max_dict_buffer_bytes(0),
+        use_zstd_dict_trainer(true) {}
+  CompressionOptions(int wbits, int _lev, int _strategy,
+                     uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes,
+                     uint32_t _parallel_threads, bool _enabled,
+                     uint64_t _max_dict_buffer_bytes,
+                     bool _use_zstd_dict_trainer)
+      : window_bits(wbits),
+        level(_lev),
+        strategy(_strategy),
+        max_dict_bytes(_max_dict_bytes),
+        zstd_max_train_bytes(_zstd_max_train_bytes),
+        parallel_threads(_parallel_threads),
+        enabled(_enabled),
+        max_dict_buffer_bytes(_max_dict_buffer_bytes),
+        use_zstd_dict_trainer(_use_zstd_dict_trainer) {}
+};
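A hedged sketch (not part of the diff) of how these fields combine for dictionary compression, assuming RocksDB was built with ZSTD support; the sizes are illustrative, not recommendations.

    #include <rocksdb/options.h>

    rocksdb::ColumnFamilyOptions MakeDictCompressedCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.compression = rocksdb::kZSTD;
      cf.compression_opts.max_dict_bytes = 16 * 1024;        // 16KB dictionaries
      cf.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
      cf.compression_opts.max_dict_buffer_bytes = 64 << 20;  // cap sampling buffer
      cf.compression_opts.use_zstd_dict_trainer = true;      // train, don't just finalize
      return cf;
    }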
+// Temperature of a file. Used to pass to FileSystem for a different
+// placement and/or coding.
+// Reserve some numbers in the middle, in case we need to insert new tier
+// there.
+enum class Temperature : uint8_t {
+  kUnknown = 0,
+  kHot = 0x04,
+  kWarm = 0x08,
+  kCold = 0x0C,
+  kLastTemperature,
+};
+
+// The control option of how the cache tiers will be used. Currently RocksDB
+// supports the block cache (volatile tier) and the secondary cache
+// (non-volatile tier). In the future, we may add more caching layers.
+enum class CacheTier : uint8_t {
+  kVolatileTier = 0,
+  kNonVolatileBlockTier = 0x01,
+};
+
+enum UpdateStatus {     // Return status for inplace update callback
+  UPDATE_FAILED = 0,    // Nothing to update
+  UPDATED_INPLACE = 1,  // Value updated inplace
+  UPDATED = 2,          // No inplace update. Merged value set
+};
+
+enum class PrepopulateBlobCache : uint8_t {
+  kDisable = 0x0,    // Disable prepopulate blob cache
+  kFlushOnly = 0x1,  // Prepopulate blobs during flush only
+};
+
+struct AdvancedColumnFamilyOptions {
+  // The maximum number of write buffers that are built up in memory.
+  // The default and the minimum number is 2, so that when 1 write buffer
+  // is being flushed to storage, new writes can continue to the other
+  // write buffer.
+  // If max_write_buffer_number > 3, writing will be slowed down to
+  // options.delayed_write_rate if we are writing to the last write buffer
+  // allowed.
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetOptions() API
+  int max_write_buffer_number = 2;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage. If set to 1, then
+  // all write buffers are flushed to L0 as individual files, and this
+  // increases read amplification because a get request has to check all of
+  // these files. Also, an in-memory merge may result in writing less data to
+  // storage if there are duplicate records in each of these individual write
+  // buffers.
+  // If atomic flush is enabled (options.atomic_flush == true), then this
+  // option will be sanitized to 1.
+  // Default: 1
+  int min_write_buffer_number_to_merge = 1;
+
+  // DEPRECATED
+  // The total maximum number of write buffers to maintain in memory including
+  // copies of buffers that have already been flushed. Unlike
+  // max_write_buffer_number, this parameter does not affect flushing.
+  // This parameter is being replaced by max_write_buffer_size_to_maintain.
+  // If both parameters are set to non-zero values, this parameter will be
+  // ignored.
+  int max_write_buffer_number_to_maintain = 0;
+
+  // The target number of write history bytes to hold in memory. Write history
+  // comprises the latest write buffers (memtables). To reach the target,
+  // write buffers that were most recently flushed to SST files may be
+  // retained in memory.
+  //
+  // This controls the target amount of write history that will be available
+  // in memory for conflict checking when Transactions are used.
+  //
+  // This target may be undershot when the CF first opens and has not
+  // recovered or received enough writes to reach the target. After reaching
+  // the target once, it is guaranteed to never undershoot again. That
+  // guarantee is implemented by retaining flushed write buffers in-memory
+  // until the oldest one can be trimmed without dropping below the target.
+  //
+  // Examples with `max_write_buffer_size_to_maintain` set to 32MB:
+  //
+  // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB,
+  //   and zero flushed immutable memtables. Nothing trimmable exists.
+  // - One mutable memtable of 16MB, zero unflushed immutable memtables, and
+  //   one flushed immutable memtable of 64MB. Trimming is disallowed because
+  //   dropping the earliest (only) flushed immutable memtable would result in
+  //   write history of 16MB < 32MB.
+  // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB,
+  //   and one flushed immutable memtable of 16MB. The earliest (only) flushed
+  //   immutable memtable is trimmed because without it we still have
+  //   16MB + 24MB = 40MB > 32MB of write history.
+  //
+  // When using an OptimisticTransactionDB:
+  // If this value is too low, some transactions may fail at commit time due
+  // to not being able to determine whether there were any write conflicts.
+  //
+  // When using a TransactionDB:
+  // If Transaction::SetSnapshot is used, TransactionDB will read either
+  // in-memory write buffers or SST files to do write-conflict checking.
+  // Increasing this value can reduce the number of reads to SST files
+  // done for conflict detection.
+  //
+  // Setting this value to 0 will cause write buffers to be freed immediately
+  // after they are flushed. If this value is set to -1,
+  // 'max_write_buffer_number * write_buffer_size' will be used.
+  //
+  // Default:
+  // If using a TransactionDB/OptimisticTransactionDB, the default value will
+  // be set to the value of 'max_write_buffer_number * write_buffer_size'
+  // if it is not explicitly set by the user. Otherwise, the default is 0.
+  int64_t max_write_buffer_size_to_maintain = 0;
+
+  // Allows thread-safe inplace updates. If this is true, there is no way to
+  // achieve point-in-time consistency using snapshot or iterator (assuming
+  // concurrent updates). Hence iterator and multi-get will return results
+  // which are not consistent as of any point-in-time.
+  // Backward iteration on memtables will not work either.
+  // If the inplace_callback function is not set,
+  // Put(key, new_value) will update the existing_value inplace iff
+  // * key exists in current memtable
+  // * new sizeof(new_value) <= sizeof(existing_value)
+  // * existing_value for that key is a put i.e. kTypeValue
+  // If the inplace_callback function is set, check the doc for
+  // inplace_callback.
+  // Default: false.
+  bool inplace_update_support = false;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t inplace_update_num_locks = 10000;
+
+  // [experimental]
+  // Used to activate or deactivate the Mempurge feature (memtable garbage
+  // collection), which is deactivated by default. At every flush, the total
+  // useful payload (total entries minus garbage entries) is estimated as a
+  // ratio [useful payload bytes]/[size of a memtable (in bytes)]. This ratio
+  // is then compared to this `threshold` value:
+  //     - if ratio < threshold: the flush is replaced by a mempurge operation
+  //     - else: a regular flush operation takes place.
+  // Threshold values:
+  //   0.0: mempurge deactivated (default).
+  //   1.0: recommended threshold value.
+  //   >1.0: aggressive mempurge.
+  //   0 < threshold < 1.0: mempurge triggered only for very low useful
+  //   payload ratios.
+  // [experimental]
+  double experimental_mempurge_threshold = 0.0;
+
+  // existing_value - pointer to previous value (from both memtable and sst).
+  //                  nullptr if key doesn't exist
+  // existing_value_size - pointer to size of existing_value.
+  //                       nullptr if key doesn't exist
+  // delta_value - Delta value to be merged with the existing_value.
+  //               Stored in transaction logs.
+  // merged_value - Set when delta is applied on the previous value.
+  //
+  // Applicable only when inplace_update_support is true,
+  // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, let's say Put(key, delta_value). It allows
+  // the 'delta_value' specified as part of the Put operation to be merged
+  // with an 'existing_value' of the key in the database.
+  //
+  // If the merged value is smaller in size than the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value_size' pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // (In this case, the snapshot semantics of the RocksDB Iterator are no
+  // longer atomic.)
+  //
+  // If the merged value is larger in size than the 'existing_value' or the
+  // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merged_value. It is set by
+  // merging the 'existing_value' and the Put 'delta_value'. The callback
+  // should return UpdateStatus::UPDATED in this case. This merged value will
+  // be added to the memtable.
+  //
+  // If merging fails or the application does not wish to take any action,
+  // then the callback should return UpdateStatus::UPDATE_FAILED.
+  //
+  // Please remember that the original call from the application is Put(key,
+  // delta_value). So the transaction log (if enabled) will still contain
+  // (key, delta_value). The 'merged_value' is not stored in the transaction
+  // log. Hence the inplace_callback function should be consistent across db
+  // reopens.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  //
+  // Default: nullptr
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value) = nullptr;
+
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
+  // write_buffer_size * memtable_prefix_bloom_size_ratio.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
+  //
+  // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  double memtable_prefix_bloom_size_ratio = 0.0;
+
+  // Enable whole key bloom filter in memtable. Note this will only take
+  // effect if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key
+  // filtering can potentially reduce CPU usage for point lookups.
+  //
+  // Default: false (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  bool memtable_whole_key_filtering = false;
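A sketch (not part of the diff) of enabling the memtable Bloom filter described above; the 8-byte prefix length is an assumption for illustration.

    #include <rocksdb/options.h>
    #include <rocksdb/slice_transform.h>

    rocksdb::ColumnFamilyOptions MakeMemtableBloomCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
      cf.memtable_prefix_bloom_size_ratio = 0.02;  // filter = 2% of write_buffer_size
      cf.memtable_whole_key_filtering = true;      // also cover point lookups
      return cf;
    }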
+  // Page size for huge page for the arena used by the memtable. If <= 0, it
+  // won't allocate from huge page but from malloc.
+  // Users are responsible for reserving huge pages for it to be allocated.
+  // For example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+  // If there isn't enough free huge page available, it will fall back to
+  // malloc.
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t memtable_huge_page_size = 0;
+
+  // If non-nullptr, memtable will use the specified function to extract
+  // prefixes for keys, and for each prefix maintain a hint of insert location
+  // to reduce CPU usage for inserting keys with the prefix. Keys out of
+  // domain of the prefix extractor will be inserted without using hints.
+  //
+  // Currently only the default skiplist-based memtable implements the
+  // feature. All other memtable implementations will ignore the option. It
+  // incurs ~250 additional bytes of memory overhead to store a hint for each
+  // prefix. Also, concurrent writes (when allow_concurrent_memtable_write is
+  // true) will ignore the option.
+  //
+  // The option is best suited for workloads where keys are likely to be
+  // inserted at a location close to the last inserted key with the same
+  // prefix. One example could be inserting keys of the form
+  // (prefix + timestamp), where keys of the same prefix always come in time
+  // order. Another example would be updating the same key over and over
+  // again, in which case the prefix can be the key itself.
+  //
+  // Default: nullptr (disabled)
+  std::shared_ptr<const SliceTransform>
+      memtable_insert_with_hint_prefix_extractor = nullptr;
+
+  // Control locality of bloom filter probes to improve CPU cache hit rate.
+  // This option now only applies to plaintable prefix bloom. This
+  // optimization is turned off when set to 0, and on when set to a positive
+  // number.
+  // Default: 0
+  uint32_t bloom_locality = 0;
+
+  // Size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/8 of
+  // write_buffer_size, rounded up to a multiple of 4KB, or 1MB, whichever is
+  // smaller).
+  //
+  // There are two additional restrictions on the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) it should be a multiple of the CPU word size (which helps with the
+  //     memory alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t arena_block_size = 0;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to use quick compression
+  // algorithms while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non-empty, should have an entry for
+  // each level of the database; these override the value specified in
+  // the previous field 'compression'.
+  //
+  // NOTICE: if level_compaction_dynamic_level_bytes=true,
+  // compression_per_level[0] still determines L0, but the other elements
+  // of the array are based on the base level (the level L0 files are merged
+  // to), and may not match the level users see from the info log for
+  // metadata. If L0 files are merged to level-n, then, for i>0,
+  // compression_per_level[i] determines the compaction type for level n+i-1.
+  // For example, if we have 5 levels and decide to merge L0 data to L4
+  // (which means L1..L3 will be empty), then new files going to L4 use
+  // compression type compression_per_level[1].
+  // If L0 is instead merged to L2, data going to L2 will be compressed
+  // according to compression_per_level[1], L3 using compression_per_level[2],
+  // and L4 using compression_per_level[3]. Compaction for each level can
+  // change as data grows.
+  //
+  // NOTE: if the vector size is smaller than the number of levels, the
+  // undefined lower levels use the last option in the vector; for example,
+  // for a 3-level LSM tree the following settings are the same:
+  //   {kNoCompression, kSnappyCompression}
+  //   {kNoCompression, kSnappyCompression, kSnappyCompression}
+  //
+  // Dynamically changeable through SetOptions() API
+  std::vector<CompressionType> compression_per_level;
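A sketch (not part of the diff) of per-level compression following the notes above: light codecs for the upper levels and a stronger one for the data-heavy bottom levels. It assumes LZ4 and ZSTD support are compiled in; levels beyond the vector reuse the last entry.

    #include <rocksdb/options.h>

    rocksdb::ColumnFamilyOptions MakeTieredCompressionCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.num_levels = 7;
      cf.compression_per_level = {
          rocksdb::kNoCompression,   // L0
          rocksdb::kLZ4Compression,  // L1 (or the dynamic base level)
          rocksdb::kLZ4Compression,
          rocksdb::kZSTD,            // remaining levels fall back to this entry
      };
      return cf;
    }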
+  // Number of levels for this database
+  int num_levels = 7;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at
+  // this point. A value < 0 means that no write slowdown will be triggered by
+  // the number of files in level-0.
+  //
+  // Default: 20
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_slowdown_writes_trigger = 20;
+
+  // Maximum number of level-0 files. We stop writes at this point.
+  //
+  // Default: 36
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_stop_writes_trigger = 36;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, each file on level-2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+  //
+  // Default: 64MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t target_file_size_base = 64 * 1048576;
+
+  // By default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  //
+  // Dynamically changeable through SetOptions() API
+  int target_file_size_multiplier = 1;
+
+  // If true, RocksDB will pick the target size of each level dynamically.
+  // We will pick a base level b >= 1. L0 will be directly merged into level b,
+  // instead of always into level 1. Levels 1 to b-1 need to be empty.
+  // We try to pick b and its target size so that
+  // 1. target size is in the range of
+  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+  //    max_bytes_for_level_base]
+  // 2. target size of the last level (level num_levels-1) equals the extra
+  //    size of the level.
+  // At the same time max_bytes_for_level_multiplier and
+  // max_bytes_for_level_multiplier_additional are still satisfied.
+  // (When L0 is too large, we make some adjustment. See below.)
+  //
+  // With this option on, starting from an empty DB, we make the last level
+  // the base level, which means merging L0 data into the last level, until
+  // it exceeds max_bytes_for_level_base. Then we make the second-to-last
+  // level the base level and start merging L0 data into it, with its target
+  // size being 1/max_bytes_for_level_multiplier of the last level's extra
+  // size. As more data accumulates, the base level moves to the
+  // third-to-last level, and so on.
+  //
+  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+  // and max_bytes_for_level_base=10MB.
+  // Target sizes of levels 1 to 5 start as:
+  //   [- - - - 10MB]
+  // with the base level being level 5. Target sizes of levels 1 to 4 are not
+  // applicable because they will not be used.
+  // Once the size of level 5 grows to more than 10MB, say 11MB, we make
+  // level 4 the base level and the targets now look like:
+  //   [- - - 1.1MB 11MB]
+  // As data accumulates, the size targets are tuned based on the actual data
+  // in level 5. When level 5 has 50MB of data, the targets look like:
+  //   [- - - 5MB 50MB]
+  // Once level 5's actual size exceeds 100MB, say reaching 101MB, keeping
+  // level 4 as the base level would require its target size to be 10.1MB,
+  // which doesn't satisfy the target size range.
+  // So now we make level 3 the base level and the target sizes of the levels
+  // look like:
+  //   [- - 1.01MB 10.1MB 101MB]
+  // In the same way, as level 5 grows further, all levels' targets grow,
+  // like
+  //   [- - 5MB 50MB 500MB]
+  // Once level 5 exceeds 1000MB, say reaching 1001MB, we make level 2 the
+  // base level and the levels' target sizes become:
+  //   [- 1.001MB 10.01MB 100.1MB 1001MB]
+  // and so on...
+  //
+  // By doing this, we give max_bytes_for_level_multiplier priority over
+  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+  // useful for limiting worst-case space amplification.
+  //
+  //
+  // If compaction from L0 lags behind, a special mode is turned on that
+  // prioritizes write amplification over max_bytes_for_level_multiplier or
+  // max_bytes_for_level_base. Whether L0 compaction is lagging is determined
+  // by looking at the number of L0 files and the total L0 size: if the number
+  // of L0 files is at least double level0_file_num_compaction_trigger, or the
+  // total size is at least max_bytes_for_level_base, this mode is on.
+  // The target of L1 grows to the actual data size in L0, and the targets of
+  // the other levels are then determined so that each level has the same
+  // level multiplier.
+  //
+  // For example, when the L0 size is 100MB, the size of the last level is
+  // 1600MB, max_bytes_for_level_base = 80MB, and
+  // max_bytes_for_level_multiplier = 10. Since the L0 size is larger than
+  // max_bytes_for_level_base, this is the L0-compaction-backlogged mode, so
+  // the L1 size is determined to be 100MB. Based on
+  // max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will be
+  // needed. The level multiplier is calculated to be 4 and the three levels'
+  // targets become [100MB, 400MB, 1600MB].
+  //
+  // In this mode, the number of levels will be no more than in the normal
+  // mode, and the level multiplier will be lower. Write amplification is
+  // likely to be reduced.
+  //
+  //
+  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+  //
+  // Turning this feature on or off for an existing DB can cause an unexpected
+  // LSM tree structure, so it's not recommended.
+  //
+  // Default: false
+  bool level_compaction_dynamic_level_bytes = false;
+
+  // Allows RocksDB to generate files that are not exactly the
+  // target_file_size, but only for non-bottommost files, which can reduce the
+  // write amplification from compaction. The file size can be from 0 to
+  // 2x target_file_size. Once enabled, non-bottommost compactions will try to
+  // cut files aligned with the file boundaries of the next level (the
+  // grandparent level).
+  //
+  // Default: true
+  bool level_compaction_dynamic_file_size = true;
+
+  // Default: 10.
+  //
+  // Dynamically changeable through SetOptions() API
+  double max_bytes_for_level_multiplier = 10;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  //
+  // Default: 1
+  //
+  // Dynamically changeable through SetOptions() API
+  std::vector<int> max_bytes_for_level_multiplier_additional =
+      std::vector<int>(num_levels, 1);
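A sketch (not part of the diff) of a leveled setup using the dynamic sizing scheme described above; the numbers are illustrative only.

    #include <rocksdb/options.h>

    rocksdb::ColumnFamilyOptions MakeDynamicLeveledCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.compaction_style = rocksdb::kCompactionStyleLevel;
      cf.level_compaction_dynamic_level_bytes = true;
      cf.max_bytes_for_level_base = 256ULL << 20;  // base-level target: 256MB
      cf.max_bytes_for_level_multiplier = 10;      // 10x growth per level
      cf.num_levels = 7;
      return cf;
    }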
+  // We try to limit the number of bytes in one compaction to be lower than
+  // this threshold. But it's not guaranteed.
+  // A value of 0 will be sanitized.
+  //
+  // Default: target_file_size_base * 25
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_compaction_bytes = 0;
+
+  // When setting up compaction input files, we ignore the
+  // `max_compaction_bytes` limit when pulling in input files that are
+  // entirely within the output key range.
+  //
+  // Default: true
+  //
+  // Dynamically changeable through SetOptions() API
+  // We could remove this knob and always ignore the limit once it is proven
+  // safe.
+  bool ignore_max_compaction_bytes_for_input = true;
+
+  // All writes will be slowed down to at least delayed_write_rate if the
+  // estimated number of bytes pending compaction exceeds this threshold.
+  //
+  // Default: 64GB
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
+
+  // All writes are stopped if the estimated number of bytes pending
+  // compaction exceeds this threshold.
+  //
+  // Default: 256GB
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // If compaction_style = kCompactionStyleLevel, this determines, for each
+  // level, which files are prioritized to be picked for compaction.
+  // Default: kMinOverlappingRatio
+  CompactionPri compaction_pri = kMinOverlappingRatio;
+
+  // The options needed to support Universal Style compactions
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  //   SetOptions("compaction_options_universal", "{size_ratio=2;}")
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // The options for FIFO compaction style
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  //   SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
+  CompactionOptionsFIFO compaction_options_fifo;
+
+  // Iterator::Next() sequentially skips over keys with the same user key
+  // during iteration. This number specifies the maximum number of keys (with
+  // the same user key) that will be sequentially skipped before a reseek is
+  // issued.
+  //
+  // Default: 8
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_sequential_skip_in_iterations = 8;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory =
+      std::shared_ptr<SkipListFactory>(new SkipListFactory);
+
+  // Block-based table related options are moved to BlockBasedTableOptions.
+  // Related options that were originally here but now moved include:
+  //   no_block_cache
+  //   block_cache
+  //   block_cache_compressed
+  //   block_size
+  //   block_size_deviation
+  //   block_restart_interval
+  //   filter_policy
+  //   whole_key_filtering
+  // If you'd like to customize some of these options, you will need to
+  // use NewBlockBasedTableFactory() to construct a new table factory.
+
+  // This option allows users to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  using TablePropertiesCollectorFactories =
+      std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+  TablePropertiesCollectorFactories table_properties_collector_factories;
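Many of the options above are marked "Dynamically changeable through SetOptions() API". A sketch (not part of the diff) of such a runtime change on the default column family, mirroring the string syntax shown in the comments; `db` is assumed to be an open rocksdb::DB*.

    #include <rocksdb/db.h>

    rocksdb::Status TuneCompactionAtRuntime(rocksdb::DB* db) {
      return db->SetOptions({
          {"level0_slowdown_writes_trigger", "30"},
          {"max_bytes_for_level_multiplier", "8"},
          {"compaction_options_fifo", "{max_table_files_size=100;}"},
      });
    }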
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t max_successive_merges = 0;
+
+  // This flag specifies that the implementation should optimize the filters
+  // mainly for cases where keys are found rather than also optimizing for
+  // keys missed. This would be used in cases where the application knows that
+  // there are very few misses or the performance in the case of misses is not
+  // important.
+  //
+  // For now, this flag allows us to not store filters for the last level,
+  // i.e. the largest level which contains data of the LSM store. For keys
+  // which are hits, the filters in this level are not useful because we will
+  // search for the data anyway. NOTE: the filters in other levels are still
+  // useful even for key hits because they tell us whether to look in that
+  // level or go to the higher level.
+  //
+  // Default: false
+  bool optimize_filters_for_hits = false;
+
+  // During flush or compaction, check whether keys inserted to output files
+  // are in order.
+  //
+  // Default: true
+  //
+  // Dynamically changeable through SetOptions() API
+  bool check_flush_compaction_key_order = true;
+
+  // After writing every SST file, reopen it and read all the keys.
+  // Checks the hash of all of the keys and values written versus the
+  // keys in the file and signals a corruption if they do not match
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool paranoid_file_checks = false;
+
+  // In debug mode, RocksDB runs consistency checks on the LSM every time the
+  // LSM changes (Flush, Compaction, AddFile). When this option is true, these
+  // checks are also enabled in release mode. These checks were historically
+  // disabled in release mode, but are now enabled by default for proactive
+  // corruption detection. The CPU overhead is negligible for normal mixed
+  // operations but can slow down saturated writing. See
+  // Options::DisableExtraChecks().
+  // Default: true
+  bool force_consistency_checks = true;
+
+  // Measure IO stats in compactions and flushes, if true.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool report_bg_io_stats = false;
+
+  // Files containing updates older than TTL will go through the compaction
+  // process. This usually happens in a cascading way so that those entries
+  // will be compacted to the bottommost level/file.
+  // The feature is used to remove stale entries that have been deleted or
+  // updated from the file system.
+  // Pre-req: This needs max_open_files to be set to -1.
+  // In Level: Non-bottom-level files older than TTL will go through the
+  //           compaction process.
+  // In FIFO: Files older than TTL will be deleted.
+  // Unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+  // In FIFO, this option will have the same meaning as
+  // periodic_compaction_seconds. Whichever is stricter will be used.
+  // 0 means disabled.
+  // UINT64_MAX - 1 (0xfffffffffffffffe) is a special flag to allow RocksDB
+  // to pick a default.
+  //
+  // Default: 30 days for leveled compaction + block-based table; disabled
+  // otherwise.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t ttl = 0xfffffffffffffffe;
+
+  // Files older than this value will be picked up for compaction, and
+  // re-written to the same level as they were before.
+  // One main use of the feature is to make sure a file goes through
+  // compaction filters periodically. Users can also use the feature to clear
+  // up SST files using an old format.
+  //
+  // A file's age is computed by looking at the file_creation_time or
+  // creation_time table properties in order, if they have valid non-zero
+  // values; if not, the age is based on the file's last modified time (given
+  // by the underlying Env).
+  //
+  // Supported in Level and FIFO compaction.
+  // In FIFO compaction, this option has the same meaning as TTL and whichever
+  // is stricter will be used.
+  // Pre-req: max_open_files == -1.
+  // Unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+  //
+  // Values:
+  // 0: Turn off Periodic compactions.
+  // UINT64_MAX - 1 (i.e., 0xfffffffffffffffe): Let RocksDB control this
+  //   feature as needed. For now, RocksDB will change this value to 30 days
+  //   (i.e., 30 * 24 * 60 * 60) so that every file goes through the
+  //   compaction process at least once every 30 days if not compacted sooner.
+  //   In FIFO compaction, since the option has the same meaning as ttl, when
+  //   this value is left at the default and ttl is left at 0, 30 days will be
+  //   used. Otherwise, min(ttl, periodic_compaction_seconds) will be used.
+  //
+  // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
+
+  // If this option is set, then 1 in N blocks are compressed using both a
+  // fast (lz4) and a slow (zstd) compression algorithm. The compressibility
+  // is reported as stats, and the stored data is left uncompressed (unless
+  // compression is also requested).
+  uint64_t sample_for_compression = 0;
+
+  // EXPERIMENTAL
+  // The feature is still in development and is incomplete.
+  // If this option is set, when creating the last level files, pass this
+  // temperature to the FileSystem used. This should be a no-op for the
+  // default FileSystem; users need to plug in their own FileSystem to take
+  // advantage of it.
+  //
+  // Note: the feature is changed from `bottommost_temperature` to
+  // `last_level_temperature`, which now only applies to the last level files.
+  // The option name `bottommost_temperature` is kept only for migration; the
+  // behavior is the same as `last_level_temperature`. Please stop using
+  // `bottommost_temperature`; it will be removed in the next release.
+  //
+  // Dynamically changeable through the SetOptions() API
+  Temperature bottommost_temperature = Temperature::kUnknown;
+  Temperature last_level_temperature = Temperature::kUnknown;
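A sketch (not part of the diff) of the tiered-storage hint above; it only has an effect with a FileSystem implementation that actually interprets Temperature.

    #include <rocksdb/options.h>

    rocksdb::ColumnFamilyOptions MakeTieredStorageCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.last_level_temperature = rocksdb::Temperature::kCold;
      return cf;
    }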
+  // EXPERIMENTAL
+  // The feature is still in development and is incomplete.
+  // If this option is set, data whose insert time is within this time range
+  // will be precluded from the last level.
+  // 0 means no key will be precluded from the last level.
+  //
+  // Note: when enabled, universal size amplification (controlled by option
+  // `compaction_options_universal.max_size_amplification_percent`) will be
+  // calculated excluding the last level. As the feature is designed for
+  // tiered storage, where the last level is typically a cold tier that is
+  // unlikely to be size-constrained, the size amplification is going to
+  // cover only the non-last levels.
+  //
+  // Default: 0 (disable the feature)
+  //
+  // Not dynamically changeable; changing it requires a DB restart.
+  uint64_t preclude_last_level_data_seconds = 0;
+
+  // EXPERIMENTAL
+  // If this option is set, it will preserve the internal time information
+  // about the data until it's older than the time specified here.
+  // Internally the time information is a map between sequence number and
+  // time, the same as for `preclude_last_level_data_seconds`. But it won't
+  // preclude the data from the last level, and the data in the last level
+  // won't have its sequence number zeroed out.
+  // Internally, RocksDB samples sequence-number-to-time pairs and stores them
+  // in the SST property "rocksdb.seqno.time.map". The information is
+  // currently only used for tiered storage compaction (option
+  // `preclude_last_level_data_seconds`).
+  //
+  // Note: if both `preclude_last_level_data_seconds` and this option are set,
+  // the max time of the two options is preserved, and compaction still
+  // precludes data based on `preclude_last_level_data_seconds`.
+  // The higher the preserve time, the lower the sampling frequency (which
+  // means less accurate time estimation).
+  //
+  // Default: 0 (disable the feature)
+  //
+  // Not dynamically changeable; changing it requires a DB restart.
+  uint64_t preserve_internal_time_seconds = 0;
+
+  // When set, large values (blobs) are written to separate blob files, and
+  // only pointers to them are stored in SST files. This can reduce write
+  // amplification for large-value use cases at the cost of introducing a
+  // level of indirection for reads. See also the options min_blob_size,
+  // blob_file_size, blob_compression_type, enable_blob_garbage_collection,
+  // blob_garbage_collection_age_cutoff,
+  // blob_garbage_collection_force_threshold, and
+  // blob_compaction_readahead_size below.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through the SetOptions() API
+  bool enable_blob_files = false;
+
+  // The size of the smallest value to be stored separately in a blob file.
+  // Values which have an uncompressed size smaller than this threshold are
+  // stored alongside the keys in SST files in the usual fashion. A value of
+  // zero for this option means that all values are stored in blob files. Note
+  // that enable_blob_files has to be set in order for this option to have any
+  // effect.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t min_blob_size = 0;
+
+  // The size limit for blob files. When writing blob files, a new file is
+  // opened once this limit is reached. Note that enable_blob_files has to be
+  // set in order for this option to have any effect.
+  //
+  // Default: 256 MB
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t blob_file_size = 1ULL << 28;
+
+  // The compression algorithm to use for large values stored in blob files.
+  // Note that enable_blob_files has to be set in order for this option to
+  // have any effect.
+  //
+  // Default: no compression
+  //
+  // Dynamically changeable through the SetOptions() API
+  CompressionType blob_compression_type = kNoCompression;
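A sketch (not part of the diff) of the integrated BlobDB settings above; the thresholds are illustrative.

    #include <rocksdb/options.h>

    rocksdb::ColumnFamilyOptions MakeBlobCf() {
      rocksdb::ColumnFamilyOptions cf;
      cf.enable_blob_files = true;
      cf.min_blob_size = 4 * 1024;                // values < 4KB stay inline in SSTs
      cf.blob_file_size = 256ULL << 20;           // roll blob files at 256MB
      cf.blob_compression_type = rocksdb::kZSTD;  // assumes ZSTD support
      return cf;
    }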
+  // Enables garbage collection of blobs. Blob GC is performed as part of
+  // compaction. Valid blobs residing in blob files older than a cutoff get
+  // relocated to new files as they are encountered during compaction, which
+  // makes it possible to clean up blob files once they contain nothing but
+  // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and
+  // blob_garbage_collection_force_threshold below.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through the SetOptions() API
+  bool enable_blob_garbage_collection = false;
+
+  // The cutoff in terms of blob file age for garbage collection. Blobs in
+  // the oldest N blob files will be relocated when encountered during
+  // compaction, where N = garbage_collection_cutoff * number_of_blob_files.
+  // Note that enable_blob_garbage_collection has to be set in order for this
+  // option to have any effect.
+  //
+  // Default: 0.25
+  //
+  // Dynamically changeable through the SetOptions() API
+  double blob_garbage_collection_age_cutoff = 0.25;
+
+  // If the ratio of garbage in the oldest blob files exceeds this threshold,
+  // targeted compactions are scheduled in order to force garbage collecting
+  // the blob files in question, assuming they are all eligible based on the
+  // value of blob_garbage_collection_age_cutoff above. This option is
+  // currently only supported with leveled compactions.
+  // Note that enable_blob_garbage_collection has to be set in order for this
+  // option to have any effect.
+  //
+  // Default: 1.0
+  //
+  // Dynamically changeable through the SetOptions() API
+  double blob_garbage_collection_force_threshold = 1.0;
+
+  // Compaction readahead for blob files.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t blob_compaction_readahead_size = 0;
+
+  // Enable blob files starting from a certain LSM tree level.
+  //
+  // For certain use cases that have a mix of short-lived and long-lived
+  // values, it might make sense to support extracting large values only
+  // during compactions whose output level is greater than or equal to a
+  // specified LSM tree level (e.g. compactions into L1/L2/... or above).
+  // This could reduce the space amplification caused by large values that
+  // are turned into garbage shortly after being written, at the price of
+  // some write amplification incurred by long-lived values whose extraction
+  // to blob files is delayed.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  int blob_file_starting_level = 0;
+
+  // The Cache object to use for blobs. Using a dedicated object for blobs
+  // and using the same object for the block and blob caches are both
+  // supported. In the latter case, note that blobs are less valuable from a
+  // caching perspective than SST blocks, and some cache implementations have
+  // configuration options that can be used to prioritize items accordingly
+  // (see Cache::Priority and LRUCacheOptions::{high,low}_pri_pool_ratio).
+  //
+  // Default: nullptr (disabled)
+  std::shared_ptr<Cache> blob_cache = nullptr;
+
+  // Enable/disable prepopulating the blob cache. When set to kFlushOnly,
+  // BlobDB will insert newly written blobs into the blob cache during flush.
+  // This can improve performance when reading back these blobs would
+  // otherwise be expensive (e.g. when using direct I/O or remote storage), or
+  // when the workload has a high temporal locality.
+  //
+  // Default: disabled
+  //
+  // Dynamically changeable through the SetOptions() API
+  PrepopulateBlobCache prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
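Continuing the sketch above (not part of the diff), blob garbage collection might be enabled like this; the cutoffs are illustrative.

    rocksdb::ColumnFamilyOptions MakeBlobGcCf() {
      rocksdb::ColumnFamilyOptions cf = MakeBlobCf();    // sketch above
      cf.enable_blob_garbage_collection = true;
      cf.blob_garbage_collection_age_cutoff = 0.25;      // GC oldest 25% of files
      cf.blob_garbage_collection_force_threshold = 0.8;  // force GC at 80% garbage
      return cf;
    }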
+  // Enable memtable per key-value checksum protection.
+  //
+  // Each entry in the memtable will be suffixed by a per key-value checksum.
+  // This option determines the size of such checksums.
+  //
+  // It is suggested to turn on write batch per key-value
+  // checksum protection together with this option, so that the checksum
+  // computation is done outside of the writer threads (the memtable kv
+  // checksum can be computed from the write batch checksum). See
+  // WriteOptions::protection_bytes_per_key for more detail.
+  //
+  // Default: 0 (no protection)
+  // Supported values: 0, 1, 2, 4, 8.
+  uint32_t memtable_protection_bytes_per_key = 0;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  AdvancedColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit AdvancedColumnFamilyOptions(const Options& options);
+
+  // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
+};
+
+}  // namespace ROCKSDB_NAMESPACE
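A sketch (not part of the diff) pairing the memtable protection above with write batch protection, as the comment suggests; the 8-byte checksums are illustrative.

    #include <rocksdb/options.h>

    void EnableKvChecksums(rocksdb::Options& options,
                           rocksdb::WriteOptions& write_options) {
      options.memtable_protection_bytes_per_key = 8;  // per-entry memtable checksum
      write_options.protection_bytes_per_key = 8;     // per-entry write batch checksum
    }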
diff --git a/src/rocksdb/include/rocksdb/block_cache_trace_writer.h b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
new file mode 100644
index 000000000..18d28685b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2022, Meta Platforms, Inc. and affiliates. All rights
+// reserved. This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_reader_caller.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A record for block cache lookups/inserts. This is passed by the table
+// reader to the BlockCacheTraceWriter for every block cache op.
+struct BlockCacheTraceRecord {
+  // Required fields for all accesses.
+  uint64_t access_timestamp = 0;
+
+  // Info related to the block being looked up or inserted
+  //
+  // 1. The cache key for the block
+  std::string block_key;
+
+  // 2. The type of block
+  TraceType block_type = TraceType::kTraceMax;
+
+  // 3. Size of the block
+  uint64_t block_size = 0;
+
+  // Info about the SST file the block is in
+  //
+  // 1. Column family ID
+  uint64_t cf_id = 0;
+
+  // 2. Column family name
+  std::string cf_name;
+
+  // 3. LSM level of the file
+  uint32_t level = 0;
+
+  // 4. SST file number
+  uint64_t sst_fd_number = 0;
+
+  // Info about the calling context
+  //
+  // 1. The higher level request triggering the block cache request
+  TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
+
+  // 2. Cache lookup hit/miss. Not relevant for inserts
+  bool is_cache_hit = false;
+
+  // 3. Whether this request is a lookup
+  bool no_insert = false;
+
+  // Get/MultiGet specific info
+  //
+  // 1. A unique ID for Get/MultiGet
+  uint64_t get_id = kReservedGetId;
+
+  // 2. Whether the Get/MultiGet is from a user-specified snapshot
+  bool get_from_user_specified_snapshot = false;
+
+  // 3. The target user key in the block
+  std::string referenced_key;
+
+  // Required fields for data block and user Get/MultiGet only.
+  //
+  // 1. Size of the useful data in the block
+  uint64_t referenced_data_size = 0;
+
+  // 2. Only for MultiGet, number of keys from the batch found in the block
+  uint64_t num_keys_in_block = 0;
+
+  // 3. Whether the key was found in the block or not (false positive)
+  bool referenced_key_exist_in_block = false;
+
+  static const uint64_t kReservedGetId;
+
+  BlockCacheTraceRecord() {}
+
+  BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
+                        TraceType _block_type, uint64_t _block_size,
+                        uint64_t _cf_id, std::string _cf_name, uint32_t _level,
+                        uint64_t _sst_fd_number, TableReaderCaller _caller,
+                        bool _is_cache_hit, bool _no_insert, uint64_t _get_id,
+                        bool _get_from_user_specified_snapshot = false,
+                        std::string _referenced_key = "",
+                        uint64_t _referenced_data_size = 0,
+                        uint64_t _num_keys_in_block = 0,
+                        bool _referenced_key_exist_in_block = false)
+      : access_timestamp(_access_timestamp),
+        block_key(_block_key),
+        block_type(_block_type),
+        block_size(_block_size),
+        cf_id(_cf_id),
+        cf_name(_cf_name),
+        level(_level),
+        sst_fd_number(_sst_fd_number),
+        caller(_caller),
+        is_cache_hit(_is_cache_hit),
+        no_insert(_no_insert),
+        get_id(_get_id),
+        get_from_user_specified_snapshot(_get_from_user_specified_snapshot),
+        referenced_key(_referenced_key),
+        referenced_data_size(_referenced_data_size),
+        num_keys_in_block(_num_keys_in_block),
+        referenced_key_exist_in_block(_referenced_key_exist_in_block) {}
+};
+
+// Options for tracing block cache accesses
+struct BlockCacheTraceOptions {
+  // Specify the trace sampling option, i.e. capture one out of every N
+  // requests. Defaults to 1 (capture every request).
+  uint64_t sampling_frequency = 1;
+};
+
+// Options for the built-in implementation of BlockCacheTraceWriter
+struct BlockCacheTraceWriterOptions {
+  uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+};
+
+// BlockCacheTraceWriter is an abstract class that captures all RocksDB block
+// cache accesses. Every RocksDB operation is passed to WriteBlockAccess()
+// with a BlockCacheTraceRecord.
+class BlockCacheTraceWriter {
+ public:
+  virtual ~BlockCacheTraceWriter() {}
+
+  // Pass Slice references to avoid copy.
+  virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                                  const Slice& block_key, const Slice& cf_name,
+                                  const Slice& referenced_key) = 0;
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  virtual Status WriteHeader() = 0;
+};
+
+// Allocate an instance of the built-in BlockCacheTraceWriter implementation
+// that traces all block cache accesses to a user-provided TraceWriter. Each
+// access is traced to a file with a timestamp and type, followed by the
+// payload.
+std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
+    SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer);
+
+}  // namespace ROCKSDB_NAMESPACE
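A sketch (not part of the diff) of wiring this writer into a running DB. It assumes the DB::StartBlockCacheTrace overload that accepts a BlockCacheTraceWriter (shipped in the same RocksDB release as this header) and the NewFileTraceWriter helper from trace_reader_writer.h; the trace path is a placeholder.

    #include <rocksdb/block_cache_trace_writer.h>
    #include <rocksdb/db.h>
    #include <rocksdb/env.h>
    #include <rocksdb/trace_reader_writer.h>

    rocksdb::Status StartTracing(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::TraceWriter> file_writer;
      rocksdb::Status s = rocksdb::NewFileTraceWriter(
          rocksdb::Env::Default(), rocksdb::EnvOptions(),
          "/tmp/block_cache_trace", &file_writer);
      if (!s.ok()) return s;

      rocksdb::BlockCacheTraceWriterOptions writer_opts;
      std::unique_ptr<rocksdb::BlockCacheTraceWriter> writer =
          rocksdb::NewBlockCacheTraceWriter(
              db->GetEnv()->GetSystemClock().get(), writer_opts,
              std::move(file_writer));

      rocksdb::BlockCacheTraceOptions trace_opts;  // sampling_frequency = 1
      return db->StartBlockCacheTrace(trace_opts, std::move(writer));
    }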
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
new file mode 100644
index 000000000..1639f3cd3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -0,0 +1,2793 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for rocksdb. May be useful as a stable ABI that can be
+  used by programs that keep rocksdb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type. Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string. NULL
+  means no error. All operations that can raise an error are passed
+  a "char** errptr" as the last argument. One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a rocksdb routine leaves *errptr unchanged.
+  On failure, rocksdb frees the old value of *errptr and
+  sets *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
+typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
+typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+    rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+    rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t
+    rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t;
+typedef struct rocksdb_block_based_table_options_t
+    rocksdb_block_based_table_options_t;
+typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
rocksdb_slicetransform_t rocksdb_slicetransform_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t + rocksdb_universal_compaction_options_t; +typedef struct rocksdb_livefiles_t rocksdb_livefiles_t; +typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t; +typedef struct rocksdb_column_family_metadata_t + rocksdb_column_family_metadata_t; +typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t; +typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t; +typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; +typedef struct rocksdb_ingestexternalfileoptions_t + rocksdb_ingestexternalfileoptions_t; +typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t; +typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t; +typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t; +typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; +typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; +typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; +typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t; +typedef struct rocksdb_optimistictransactiondb_t + rocksdb_optimistictransactiondb_t; +typedef struct rocksdb_optimistictransaction_options_t + rocksdb_optimistictransaction_options_t; +typedef struct rocksdb_transaction_t rocksdb_transaction_t; +typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; +typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; +typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; + +/* DB operations */ + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, const char* name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( + const rocksdb_options_t* options, const char* name, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const rocksdb_backup_engine_options_t* options, + rocksdb_env_t* env, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, + unsigned char flush_before_backup, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( + rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* +rocksdb_restore_options_create(void); +extern 
ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( + rocksdb_restore_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup( + rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* +rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( + rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf_with_ts( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, 
const char* ts, size_t tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_increase_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* ts_low, size_t ts_lowlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_full_history_ts_low( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + size_t* ts_lowlen, char** errptr); + +/* BackupEngineOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_options_t* +rocksdb_backup_engine_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_backup_dir( + rocksdb_backup_engine_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_env( + rocksdb_backup_engine_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_share_table_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_share_table_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_sync( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backup_engine_options_get_sync( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_destroy_old_data( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_destroy_old_data( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_log_files( + rocksdb_backup_engine_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backup_engine_options_get_backup_log_files( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_backup_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_backup_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_restore_rate_limit( + rocksdb_backup_engine_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_restore_rate_limit( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_background_operations( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_background_operations( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_options_get_callback_trigger_interval_size( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_options_set_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_max_valid_backups_to_open( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void 
+rocksdb_backup_engine_options_set_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backup_engine_options_get_share_files_with_checksum_naming( + rocksdb_backup_engine_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_destroy( + rocksdb_backup_engine_options_t*); + +/* Checkpoint */ + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create( + rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir, + uint64_t log_size_for_flush, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy( + rocksdb_checkpoint_t* checkpoint); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_and_trim_history( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char* trim_ts, + size_t trim_tslen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, const int* ttls, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_open_for_read_only_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, + unsigned char error_if_wal_file_exists, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( + const rocksdb_options_t* options, const char* name, size_t* lencf, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy( + char** list, size_t len); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family(rocksdb_t* db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy( + rocksdb_column_family_handle_t*); + +extern ROCKSDB_LIBRARY_API uint32_t 
+rocksdb_column_family_handle_get_id(rocksdb_column_family_handle_t* handle);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_handle_get_name(
+    rocksdb_column_family_handle_t* handle, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* start_key,
+    size_t start_key_len, const char* end_key, size_t end_key_len,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch, char** errptr);
+
+/* Returns NULL if not found. A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+// If values_list[i] == NULL and errs[i] == NULL, the lookup returned
+// status.IsNotFound(), which is not reported as an error.
+// All errors other than status.ok() and status.IsNotFound() are returned
+// in errs.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings, as opposed to the conventional single error,
+// where errs[i] is the status for the retrieval of keys_list[i].
+// Each non-NULL errs entry is a malloc()ed, null-terminated string.
+// Each non-NULL values_list entry is a malloc()ed array, with
+// the length for each stored in values_list_sizes[i].
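+//
+// For illustration, a minimal sketch of this convention (here `db` and
+// `ropts` are assumed to be a previously opened rocksdb_t* and a
+// previously created rocksdb_readoptions_t*):
+//
+//   const char* keys[2] = {"alpha", "beta"};
+//   const size_t key_sizes[2] = {5, 4};
+//   char* values[2];
+//   size_t value_sizes[2];
+//   char* errs[2];
+//   rocksdb_multi_get(db, ropts, 2, keys, key_sizes, values, value_sizes,
+//                     errs);
+//   for (size_t i = 0; i < 2; i++) {
+//     if (errs[i] != NULL) {
+//       free(errs[i]);       /* per-key error */
+//     } else if (values[i] != NULL) {
+//       free(values[i]);     /* found; values[i] is malloc()ed */
+//     }                      /* else: key not found */
+//   }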
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** timestamp_list,
+    size_t* timestamp_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs);
+
+// A MultiGet API that improves performance by batching operations
+// in the read path for greater efficiency. Currently, only the block based
+// table format with full filters is supported. Other table formats such
+// as plain table, block based table with block based filters and
+// partitioned indexes will still work, but will not get any performance
+// benefits.
+//
+// Note that all the keys passed to this API are restricted to a single
+// column family.
+//
+// Parameters -
+// db - the RocksDB instance.
+// options - ReadOptions
+// column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+//                 passed to the API are restricted to a single column family
+// num_keys - Number of keys to lookup
+// keys_list - Pointer to C style array of keys with num_keys elements
+// keys_list_sizes - Pointer to C style array of the size of corresponding key
+//                   in key_list with num_keys elements.
+// values - Pointer to C style array of rocksdb_pinnableslice_t* with num_keys
+//          elements
+// errs - Pointer to C style array of error strings (char*) with num_keys
+//        elements; a NULL entry indicates success for that key
+// sorted_input - If true, it means the input keys are already sorted by key
+//                order, so the MultiGet() API doesn't have to sort them
+//                again. If false, the keys will be copied and sorted
+//                internally by the API - the input array will not be
+//                modified
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t key_len, char** value, size_t* val_len, const char* timestamp,
+    size_t timestamp_len, unsigned char* value_found);
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
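+//
+// A minimal usage sketch (assumes `db`, `ropts` and `cf` are previously
+// created handles; the timestamp is passed as NULL/0 when user-defined
+// timestamps are not in use):
+//
+//   char* value = NULL;
+//   size_t val_len = 0;
+//   unsigned char value_found = 0;
+//   if (rocksdb_key_may_exist_cf(db, ropts, cf, "alpha", 5, &value, &val_len,
+//                                NULL, 0, &value_found)) {
+//     /* the key may exist (bloom filters allow false positives) */
+//     if (value_found && value != NULL) {
+//       free(value);  /* caller owns the malloc()ed value */
+//     }
+//   }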
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators( + rocksdb_t* db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( + rocksdb_t* db, const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, + const char* propname); +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int(rocksdb_t* db, const char* propname, + uint64_t* out_val); + +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val); + +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, + const char* start_key, + size_t start_key_len, + const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt( + rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key, + size_t start_key_len, const char* limit_key, size_t limit_key_len); + +extern 
ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db, + const char* name); + +extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush( + rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, + unsigned char sync, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions( + rocksdb_t* db, unsigned char force, char** errptr); + +/* Management operations */ + +extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +/* Iterator */ + +extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid( + const rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*, + const char* k, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*, + const char* k, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key( + const rocksdb_iterator_t*, size_t* klen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value( + const rocksdb_iterator_t*, size_t* vlen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp( + const rocksdb_iterator_t*, size_t* tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( + const rocksdb_iterator_t*, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next( + rocksdb_wal_iterator_t* iter); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid( + const rocksdb_wal_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status( + const rocksdb_wal_iterator_t* iter, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch( + const rocksdb_wal_iterator_t* iter, uint64_t* seq); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_get_latest_sequence_number(rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy( + const rocksdb_wal_iterator_t* iter); + +/* Write batch */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( + const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API int 
rocksdb_writebatch_count(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, + const char* key, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete( + rocksdb_writebatch_t* b, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf_with_ts( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf_with_ts( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range( + rocksdb_writebatch_t* b, const 
char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev( + rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data( + rocksdb_writebatch_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data( + rocksdb_writebatch_t*, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point( + rocksdb_writebatch_t*, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point( + rocksdb_writebatch_t*, char** errptr); + +/* Write batch with index */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create(size_t reserved_bytes, + unsigned char overwrite_keys); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_writebatch_wi_create_from(const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count( + rocksdb_writebatch_wi_t* b); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge( + rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev( + 
rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range( + rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev( + rocksdb_writebatch_wi_t* b, int num_keys, + const char* const* start_keys_list, const size_t* start_keys_list_sizes, + const char* const* end_keys_list, const size_t* end_keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data( + rocksdb_writebatch_wi_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate( + rocksdb_writebatch_wi_t* b, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data( + rocksdb_writebatch_wi_t* b, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point( + rocksdb_writebatch_wi_t*, char** errptr); +extern ROCKSDB_LIBRARY_API char* 
rocksdb_writebatch_wi_get_from_batch(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    const char* key, size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_wi_t* wbwi, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* cf);
+
+/* Options utils */
+
+// Load the latest rocksdb options from the specified db_path.
+//
+// On success, num_column_families will be updated with a non-zero
+// number indicating the number of column families.
+// The returned db_options, column_family_names, and column_family_options
+// should be released via rocksdb_load_latest_options_destroy().
+//
+// On error, errptr will be set to a non-null, malloc()ed error message, and
+// db_options, column_family_names, and column_family_options
+// will be set to NULL.
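+//
+// A minimal sketch of the expected call pattern (assumes `env` is a
+// previously created rocksdb_env_t*; the path is illustrative):
+//
+//   rocksdb_options_t* db_opts = NULL;
+//   size_t num_cfs = 0;
+//   char** cf_names = NULL;
+//   rocksdb_options_t** cf_opts = NULL;
+//   char* err = NULL;
+//   rocksdb_load_latest_options("/path/to/db", env, false, NULL, &db_opts,
+//                               &num_cfs, &cf_names, &cf_opts, &err);
+//   if (err == NULL) {
+//     /* inspect or reuse the loaded options ... */
+//     rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts,
+//                                         num_cfs);
+//   } else {
+//     free(err);
+//   }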
+extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options( + const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options, + rocksdb_cache_t* cache, rocksdb_options_t** db_options, + size_t* num_column_families, char*** column_family_names, + rocksdb_options_t*** column_family_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy( + rocksdb_options_t* db_options, char** list_column_family_names, + rocksdb_options_t** list_column_family_options, size_t len); + +/* Block based table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* +rocksdb_block_based_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_checksum( + rocksdb_block_based_table_options_t*, char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, + int index_block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, + unsigned char partition_filters); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, + unsigned char use_delta_encoding); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, unsigned char no_block_cache); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version( + rocksdb_block_based_table_options_t*, int); +enum { + rocksdb_block_based_table_index_type_binary_search = 0, + rocksdb_block_based_table_index_type_hash_search = 1, + rocksdb_block_based_table_index_type_two_level_index_search = 2, +}; +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +enum { + rocksdb_block_based_table_data_block_index_type_binary_search = 0, + rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1, +}; +extern ROCKSDB_LIBRARY_API void 
+rocksdb_block_based_options_set_data_block_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_data_block_hash_ratio( + rocksdb_block_based_table_options_t* options, double v); +// rocksdb_block_based_options_set_hash_index_allow_collision() +// is removed since BlockBasedTableOptions.hash_index_allow_collision() +// is removed +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); + +/* Cuckoo table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void +rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options); + +/* Options */ +extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count, + const char* const keys[], + const char* const values[], + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, + const char* const keys[], const char* const values[], char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( + rocksdb_options_t* opt, int total_threads); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( + rocksdb_options_t* opt, uint64_t block_cache_size_mb); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_optimize_universal_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind( + 
rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( + rocksdb_options_t*, rocksdb_compactionfilter_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( + rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( + rocksdb_options_t*, rocksdb_comparator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( + rocksdb_options_t*, rocksdb_mergeoperator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, const int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_create_missing_column_families(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths( + rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, + rocksdb_env_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, + rocksdb_logger_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_open_files( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( + rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, + int, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t*, unsigned char, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( + rocksdb_options_t*, rocksdb_slicetransform_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_target_file_size_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, + 
unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt); + +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache( + rocksdb_options_t* opt, rocksdb_cache_t* blob_cache); + +enum { + rocksdb_prepopulate_blob_disable = 0, + rocksdb_prepopulate_blob_flush_only = 1 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prepopulate_blob_cache( + rocksdb_options_t* opt, int val); + +extern ROCKSDB_LIBRARY_API int 
rocksdb_options_get_prepopulate_blob_cache( + rocksdb_options_t* opt); + +/* returns a pointer to a malloc()-ed, null terminated string */ +extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_max_subcompactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_log_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_keep_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, + size_t v); 
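+// Illustrative usage sketch (not part of the header): the soft limit begins
+// throttling foreground writes once the estimated pending compaction bytes
+// exceed it, and the hard limit stops writes entirely. The thresholds below
+// are example values only.
+//
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_soft_pending_compaction_bytes_limit(
+//       opts, (size_t)64 * 1024 * 1024 * 1024);   // slow writes at 64 GiB
+//   rocksdb_options_set_hard_pending_compaction_bytes_limit(
+//       opts, (size_t)256 * 1024 * 1024 * 1024);  // stop writes at 256 GiB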
+extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_arena_block_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( + rocksdb_options_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, + const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void 
+rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*, + double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t*, size_t, int32_t, int32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory( + rocksdb_options_t*, uint32_t, int, double, size_t); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress( + rocksdb_options_t* opt, int level); + +extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_memtable_huge_page_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_successive_merges(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_bloom_locality(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_inplace_update_support(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_avoid_unnecessary_blocking_io(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_experimental_mempurge_threshold(rocksdb_options_t*); + +enum { + rocksdb_tolerate_corrupted_tail_records_recovery = 0, + rocksdb_absolute_consistency_recovery = 1, + rocksdb_point_in_time_recovery = 2, + rocksdb_skip_any_corrupted_records_recovery = 3 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode( + rocksdb_options_t*); + +enum { + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3, + rocksdb_lz4_compression = 4, + rocksdb_lz4hc_compression = 5, + rocksdb_xpress_compression = 6, + rocksdb_zstd_compression = 7 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression( + rocksdb_options_t*); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1, + rocksdb_fifo_compaction = 2 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_universal_compaction_options( + rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter( + rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( + rocksdb_options_t* opt, 
unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( + rocksdb_options_t* opt, rocksdb_cache_t* cache); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_compression( + rocksdb_options_t* opt, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression( + rocksdb_options_t* opt); + +/* RateLimiter */ +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( + rocksdb_ratelimiter_t*); + +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + 
rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_number_async_seek, + rocksdb_blob_cache_hit_count, + rocksdb_blob_read_count, + rocksdb_blob_read_byte, + rocksdb_blob_read_time, + rocksdb_blob_checksum_time, + rocksdb_blob_decompress_time, + rocksdb_internal_range_del_reseek_count, + rocksdb_total_metric_count = 78 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy( + rocksdb_perfcontext_t* context); + +/* Compaction Filter */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* +rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy( + rocksdb_compactionfilter_t*); + +/* Compaction Filter Context */ + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context); + +/* Compaction Filter Factory */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t* +rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t*); + +/* Comparator */ + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy( + rocksdb_comparator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* +rocksdb_comparator_with_ts_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts, + size_t b_tslen), + int (*compare_without_ts)(void*, const char* a, size_t alen, + unsigned char a_has_ts, const char* b, + size_t blen, unsigned char b_has_ts), + const char* (*name)(void*), size_t timestamp_size); + +/* Filter policy */ + +extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy( + rocksdb_filterpolicy_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom(double bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(double bits_per_key); 
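+// Illustrative usage sketch (not part of the header): attaching a bloom
+// filter policy to a block-based table factory through the C API. The
+// block-based-table functions are declared elsewhere in this header; 10.0
+// bits per key is an example value, and the table options are assumed to
+// take ownership of the policy once it is set.
+//
+//   rocksdb_filterpolicy_t* fp = rocksdb_filterpolicy_create_bloom(10.0);
+//   rocksdb_block_based_table_options_t* tbl =
+//       rocksdb_block_based_options_create();
+//   rocksdb_block_based_options_set_filter_policy(tbl, fp);
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_block_based_table_factory(opts, tbl);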
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key, + int bloom_before_level); + +/* Merge Operator */ + +extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t* +rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( + rocksdb_mergeoperator_t*); + +/* Read options */ + +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( + rocksdb_readoptions_t*); +// The functionality that this option controlled has been removed. 
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp( + rocksdb_readoptions_t*, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts( + rocksdb_readoptions_t*, const char* ts, size_t tslen); + +/* Write options */ + +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( + rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t*); +extern 
ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t*); + +/* Compact range options */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* +rocksdb_compactoptions_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( + rocksdb_compactoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_full_history_ts_low( + rocksdb_compactoptions_t*, char* ts, size_t tslen); + +/* Flush options */ + +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( + rocksdb_flushoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( + rocksdb_flushoptions_t*); + +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); + +/* Cache */ + +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits( + rocksdb_lru_cache_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( + size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* +rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void 
rocksdb_cache_set_capacity( + rocksdb_cache_t* cache, size_t capacity); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_capacity(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_usage(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); + +/* DBPath */ + +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create( + const char* path, uint64_t target_size); +extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); + +/* Env */ + +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); +extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( + rocksdb_envoptions_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_create_dir_if_missing( + rocksdb_env_t* env, const char* path, char** errptr); + +/* SstFile */ + +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env, + const rocksdb_options_t* io_options); +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* comparator); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open( + rocksdb_sstfilewriter_t* writer, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, const char* val, size_t vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API void 
rocksdb_sstfilewriter_merge( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_with_ts( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* ts, size_t tslen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_range( + rocksdb_sstfilewriter_t* writer, const char* begin_key, size_t begin_keylen, + const char* end_key, size_t end_keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish( + rocksdb_sstfilewriter_t* writer, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size( + rocksdb_sstfilewriter_t* writer, uint64_t* file_size); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( + rocksdb_sstfilewriter_t* writer); +extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind); +extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + +/* SliceTransform */ + +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create( + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* + rocksdb_slicetransform_create_fixed_prefix(size_t); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create_noop(void); +extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( + rocksdb_slicetransform_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern ROCKSDB_LIBRARY_API 
rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* +rocksdb_fifo_compaction_options_create(void); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts); + +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( + const rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t*); + +/* Utility Helpers */ + +extern 
ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
+    const rocksdb_options_t* base_options, const char* opts_str,
+    rocksdb_options_t* new_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range(
+    rocksdb_t* db, const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr);
+
+/* MetaData */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata(rocksdb_t* db);
+
+/**
+ * Returns the rocksdb_column_family_metadata_t of the specified
+ * column family.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_column_family_metadata_destroy.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_metadata_destroy(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_column_family_metadata_get_size(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t rocksdb_column_family_metadata_get_file_count(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_metadata_get_name(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_column_family_metadata_get_level_count(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+/**
+ * Returns the rocksdb_level_metadata_t of the ith level from the specified
+ * column family metadata.
+ *
+ * If the specified i is greater than or equal to the number of levels
+ * in the specified column family, then NULL will be returned.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_level_metadata_destroy before releasing its parent
+ * rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_level_metadata_t*
+rocksdb_column_family_metadata_get_level_metadata(
+    rocksdb_column_family_metadata_t* cf_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_level_metadata_t.
+ *
+ * Note that the specified rocksdb_level_metadata_t must be released
+ * before the release of its parent rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_level_metadata_destroy(
+    rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_level_metadata_get_level(
+    rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_level_metadata_get_file_count(rocksdb_level_metadata_t* level_meta);
+
+/**
+ * Returns the rocksdb_sst_file_metadata_t of the ith file from the specified
+ * level metadata.
+ *
+ * If the specified i is greater than or equal to the number of files
+ * in the specified level, then NULL will be returned.
+ *
+ * Note that the caller is responsible for releasing the returned memory
+ * using rocksdb_sst_file_metadata_destroy before releasing its
+ * parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_sst_file_metadata_t*
+rocksdb_level_metadata_get_sst_file_metadata(
+    rocksdb_level_metadata_t* level_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_sst_file_metadata_t.
+ *
+ * Note that the specified rocksdb_sst_file_metadata_t must be released
+ * before the release of its parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_metadata_destroy(
+    rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API char*
+rocksdb_sst_file_metadata_get_relative_filename(
+    rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta);
+
+/**
+ * Returns the smallest key of the specified sst file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of an SST file to obtain its smallest key.
+ * @param len the out parameter that will contain the length of the returned
+ *   key after the function call.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_smallestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* len);
+
+/**
+ * Returns the largest key of the specified sst file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of an SST file to obtain its largest key.
+ * @param len the out parameter that will contain the length of the returned
+ *   key after the function call.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_largestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* len);
+
+/* Transactions */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_transactiondb_create_column_family(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t*
+rocksdb_transactiondb_open_column_families(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value(
+    rocksdb_transactiondb_t* db, const char* propname);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int(
+    rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_transaction_options_t* txn_options,
+    rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t**
+rocksdb_transactiondb_get_prepared_transactions(rocksdb_transactiondb_t* txn_db,
+                                                size_t* cnt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_name( +
rocksdb_transaction_t* txn, const char* name, size_t name_len, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_name( + rocksdb_transaction_t* txn, size_t* name_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_prepare( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* +rocksdb_transaction_get_writebatch_wi(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch( + rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch, + char** errptr); + +// This rocksdb_writebatch_wi_t should be freed with rocksdb_free +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch_wi( + rocksdb_transaction_t* txn, rocksdb_writebatch_wi_t* wi, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_commit_timestamp( + rocksdb_transaction_t* txn, uint64_t commit_timestamp); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_set_read_timestamp_for_validation( + rocksdb_transaction_t* txn, uint64_t read_timestamp); + +// This snapshot should be freed using rocksdb_free +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + unsigned char exclusive, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transaction_get_pinned_for_update_cf( 
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_transactiondb_get_pinned_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( + rocksdb_transaction_t* txn, const char* key, size_t 
klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( + rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( + rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal( + rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, + const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** 
errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db( + rocksdb_t* base_db); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* +rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr); + +/* Transaction Options */ + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* +rocksdb_transactiondb_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* +rocksdb_transaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( + rocksdb_transaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_skip_prepare( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API char* 
rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val); + +// referring to convention (3), this should be used by client +// to free memory that was malloc()ed +extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( + rocksdb_pinnableslice_t* v); +extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( + const rocksdb_pinnableslice_t* t, size_t* vlen); + +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* +rocksdb_memory_consumers_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats( + rocksdb_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*, + unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work( + rocksdb_t* db, unsigned char wait); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h new file mode 100644 index 000000000..575d276b5 --- /dev/null +++ b/src/rocksdb/include/rocksdb/cache.h @@ -0,0 +1,775 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#pragma once + +#include <cstdint> +#include <functional> +#include <memory> +#include <string> + +#include "rocksdb/compression_type.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ConfigOptions; +class Logger; +class SecondaryCache; + +// Classifications of block cache entries. +// +// Developer notes: Adding a new enum to this class requires corresponding +// updates to `kCacheEntryRoleToCamelString` and +// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since +// `kNumCacheEntryRoles` assumes `kMisc` comes last. +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // OBSOLETE / DEPRECATED: old/removed block-based filter + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager's charge to account for its memtable usage + kWriteBuffer, + // Compression dictionary building buffer's charge to account for + // its memory usage + kCompressionDictionaryBuildingBuffer, + // Filter's charge to account for + // (new) bloom and ribbon filter construction's memory usage + kFilterConstruction, + // BlockBasedTableReader's charge to account for its memory usage + kBlockBasedTableReader, + // FileMetadata's charge to account for its memory usage + kFileMetadata, + // Blob value (when using the same cache as block cache and blob cache) + kBlobValue, + // Blob cache's charge to account for its memory usage (when using a + // separate block cache and blob cache) + kBlobCache, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast<uint32_t>(CacheEntryRole::kMisc) + 1; + +// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`. +const std::string& GetCacheEntryRoleName(CacheEntryRole); + +// For use with `GetMapProperty()` for property +// `DB::Properties::kBlockCacheEntryStats`. On success, the map will +// be populated with all keys that can be obtained from these functions. 
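+//
+// For illustration only (not part of the original header), a hedged sketch of
+// how the map might be consumed, assuming a previously opened `DB* db`:
+//
+//   std::map<std::string, std::string> stats;
+//   if (db->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &stats)) {
+//     const std::string& cache_id =
+//         stats[BlockCacheEntryStatsMapKeys::CacheId()];
+//     const std::string& data_block_bytes =
+//         stats[BlockCacheEntryStatsMapKeys::UsedBytes(
+//             CacheEntryRole::kDataBlock)];
+//     // ... log or export the values ...
+//   }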
+struct BlockCacheEntryStatsMapKeys {
+  static const std::string& CacheId();
+  static const std::string& CacheCapacityBytes();
+  static const std::string& LastCollectionDurationSeconds();
+  static const std::string& LastCollectionAgeSeconds();
+
+  static std::string EntryCount(CacheEntryRole);
+  static std::string UsedBytes(CacheEntryRole);
+  static std::string UsedPercent(CacheEntryRole);
+};
+
+extern const bool kDefaultToAdaptiveMutex;
+
+enum CacheMetadataChargePolicy {
+  // Only the `charge` of each entry inserted into a Cache counts against
+  // the `capacity`
+  kDontChargeCacheMetadata,
+  // In addition to the `charge`, the approximate space overheads in the
+  // Cache (in bytes) also count against `capacity`. These space overheads
+  // are for supporting fast Lookup and managing the lifetime of entries.
+  kFullChargeCacheMetadata
+};
+const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy =
+    kFullChargeCacheMetadata;
+
+// Options shared between various cache implementations that
+// divide the key space into shards using hashing.
+struct ShardedCacheOptions {
+  // Capacity of the cache, in the same units as the `charge` of each entry.
+  // This is typically measured in bytes, but can be a different unit if using
+  // kDontChargeCacheMetadata.
+  size_t capacity = 0;
+
+  // Cache is sharded into 2^num_shard_bits shards, by hash of key.
+  // If < 0, a good default is chosen based on the capacity and the
+  // implementation. (Mutex-based implementations are much more reliant
+  // on many shards for parallel scalability.)
+  int num_shard_bits = -1;
+
+  // If strict_capacity_limit is set, Insert() will fail if there is not
+  // enough capacity for the new entry along with all the existing referenced
+  // (pinned) cache entries. (Unreferenced cache entries are evicted as
+  // needed, sometimes immediately.) If strict_capacity_limit == false
+  // (default), Insert() never fails.
+  bool strict_capacity_limit = false;
+
+  // If non-nullptr, RocksDB will use this allocator instead of system
+  // allocator when allocating memory for cache blocks.
+  //
+  // Caveat: when the cache is used as block cache, the memory allocator is
+  // ignored when dealing with compression libraries that allocate memory
+  // internally (currently only XPRESS).
+  std::shared_ptr<MemoryAllocator> memory_allocator;
+
+  // See CacheMetadataChargePolicy
+  CacheMetadataChargePolicy metadata_charge_policy =
+      kDefaultCacheMetadataChargePolicy;
+
+  ShardedCacheOptions() {}
+  ShardedCacheOptions(
+      size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy)
+      : capacity(_capacity),
+        num_shard_bits(_num_shard_bits),
+        strict_capacity_limit(_strict_capacity_limit),
+        memory_allocator(std::move(_memory_allocator)),
+        metadata_charge_policy(_metadata_charge_policy) {}
+};
+
+struct LRUCacheOptions : public ShardedCacheOptions {
+  // Ratio of cache reserved for high-priority and low-priority entries,
+  // respectively. (See Cache::Priority below for more information on the
+  // levels.) Valid values are between 0 and 1 (inclusive), and the sum of
+  // the two values cannot exceed 1.
+  //
+  // If high_pri_pool_ratio is greater than zero, a dedicated high-priority
+  // LRU list is maintained by the cache. Similarly, if low_pri_pool_ratio is
+  // greater than zero, a dedicated low-priority LRU list is maintained.
+  // There is also a bottom-priority LRU list, which is always enabled and not
+  // explicitly configurable. Entries are spilled over to the next available
+  // lower-priority pool if a certain pool's capacity is exceeded.
+  //
+  // Entries with cache hits are inserted into the highest priority LRU list
+  // available regardless of the entry's priority. Entries without hits
+  // are inserted into highest priority LRU list available whose priority
+  // does not exceed the entry's priority. (For example, high-priority items
+  // with no hits are placed in the high-priority pool if available;
+  // otherwise, they are placed in the low-priority pool if available;
+  // otherwise, they are placed in the bottom-priority pool.) This results
+  // in lower-priority entries without hits getting evicted from the cache
+  // sooner.
+  //
+  // Default values: high_pri_pool_ratio = 0.5 (which is referred to as
+  // "midpoint insertion"), low_pri_pool_ratio = 0
+  double high_pri_pool_ratio = 0.5;
+  double low_pri_pool_ratio = 0.0;
+
+  // Whether to use adaptive mutexes for cache shards. Note that adaptive
+  // mutexes need to be supported by the platform in order for this to have any
+  // effect. The default value is true if RocksDB is compiled with
+  // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
+  bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
+
+  // A SecondaryCache instance to use as the non-volatile tier.
+  std::shared_ptr<SecondaryCache> secondary_cache;
+
+  LRUCacheOptions() {}
+  LRUCacheOptions(size_t _capacity, int _num_shard_bits,
+                  bool _strict_capacity_limit, double _high_pri_pool_ratio,
+                  std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+                  bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+                  CacheMetadataChargePolicy _metadata_charge_policy =
+                      kDefaultCacheMetadataChargePolicy,
+                  double _low_pri_pool_ratio = 0.0)
+      : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                            std::move(_memory_allocator),
+                            _metadata_charge_policy),
+        high_pri_pool_ratio(_high_pri_pool_ratio),
+        low_pri_pool_ratio(_low_pri_pool_ratio),
+        use_adaptive_mutex(_use_adaptive_mutex) {}
+};
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^num_shard_bits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. If strict_capacity_limit
+// is set, inserts into the cache will fail when the cache is full. Users can
+// also set the fraction of the cache reserved for high-priority entries via
+// high_pri_pool_ratio.
+// num_shard_bits = -1 means it is automatically determined: every shard
+// will be at least 512KB and number of shard bits will not exceed 6.
+extern std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+    std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+    bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy,
+    double low_pri_pool_ratio = 0.0);
+
+extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
+
+// EXPERIMENTAL
+// Options structure for configuring a SecondaryCache instance based on
+// LRUCache. The LRUCacheOptions.secondary_cache is not used and
+// should not be set.
+struct CompressedSecondaryCacheOptions : LRUCacheOptions {
+  // The compression method (if any) that is used to compress data.
+  CompressionType compression_type = CompressionType::kLZ4Compression;
+
+  // compress_format_version can have two values:
+  // compress_format_version == 1 -- decompressed size is not included in the
+  // block header.
+  // compress_format_version == 2 -- decompressed size is included in the block
+  // header in varint32 format.
+  uint32_t compress_format_version = 2;
+
+  // Enable the custom split and merge feature, which splits the compressed
+  // value into chunks so that they may better fit jemalloc bins.
+  bool enable_custom_split_merge = false;
+
+  CompressedSecondaryCacheOptions() {}
+  CompressedSecondaryCacheOptions(
+      size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+      double _high_pri_pool_ratio, double _low_pri_pool_ratio = 0.0,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy,
+      CompressionType _compression_type = CompressionType::kLZ4Compression,
+      uint32_t _compress_format_version = 2,
+      bool _enable_custom_split_merge = false)
+      : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                        _high_pri_pool_ratio, std::move(_memory_allocator),
+                        _use_adaptive_mutex, _metadata_charge_policy,
+                        _low_pri_pool_ratio),
+        compression_type(_compression_type),
+        compress_format_version(_compress_format_version),
+        enable_custom_split_merge(_enable_custom_split_merge) {}
+};
+
+// EXPERIMENTAL
+// Create a new Secondary Cache that is implemented on top of LRUCache.
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+    double low_pri_pool_ratio = 0.0,
+    std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+    bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy,
+    CompressionType compression_type = CompressionType::kLZ4Compression,
+    uint32_t compress_format_version = 2,
+    bool enable_custom_split_merge = false);
+
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    const CompressedSecondaryCacheOptions& opts);
+
+// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
+// that offers much improved CPU efficiency vs. LRUCache under high parallel
+// load or high contention, with some caveats:
+// * Not a general Cache implementation: can only be used for
+//   BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
+//   compatible with HyperClockCache.
+// * Requires an extra tuning parameter: see estimated_entry_charge below.
+//   Similarly, substantially changing the capacity with SetCapacity could
+//   harm efficiency.
+// * SecondaryCache is not yet supported.
+// * Cache priorities are less aggressively enforced, which could cause
+//   cache dilution from long range scans (unless they use fill_cache=false).
+// * Can be worse for small caches, because if almost all of a cache shard is
+//   pinned (more likely with non-partitioned filters), then CLOCK eviction
+//   becomes very CPU intensive.
+//
+// See internal cache/clock_cache.h for full description.
+struct HyperClockCacheOptions : public ShardedCacheOptions {
+  // The estimated average `charge` associated with cache entries. This is a
+  // critical configuration parameter for good performance from the hyper
+  // cache, because having a table size that is fixed at creation time greatly
+  // reduces the required synchronization between threads.
+  // * If the estimate is substantially too low (e.g. less than half the true
+  //   average) then metadata space overhead will be substantially higher
+  //   (e.g. 200 bytes per entry rather than 100). With
+  //   kFullChargeCacheMetadata, this can slightly reduce cache hit rates, and
+  //   slightly reduce access times due to the larger working memory size.
+  // * If the estimate is substantially too high (e.g. 25% higher than the true
+  //   average) then there might not be sufficient slots in the hash table for
+  //   both efficient operation and capacity utilization (hit rate). The hyper
+  //   cache will evict entries to prevent load factors that could dramatically
+  //   affect lookup times, instead letting the hit rate suffer by not
+  //   utilizing the full capacity.
+  //
+  // A reasonable choice is the larger of block_size and metadata_block_size.
+  // When WriteBufferManager (and similar) charge memory usage to the block
+  // cache, this can lead to the same effect as the estimate being too low,
+  // which is better than the opposite. Therefore, the general recommendation
+  // is to assume that other memory charged to block cache could be negligible,
+  // and ignore it in making the estimate.
+  //
+  // The best parameter choice based on a cache in use is given by
+  // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as
+  // with kDontChargeCacheMetadata. More precisely, with
+  // kFullChargeCacheMetadata it is (GetUsage() - 64 * GetTableAddressCount())
+  // / GetOccupancyCount(). However, when the average value size might vary
+  // (e.g. balance between metadata and data blocks in cache), it is better
+  // to estimate toward the lower side than the higher side.
+  size_t estimated_entry_charge;
+
+  HyperClockCacheOptions(
+      size_t _capacity, size_t _estimated_entry_charge,
+      int _num_shard_bits = -1, bool _strict_capacity_limit = false,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy)
+      : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                            std::move(_memory_allocator),
+                            _metadata_charge_policy),
+        estimated_entry_charge(_estimated_entry_charge) {}
+
+  // Construct an instance of HyperClockCache using these options
+  std::shared_ptr<Cache> MakeSharedCache() const;
+};
+
+// DEPRECATED - The old Clock Cache implementation had an unresolved bug and
+// has been removed. The new HyperClockCache requires an additional
+// configuration parameter that is not provided by this API. This function
+// simply returns a new LRUCache for functional compatibility.
+extern std::shared_ptr<Cache> NewClockCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy);
+
+class Cache {
+ public:  // opaque types
+  // Opaque handle to an entry stored in the cache.
+  struct Handle {};
+
+ public:  // type defs
+  // Depending on implementation, cache entries with higher priority levels
+  // could be less likely to get evicted than entries with lower priority
+  // levels. The "high" priority level applies to certain SST metablocks (e.g.
+  // index and filter blocks) if the option
+  // cache_index_and_filter_blocks_with_high_priority is set. The "low"
+  // priority level is used for other kinds of SST blocks (most importantly,
+  // data blocks), as well as the above metablocks in case
+  // cache_index_and_filter_blocks_with_high_priority is not set. The "bottom"
+  // priority level is for BlobDB's blob values.
+  enum class Priority { HIGH, LOW, BOTTOM };
+
+  // A set of callbacks to allow objects in the primary block cache to be
+  // persisted in a secondary cache. The purpose of the secondary cache
+  // is to support other ways of caching the object, such as persistent or
+  // compressed data, that may require the object to be parsed and transformed
+  // in some way. Since the primary cache holds C++ objects and the secondary
+  // cache may only hold flat data that doesn't need relocation, these
+  // callbacks need to be provided by the user of the block
+  // cache to do the conversion.
+  // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers
+  // to callback functions for size, saving and deletion of the
+  // object. The callbacks are defined in C-style in order to make them
+  // stateless and not add to the cache metadata size.
+  // Saving multiple std::function objects will take up 32 bytes per
+  // function, even if it's not bound to an object and does no capture.
+  //
+  // All the callbacks are C-style function pointers in order to simplify
+  // lifecycle management. Objects in the cache can outlive the parent DB,
+  // so anything required for these operations should be contained in the
+  // object itself.
+  //
+  // The SizeCallback takes a void* pointer to the object and returns the size
+  // of the persistable data. It can be used by the secondary cache to allocate
+  // memory if needed.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  using SizeCallback = size_t (*)(void* obj);
+
+  // The SaveToCallback takes a void* object pointer and saves the persistable
+  // data into a buffer. The secondary cache may decide to not store it in a
+  // contiguous buffer, in which case this callback will be called multiple
+  // times with increasing offset.
+  using SaveToCallback = Status (*)(void* from_obj, size_t from_offset,
+                                    size_t length, void* out);
+
+  // A function pointer type for custom destruction of an entry's
+  // value. The Cache is responsible for copying and reclaiming space
+  // for the key, but values are managed by the caller.
+  using DeleterFn = void (*)(const Slice& key, void* value);
+
+  // A struct with pointers to helper functions for spilling items from the
+  // cache into the secondary cache. May be extended in the future. An
+  // instance of this struct is expected to outlive the cache.
+  struct CacheItemHelper {
+    SizeCallback size_cb;
+    SaveToCallback saveto_cb;
+    DeleterFn del_cb;
+
+    CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {}
+    CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb,
+                    DeleterFn _del_cb)
+        : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {}
+  };
+
+  // The CreateCallback is passed by the block cache user to Lookup(). It
+  // takes in a buffer from the NVM cache and constructs an object using
+  // it. The callback doesn't have ownership of the buffer and should
+  // copy the contents into its own buffer.
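+  //
+  // For illustration only (not part of the original header), a sketch of a
+  // CreateCallback that materializes a plain heap-allocated copy of the
+  // buffer and charges its byte size:
+  //
+  //   Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+  //                                        void** out_obj, size_t* charge) {
+  //     char* obj = new char[size];
+  //     memcpy(obj, buf, size);
+  //     *out_obj = obj;
+  //     *charge = size;
+  //     return Status::OK();
+  //   };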
+  using CreateCallback = std::function<Status(const void* buf, size_t size,
+                                              void** out_obj, size_t* charge)>;
+
+ public:  // ctor/dtor/create
+  Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+      : memory_allocator_(std::move(allocator)) {}
+  // No copying allowed
+  Cache(const Cache&) = delete;
+  Cache& operator=(const Cache&) = delete;
+
+  // Destroys all remaining entries by calling the associated "deleter"
+  virtual ~Cache() {}
+
+  // Creates a new Cache based on the input value string and returns the
+  // result. Currently, this method can be used to create LRUCaches only.
+  // @param config_options
+  // @param value The value might be:
+  //   - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*1024)
+  //   - Name-value option pairs -- "capacity=1M; num_shard_bits=4;"
+  //     For the LRUCache, the values are defined in LRUCacheOptions.
+  // @param result The new Cache object
+  // @return OK if the cache was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<Cache>* result);
+
+ public:  // functions
+  // The type of the Cache
+  virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL SecondaryCache support:
+  // Some APIs here are experimental and might change in the future.
+  // The Insert and Lookup APIs below are intended to allow cached objects
+  // to be demoted/promoted between the primary block cache and a secondary
+  // cache. The secondary cache could be a non-volatile cache, and will
+  // likely store the object in a different representation. They rely on a
+  // per object CacheItemHelper to do the conversions.
+  // The secondary cache may persist across process and system restarts,
+  // and may even be moved between hosts. Therefore, the cache key must
+  // be repeatable across restarts/reboots, and globally unique if
+  // multiple DBs share the same cache and the set of DBs can change
+  // over time.
+
+  // Insert a mapping from key->value into the volatile cache only
+  // and assign it with the specified charge against the total cache capacity.
+  // If strict_capacity_limit is true and cache reaches its full capacity,
+  // return Status::MemoryLimit.
+  //
+  // If handle is not nullptr, returns a handle that corresponds to the
+  // mapping. The caller must call this->Release(handle) when the returned
+  // mapping is no longer needed. In case of error, the caller is responsible
+  // for cleaning up the value (i.e. calling "deleter").
+  //
+  // If handle is nullptr, it is as if Release is called immediately after
+  // insert. In case of error, the value will be cleaned up.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter" which must delete the value.
+  // (The Cache is responsible for copying and reclaiming space for
+  // the key.)
+  virtual Status Insert(const Slice& key, void* value, size_t charge,
+                        DeleterFn deleter, Handle** handle = nullptr,
+                        Priority priority = Priority::LOW) = 0;
+
+  // EXPERIMENTAL
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity. If
+  // strict_capacity_limit is true and cache reaches its full capacity,
+  // return Status::MemoryLimit. `value` must be non-nullptr for this
+  // Insert() because Value() == nullptr is reserved for indicating failure
+  // with secondary-cache-compatible mappings.
+  //
+  // The helper argument is saved by the cache and will be used when the
+  // inserted object is evicted or promoted to the secondary cache. It,
+  // therefore, must outlive the cache.
+  //
+  // If handle is not nullptr, returns a handle that corresponds to the
+  // mapping. The caller must call this->Release(handle) when the returned
+  // mapping is no longer needed. In case of error, the caller is responsible
+  // for cleaning up the value (i.e. calling "deleter").
+  //
+  // If handle is nullptr, it is as if Release is called immediately after
+  // insert. In case of error, the value will be cleaned up.
+  //
+  // Regardless of whether the item was inserted into the cache,
+  // it will attempt to insert it into the secondary cache if one is
+  // configured, and the helper supports it.
+  // The cache implementation must support a secondary cache, otherwise
+  // the item is only inserted into the primary cache. It may
+  // defer the insertion to the secondary cache as it sees fit.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Status Insert(const Slice& key, void* value,
+                        const CacheItemHelper* helper, size_t charge,
+                        Handle** handle = nullptr,
+                        Priority priority = Priority::LOW) {
+    if (!helper) {
+      return Status::InvalidArgument();
+    }
+    return Insert(key, value, charge, helper->del_cb, handle, priority);
+  }
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping. The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  // If stats is not nullptr, relative tickers could be used inside the
+  // function.
+  virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0;
+
+  // EXPERIMENTAL
+  // Lookup the key in the primary and secondary caches (if one is configured).
+  // The create_cb callback function object will be used to construct the
+  // cached object.
+  // If none of the caches have the mapping for the key, returns nullptr.
+  // Else, returns a handle that corresponds to the mapping.
+  //
+  // This call may promote the object from the secondary cache (if one is
+  // configured, and has the given key) to the primary cache.
+  //
+  // The helper argument should be provided if the caller wants the lookup
+  // to include the secondary cache (if one is configured) and the object,
+  // if it exists, to be promoted to the primary cache. The helper may be
+  // saved and used later when the object is evicted. Therefore, it must
+  // outlive the cache.
+  //
+  // ======================== Async Lookup (wait=false) ======================
+  // When wait=false, the handle returned might be in any of three states:
+  // * Present - If Value() != nullptr, then the result is present and
+  //   the handle can be used just as if wait=true.
+  // * Pending, not ready (IsReady() == false) - secondary cache is still
+  //   working to retrieve the value. Might become ready any time.
+  // * Pending, ready (IsReady() == true) - secondary cache has the value
+  //   but it has not been loaded into primary cache. Call to Wait()/WaitAll()
+  //   will not block.
+  //
+  // IMPORTANT: Pending handles are not thread-safe, and only these functions
+  // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release()
+  // can only come after Wait() or WaitAll() even though a reference is held.
+  //
+  // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is
+  // safe and has no effect on other handle states.)
After waiting on a Handle, + // it is in one of two states: + // * Present - if Value() != nullptr + // * Failed - if Value() == nullptr, such as if the secondary cache + // initially thought it had the value but actually did not. + // + // Note that given an arbitrary Handle, the only way to distinguish the + // Pending+ready state from the Failed state is to Wait() on it. A cache + // entry not compatible with secondary cache can also have Value()==nullptr + // like the Failed state, but this is not generally a concern. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Increments the reference count for the handle if it refers to an entry in + // the cache. Returns true if refcount was incremented; otherwise, returns + // false. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Ref(Handle* handle) = 0; + + /** + * Release a mapping returned by a previous Lookup(). A released entry might + * still remain in cache in case it is later looked up by others. If + * erase_if_last_ref is set then it also erases it from the cache if there is + * no other reference to it. Erasing it should call the deleter function that + * was provided when the entry was inserted. + * + * Returns true if the entry was also erased. + */ + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains the entry for the key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + // Return a new numeric id. May be used by multiple clients who are + // sharding the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // sets the maximum configured capacity of the cache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage + virtual void SetCapacity(size_t capacity) = 0; + + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Get the flag whether to return error on insertion when cache reaches its + // full capacity. + virtual bool HasStrictCapacityLimit() const = 0; + + // Returns the maximum configured capacity of the cache + virtual size_t GetCapacity() const = 0; + + // Returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). 
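+  //
+  // For illustration only (not part of the original header): together with
+  // GetTableAddressCount(), this allows estimating the table load factor of
+  // a cache that supports these calls:
+  //
+  //   size_t occupancy = cache->GetOccupancyCount();
+  //   size_t slots = cache->GetTableAddressCount();
+  //   double load_factor = (occupancy == SIZE_MAX || slots == 0)
+  //                            ? 0.0  // not supported by this implementation
+  //                            : 1.0 * occupancy / slots;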
+ virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + + // Returns the memory size for a specific entry in the cache. + virtual size_t GetUsage(Handle* handle) const = 0; + + // Returns the memory size for the entries in use by the system + virtual size_t GetPinnedUsage() const = 0; + + // Returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData() { + // default implementation is noop + } + + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } + + // Remove all entries. + // Prerequisite: no entry is referenced. + virtual void EraseUnRefEntries() = 0; + + virtual std::string GetPrintableOptions() const { return ""; } + + // Check for any warnings or errors in the operation of the cache and + // report them to the logger. This is intended only to be called + // periodically so does not need to be very efficient. (Obscure calling + // conventions for Logger inherited from env.h) + virtual void ReportProblems( + const std::shared_ptr<Logger>& /*info_log*/) const {} + + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + + // EXPERIMENTAL + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. 
As noted elsewhere,
+  // "pending" handles require Wait()/WaitAll() before Release().
+  virtual bool Release(Handle* handle, bool /*useful*/,
+                       bool erase_if_last_ref) {
+    return Release(handle, erase_if_last_ref);
+  }
+
+  // EXPERIMENTAL
+  // Determines if the handle returned by Lookup() can give a value without
+  // blocking, though Wait()/WaitAll() might be required to publish it to
+  // Value(). See secondary cache compatible Lookup() above for details.
+  // This call is not thread safe on "pending" handles.
+  virtual bool IsReady(Handle* /*handle*/) { return true; }
+
+  // EXPERIMENTAL
+  // Convert a "pending" handle into a full thread-shareable handle by
+  // * If necessary, waiting until the secondary cache finishes loading the
+  //   value, and
+  // * Constructing the value for the primary cache and setting it in the
+  //   handle.
+  // Even after Wait() on a pending handle, the caller must check for
+  // Value() == nullptr in case of failure. This call is not thread-safe
+  // on pending handles. This call has no effect on non-pending handles.
+  // See secondary cache compatible Lookup() above for details.
+  virtual void Wait(Handle* /*handle*/) {}
+
+  // EXPERIMENTAL
+  // Wait for a vector of handles to become ready. As with Wait(), the user
+  // should check the Value() of each handle for nullptr. This call is not
+  // thread-safe on pending handles.
+  virtual void WaitAll(std::vector<Handle*>& /*handles*/) {}
+
+ private:
+  std::shared_ptr<MemoryAllocator> memory_allocator_;
+};
+
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cache_bench_tool.h b/src/rocksdb/include/rocksdb/cache_bench_tool.h
new file mode 100644
index 000000000..413ce1593
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache_bench_tool.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+int cache_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h
new file mode 100644
index 000000000..afc736673
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cleanable.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cleanable {
+ public:
+  Cleanable();
+  // No copy constructor and copy assignment allowed.
+  Cleanable(Cleanable&) = delete;
+  Cleanable& operator=(Cleanable&) = delete;
+
+  // Executes all the registered cleanups
+  ~Cleanable();
+
+  // Move constructor and move assignment are allowed.
+  Cleanable(Cleanable&&) noexcept;
+  Cleanable& operator=(Cleanable&&) noexcept;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this object is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  using CleanupFunction = void (*)(void* arg1, void* arg2);
+
+  // Add another Cleanup to the list
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+  // Move the cleanups owned by this Cleanable to another Cleanable, adding to
+  // any existing cleanups it has
+  void DelegateCleanupsTo(Cleanable* other);
+
+  // DoCleanup and also resets the pointers for reuse
+  inline void Reset() {
+    DoCleanup();
+    cleanup_.function = nullptr;
+    cleanup_.next = nullptr;
+  }
+
+  inline bool HasCleanups() { return cleanup_.function != nullptr; }
+
+ protected:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+  // It also becomes the owner of c
+  void RegisterCleanup(Cleanup* c);
+
+ private:
+  // Performs all the cleanups. It does not reset the pointers. Making it
+  // private to prevent misuse
+  inline void DoCleanup() {
+    if (cleanup_.function != nullptr) {
+      (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+      for (Cleanup* c = cleanup_.next; c != nullptr;) {
+        (*c->function)(c->arg1, c->arg2);
+        Cleanup* next = c->next;
+        delete c;
+        c = next;
+      }
+    }
+  }
+};
+
+// A copyable, reference-counted pointer to a simple Cleanable that only
+// performs registered cleanups after all copies are destroyed. This is like
+// shared_ptr<Cleanable> but works more efficiently with wrapping the pointer
+// in an outer Cleanable (see RegisterCopyWith() and MoveAsCleanupTo()).
+// WARNING: if you create a reference cycle, for example:
+//   SharedCleanablePtr scp;
+//   scp.Allocate();
+//   scp.RegisterCopyWith(&*scp);
+// It will prevent cleanups from ever happening!
+class SharedCleanablePtr {
+ public:
+  // Empty/null pointer
+  SharedCleanablePtr() {}
+  // Copy and move constructors and assignment
+  SharedCleanablePtr(const SharedCleanablePtr& from);
+  SharedCleanablePtr(SharedCleanablePtr&& from) noexcept;
+  SharedCleanablePtr& operator=(const SharedCleanablePtr& from);
+  SharedCleanablePtr& operator=(SharedCleanablePtr&& from) noexcept;
+  // Destructor (decrement refcount if non-null)
+  ~SharedCleanablePtr();
+  // Create a new simple Cleanable and make this pointer refer to it.
+  // (Reset()s first if necessary.)
+  void Allocate();
+  // Reset to empty/null (decrement refcount if previously non-null)
+  void Reset();
+  // Dereference to pointed-to Cleanable
+  Cleanable& operator*();
+  Cleanable* operator->();
+  // Get as raw pointer to Cleanable
+  Cleanable* get();
+
+  // Creates a (virtual) copy of this SharedCleanablePtr and registers its
+  // destruction with target, so that the cleanups registered with the
+  // Cleanable pointed to by this can only happen after the cleanups in the
+  // target Cleanable are run.
+  // No-op if this is empty (nullptr).
+  void RegisterCopyWith(Cleanable* target);
+
+  // Moves (virtually) this shared pointer to a new cleanup in the target.
+  // This is essentially a move-semantics version of RegisterCopyWith(), for
+  // performance optimization. No-op if this is empty (nullptr).
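+  //
+  // For illustration only (not part of the original header), a hedged usage
+  // sketch; `FreeBuffer`, `buf`, and the two holder Cleanables are
+  // hypothetical:
+  //
+  //   SharedCleanablePtr scp;
+  //   scp.Allocate();
+  //   scp->RegisterCleanup(&FreeBuffer, buf, nullptr);
+  //   scp.RegisterCopyWith(&holder1);  // holder1 now shares ownership
+  //   scp.MoveAsCleanupTo(&holder2);   // hand off this reference; scp empty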
+  void MoveAsCleanupTo(Cleanable* target);
+
+ private:
+  struct Impl;
+  Impl* ptr_ = nullptr;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
new file mode 100644
index 000000000..9c6a9c30d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+
+// CompactionFilter allows an application to modify/delete a key-value during
+// table file creation.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionFilter : public Customizable {
+ public:
+  enum ValueType {
+    kValue,
+    kMergeOperand,
+    kBlobIndex,  // used internally by BlobDB.
+  };
+
+  enum class Decision {
+    kKeep,
+    kRemove,
+    kChangeValue,
+    kRemoveAndSkipUntil,
+    kChangeBlobIndex,  // used internally by BlobDB.
+    kIOError,          // used internally by BlobDB.
+    kPurge,            // used for keys that can only be SingleDelete'ed
+    kUndetermined,
+  };
+
+  enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError };
+
+  // Context information for a table file creation.
+  struct Context {
+    // Whether this table file is created as part of a compaction including all
+    // table files.
+    bool is_full_compaction;
+    // Whether this table file is created as part of a compaction requested by
+    // the client.
+    bool is_manual_compaction;
+    // The column family that will contain the created table file.
+    uint32_t column_family_id;
+    // Reason this table file is being created.
+    TableFileCreationReason reason;
+  };
+
+  virtual ~CompactionFilter() {}
+  static const char* Type() { return "CompactionFilter"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& name,
+                                 const CompactionFilter** result);
+
+  // The table file creation process invokes this method before adding a kv to
+  // the table file. A return value of false indicates that the kv should be
+  // preserved in the new table file and a return value of true indicates
+  // that this key-value should be removed from the new table file. The
+  // application can inspect the existing value of the key and make a decision
+  // based on it.
+  //
+  // Key-values that are results of a merge operation during table file
+  // creation are not passed into this function. Currently, when you have a
+  // mix of Put()s and Merge()s on the same key, we only guarantee to process
+  // the merge operands through the `CompactionFilter`s. Put()s might be
+  // processed, or might not.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // Note that RocksDB snapshots (i.e., calling the GetSnapshot() API on a
+  // DB* object) do not guarantee to preserve the state of the DB with
+  // CompactionFilter. Data seen from a snapshot might disappear after a
+  // table file created with a `CompactionFilter` is installed. If you use
+  // snapshots, think twice about whether you want to use `CompactionFilter`
+  // and whether you are using it in a safe way.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently. The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the table file creation, and this
+  // call does not need to be thread-safe. However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int /*level*/, const Slice& /*key*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      bool* /*value_changed*/) const {
+    return false;
+  }
+
+  // The table file creation process invokes this method on every merge
+  // operand. If this method returns true, the merge operand will be ignored
+  // and not written out in the new table file.
+  //
+  // Note: If you are using a TransactionDB, it is not recommended to implement
+  // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB
+  // may not realize there is a write conflict and may allow a Transaction to
+  // Commit that should have failed. Instead, it is better to implement any
+  // Merge filtering inside the MergeOperator.
+  virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+                                  const Slice& /*operand*/) const {
+    return false;
+  }
+
+  // An extended API. Called for both values and merge operands.
+  // Allows changing value and skipping ranges of keys.
+  // The default implementation uses Filter() and FilterMergeOperand().
+  // If you're overriding this method, no need to override the other two.
+  // `value_type` indicates whether this key-value corresponds to a normal
+  // value (e.g. written with Put()) or a merge operand (written with Merge()).
+  //
+  // Possible return values:
+  //  * kKeep - keep the key-value pair.
+  //  * kRemove - remove the key-value pair or merge operand.
+  //  * kChangeValue - keep the key and change the value/operand to *new_value.
+  //  * kRemoveAndSkipUntil - remove this key-value pair, and also remove
+  //    all key-value pairs with key in [key, *skip_until). This range
+  //    of keys will be skipped without reading, potentially saving some
+  //    IO operations compared to removing the keys one by one.
+  //
+  //    *skip_until <= key is treated the same as Decision::kKeep
+  //    (since the range [key, *skip_until) is empty).
+  //
+  //    Caveats:
+  //    - The keys are skipped even if there are snapshots containing them,
+  //      i.e. values removed by kRemoveAndSkipUntil can disappear from a
+  //      snapshot - beware if you're using TransactionDB or
+  //      DB::GetSnapshot().
+  //    - If value for a key was overwritten or merged into (multiple Put()s
+  //      or Merge()s), and `CompactionFilter` skips this key with
+  //      kRemoveAndSkipUntil, it's possible that it will remove only
+  //      the new value, exposing the old value that was supposed to be
+  //      overwritten.
+  //    - Doesn't work with PlainTableFactory in prefix mode.
+  //    - If you use kRemoveAndSkipUntil for table files created by
+  //      compaction, consider also reducing the compaction_readahead_size
+  //      option.
+  //
+  // Should never return kUndetermined.
+  // Note: If you are using a TransactionDB, it is not recommended to filter
+  // out or modify merge operands (ValueType::kMergeOperand).
+  // If a merge operation is filtered out, TransactionDB may not realize there
+  // is a write conflict and may allow a Transaction to Commit that should have
+  // failed. Instead, it is better to implement any Merge filtering inside the
+  // MergeOperator.
+  // key includes timestamp if user-defined timestamp is enabled.
+  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                            const Slice& existing_value,
+                            std::string* new_value,
+                            std::string* /*skip_until*/) const {
+    switch (value_type) {
+      case ValueType::kValue: {
+        bool value_changed = false;
+        bool rv =
+            Filter(level, key, existing_value, new_value, &value_changed);
+        if (rv) {
+          return Decision::kRemove;
+        }
+        return value_changed ? Decision::kChangeValue : Decision::kKeep;
+      }
+      case ValueType::kMergeOperand: {
+        bool rv = FilterMergeOperand(level, key, existing_value);
+        return rv ? Decision::kRemove : Decision::kKeep;
+      }
+      case ValueType::kBlobIndex:
+        return Decision::kKeep;
+    }
+    assert(false);
+    return Decision::kKeep;
+  }
+
+  // Internal (BlobDB) use only. Do not override in application code.
+  virtual BlobDecision PrepareBlobOutput(const Slice& /* key */,
+                                         const Slice& /* existing_value */,
+                                         std::string* /* new_value */) const {
+    return BlobDecision::kKeep;
+  }
+
+  // This function is deprecated. Snapshots will always be ignored for
+  // `CompactionFilter`s, because we realized that not ignoring snapshots
+  // doesn't provide the guarantee we initially thought it would provide.
+  // Repeatable reads will not be guaranteed anyway. If you override this
+  // function and return false, we will fail the table file creation.
+  virtual bool IgnoreSnapshots() const { return true; }
+
+  // Returns a name that identifies this `CompactionFilter`.
+  // The name will be printed to the LOG file on start up for diagnosis.
+  const char* Name() const override = 0;
+
+  // Internal (BlobDB) use only. Do not override in application code.
+  virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; }
+
+  // In the case of BlobDB, it may be possible to reach a decision with only
+  // the key without reading the actual value. Keys whose value_type is
+  // kBlobIndex will be checked by this method.
+  // Returning kUndetermined will cause FilterV2() to be called to make a
+  // decision as usual.
+  virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/,
+                                   std::string* /*new_value*/,
+                                   std::string* /*skip_until*/) const {
+    return Decision::kUndetermined;
+  }
+};
+
+// Each thread of work involving creating table files will create a new
+// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This
+// allows the application to know about the different ongoing threads of work
+// and makes it unnecessary for `CompactionFilter` to provide thread-safety.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
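+//
+// For illustration only (not part of the original header), a minimal factory
+// sketch, assuming a user-defined `MyCompactionFilter` exists:
+//
+//   class MyFilterFactory : public CompactionFilterFactory {
+//    public:
+//     std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+//         const CompactionFilter::Context& /*context*/) override {
+//       return std::unique_ptr<CompactionFilter>(new MyCompactionFilter());
+//     }
+//     const char* Name() const override { return "MyFilterFactory"; }
+//   };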
+class CompactionFilterFactory : public Customizable { + public: + virtual ~CompactionFilterFactory() {} + static const char* Type() { return "CompactionFilterFactory"; } + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& name, + std::shared_ptr<CompactionFilterFactory>* result); + + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this `CompactionFilter` factory. + virtual const char* Name() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h new file mode 100644 index 000000000..5ff8eccc8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +struct CompactionJobStats { + CompactionJobStats() { Reset(); } + void Reset(); + // Aggregate the CompactionJobStats from another instance with this one + void Add(const CompactionJobStats& stats); + + // the elapsed time of this compaction in microseconds. + uint64_t elapsed_micros; + + // the elapsed CPU time of this compaction in microseconds. + uint64_t cpu_micros; + + // the number of compaction input records. + uint64_t num_input_records; + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) + size_t num_input_files; + // the number of compaction input files at the output level (table files) + size_t num_input_files_at_output_level; + + // the number of compaction output records. + uint64_t num_output_records; + // the number of compaction output files (table files) + size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; + + // true if the compaction is a full compaction (all live SST files input) + bool is_full_compaction; + // true if the compaction is a manual compaction + bool is_manual_compaction; + + // the total size of table files in the compaction input + uint64_t total_input_bytes; + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output + uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; + + // number of records being replaced by newer record associated with same key. + // this could be a new value or a deletion entry for that key so this field + // sums up all updated and deleted keys + uint64_t num_records_replaced; + + // the sum of the uncompressed input keys in bytes. 
+  uint64_t total_input_raw_key_bytes;
+  // the sum of the uncompressed input values in bytes.
+  uint64_t total_input_raw_value_bytes;
+
+  // the number of deletion entries before compaction. Deletion entries
+  // can disappear after compaction because they expired
+  uint64_t num_input_deletion_records;
+  // number of deletion records that were found obsolete and discarded
+  // because it is not possible to delete any more keys with this entry
+  // (i.e. all possible deletions resulting from it have been completed)
+  uint64_t num_expired_deletion_records;
+
+  // number of corrupt keys (ParseInternalKey returned false when applied to
+  // the key) encountered and written out.
+  uint64_t num_corrupt_keys;
+
+  // Following counters are only populated if
+  // options.report_bg_io_stats = true;
+
+  // Time spent on file's Append() call.
+  uint64_t file_write_nanos;
+
+  // Time spent on sync file range.
+  uint64_t file_range_sync_nanos;
+
+  // Time spent on file fsync.
+  uint64_t file_fsync_nanos;
+
+  // Time spent on preparing file write (fallocate, etc)
+  uint64_t file_prepare_write_nanos;
+
+  // 0-terminated strings storing the first 8 bytes of the smallest and
+  // largest key in the output.
+  static const size_t kMaxPrefixLength = 8;
+
+  std::string smallest_output_key_prefix;
+  std::string largest_output_key_prefix;
+
+  // number of single-deletes which do not meet a put
+  uint64_t num_single_del_fallthru;
+
+  // number of single-deletes which meet something other than a put
+  uint64_t num_single_del_mismatch;
+
+  // TODO: Add output_to_penultimate_level output information
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
new file mode 100644
index 000000000..ad1e71a11
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// The general interface for comparing two Slices is defined for both the
+// Comparator and some internal data structures.
+class CompareInterface {
+ public:
+  virtual ~CompareInterface() {}
+
+  // Three-way comparison. Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  // Note that Compare(a, b) also compares timestamp if timestamp size is
+  // non-zero. For the same user key with different timestamps, larger (newer)
+  // timestamp comes first.
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+};
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database. A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
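As a reference point for the interface documented above, a minimal sketch of a conforming comparator follows. It assumes every key in the column family is exactly eight bytes, interpreted as a host-endian unsigned integer; the class name and key encoding are illustrative, not part of this header:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

namespace example {

// Orders fixed-width 8-byte keys numerically rather than bytewise.
class U64Comparator : public ROCKSDB_NAMESPACE::Comparator {
 public:
  const char* Name() const override { return "example.U64Comparator"; }

  int Compare(const ROCKSDB_NAMESPACE::Slice& a,
              const ROCKSDB_NAMESPACE::Slice& b) const override {
    assert(a.size() == 8 && b.size() == 8);  // fixed-width keys assumed
    uint64_t ua = 0, ub = 0;
    std::memcpy(&ua, a.data(), sizeof(ua));
    std::memcpy(&ub, b.data(), sizeof(ub));
    return ua < ub ? -1 : (ua > ub ? 1 : 0);
  }

  // Doing nothing here is correct, just unoptimized, per the comments in
  // the class that follows.
  void FindShortestSeparator(std::string* /*start*/,
                             const ROCKSDB_NAMESPACE::Slice& /*limit*/)
      const override {}
  void FindShortSuccessor(std::string* /*key*/) const override {}
};

}  // namespace example

A comparator instance must outlive every DB that uses it, so it is usually exposed through a function returning a long-lived (e.g. function-local static) instance, mirroring `BytewiseComparator()` further down in this header.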
+class Comparator : public Customizable, public CompareInterface {
+ public:
+  Comparator() : timestamp_size_(0) {}
+
+  Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {}
+
+  Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {}
+
+  Comparator& operator=(const Comparator& rhs) {
+    if (this != &rhs) {
+      timestamp_size_ = rhs.timestamp_size_;
+    }
+    return *this;
+  }
+
+  ~Comparator() override {}
+
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& id,
+                                 const Comparator** comp);
+  static const char* Type() { return "Comparator"; }
+
+  // The name of the comparator. Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  const char* Name() const override = 0;
+
+  // Compares two slices for equality. The following invariant should always
+  // hold (and is the default implementation):
+  //   Equal(a, b) iff Compare(a, b) == 0
+  // Override this only if equality comparisons can be done more efficiently
+  // than three-way comparisons.
+  virtual bool Equal(const Slice& a, const Slice& b) const {
+    return Compare(a, b) == 0;
+  }
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+
+  // Given two keys, determine whether t is the immediate successor of s.
+  // BUG: only return true if no other keys starting with `t` are ordered
+  // before `t`. Otherwise, the auto_prefix_mode can omit entries within
+  // iterator bounds that have same prefix as upper bound but different
+  // prefix from seek key.
+  virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
+                                              const Slice& /*t*/) const {
+    return false;
+  }
+
+  // Return true if two keys with different byte sequences can be regarded
+  // as equal by this comparator.
+  // The major use case is to determine if DataBlockHashIndex is compatible
+  // with the customized comparator.
+  virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+
+  // If it is a wrapped comparator, may return the root one.
+  // Returns itself if it is not wrapped.
+ virtual const Comparator* GetRootComparator() const { return this; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + // For two events e1 and e2 whose timestamps are t1 and t2 respectively, + // Returns value: + // < 0 iff t1 < t2 + // == 0 iff t1 == t2 + // > 0 iff t1 > t2 + // Note that an all-zero byte array will be the smallest (oldest) timestamp + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, + const Slice& b, bool /*b_has_ts*/) const { + return Compare(a, b); + } + + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + + private: + size_t timestamp_size_; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compression_type.h b/src/rocksdb/include/rocksdb/compression_type.h new file mode 100644 index 000000000..bfeb00bde --- /dev/null +++ b/src/rocksdb/include/rocksdb/compression_type.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. + +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. 
+  kDisableCompressionOption = 0xff,
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
new file mode 100644
index 000000000..9ad741f98
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is NOT an extensible interface but a public interface for the result
+// of NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal.
+class ConcurrentTaskLimiter {
+ public:
+  virtual ~ConcurrentTaskLimiter() {}
+
+  // Returns a name that identifies this concurrent task limiter.
+  virtual const std::string& GetName() const = 0;
+
+  // Set max concurrent tasks.
+  // limit = 0 means no new task allowed.
+  // limit < 0 means no limitation.
+  virtual void SetMaxOutstandingTask(int32_t limit) = 0;
+
+  // Reset to unlimited max concurrent tasks.
+  virtual void ResetMaxOutstandingTask() = 0;
+
+  // Returns current outstanding task count.
+  virtual int32_t GetOutstandingTask() const = 0;
+};
+
+// Create a ConcurrentTaskLimiter that can be shared with multiple CFs
+// across RocksDB instances to control concurrent tasks.
+//
+// @param name: Name of the limiter.
+// @param limit: max concurrent tasks.
+//     limit = 0 means no new task allowed.
+//     limit < 0 means no limitation.
+extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+                                                       int32_t limit);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/configurable.h b/src/rocksdb/include/rocksdb/configurable.h
new file mode 100644
index 000000000..60ae89f97
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/configurable.h
@@ -0,0 +1,400 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+class ObjectRegistry;
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+
+// Configurable is a base class used by RocksDB that describes a
+// standard way of configuring objects. A Configurable object can:
+// -> Populate itself given:
+//    - One or more "name/value" pair strings
+//    - A string representing the set of name=value properties
+//    - A map of name/value properties.
+// -> Convert itself into its string representation +// -> Dump itself to a Logger +// -> Compare itself to another Configurable object to see if the two objects +// have equivalent options settings +// +// If a derived class calls RegisterOptions to register (by name) how its +// options objects are to be processed, this functionality can typically be +// handled by this class without additional overrides. Otherwise, the derived +// class will need to implement the methods for handling the corresponding +// functionality. +class Configurable { + protected: + friend class ConfigurableHelper; + struct RegisteredOptions { + // The name of the options being registered + std::string name; + // Pointer to the object being registered + void* opt_ptr; +#ifndef ROCKSDB_LITE + // The map of options being registered + const std::unordered_map<std::string, OptionTypeInfo>* type_map; +#endif + }; + + public: + virtual ~Configurable() {} + + // Returns the raw pointer of the named options that is used by this + // object, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the object owns the + // pointer and the caller should not delete the pointer. + // + // Note that changing the underlying options while the object + // is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + template <typename T> + const T* GetOptions() const { + return GetOptions<T>(T::kName()); + } + template <typename T> + T* GetOptions() { + return GetOptions<T>(T::kName()); + } + template <typename T> + const T* GetOptions(const std::string& name) const { + return reinterpret_cast<const T*>(GetOptionsPtr(name)); + } + template <typename T> + T* GetOptions(const std::string& name) { + return reinterpret_cast<T*>(const_cast<void*>(GetOptionsPtr(name))); + } + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. + // If this method fails, an attempt is made to revert the object to original + // state. Note that the revert may not be the original state but may be an + // equivalent. For example, if the object contains an option that is a + // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not + // the Cache object that was passed in, but a Cache object of the same size). + // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. 
+ // @see ConfigOptions for a description of the controls. + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map<std::string, std::string>& opt_map); + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map<std::string, std::string>& opt_map, + std::unordered_map<std::string, std::string>* unused); + +#ifndef ROCKSDB_LITE + // Updates the named option to the input value, returning OK if successful. + // Note that ConfigureOption does not cause PrepareOptions to be invoked. + // @param config_options Controls how the name/value is processed. + // @param name The name of the option to update + // @param value The value to set for the named option + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @return NotSupported If the name is valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If the value cannot be successfully parsed. + Status ConfigureOption(const ConfigOptions& config_options, + const std::string& name, const std::string& value); +#endif // ROCKSDB_LITE + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. If this method fails, an attempt is made to revert the + // object to original state. Note that the revert may not be the original + // state but may be an equivalent. + // @see ConfigureFromMap for more details + // @param config_options Controls how the arguments are processed. + // @param opt_str string containing the values to update. + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all specified values were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names were not valid for this object. + // If unused is specified, it will contain the collection of NotFound + // names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + Status ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts); + + // Fills in result with the serialized options for this object. + // This is the inverse of ConfigureFromString. + // @param config_options Controls how serialization happens. + // @param result The string representation of this object. + // @return OK If the options for this object were successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. + Status GetOptionString(const ConfigOptions& config_options, + std::string* result) const; +#ifndef ROCKSDB_LITE + // Returns the serialized options for this object. + // This method is similar to GetOptionString with no errors. + // @param config_options Controls how serialization happens. + // @param prefix A string to prepend to every option. 
+  // @return The serialized representation of the options for this object
+  std::string ToString(const ConfigOptions& config_options) const {
+    return ToString(config_options, "");
+  }
+  std::string ToString(const ConfigOptions& config_options,
+                       const std::string& prefix) const;
+
+  // Returns the list of option names associated with this configurable
+  // @param config_options Controls how the names are returned
+  // @param result The set of option names for this object. Note that
+  //     options that are deprecated or aliases are not returned.
+  // @return OK on success.
+  Status GetOptionNames(const ConfigOptions& config_options,
+                        std::unordered_set<std::string>* result) const;
+
+  // Returns the value of the option associated with the input name
+  // This method is the functional inverse of ConfigureOption
+  // @param config_options Controls how the value is returned
+  // @param name The name of the option to return a value for.
+  // @param value The returned value associated with the named option.
+  // @return OK If the value of the named option was successfully retrieved.
+  // @return NotFound If the name is not valid for this object.
+  // @return InvalidArgument If the name is valid for this object but
+  //     its value cannot be serialized.
+  virtual Status GetOption(const ConfigOptions& config_options,
+                           const std::string& name, std::string* value) const;
+#endif  // ROCKSDB_LITE
+
+  // Checks to see if this Configurable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //     the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+  virtual bool AreEquivalent(const ConfigOptions& config_options,
+                             const Configurable* other,
+                             std::string* name) const;
+
+  // Returns a pretty-printed, human-readable version of the options.
+  // This method is typically used to dump the options to a log file.
+  // Classes should override this method to produce a useful summary of
+  // their options.
+  virtual std::string GetPrintableOptions() const { return ""; }
+
+  // Validates that the settings are valid/consistent and performs any object
+  // initialization required by this object. This method may be called as part
+  // of Configure (if invoke_prepare_options is set), or may be invoked
+  // separately.
+  //
+  // Once an object has been prepared, non-mutable options can no longer be
+  // updated.
+  //
+  // Classes must override this method to provide any implementation-specific
+  // initialization, such as opening log files or setting up cache parameters.
+  // Implementations should be idempotent (e.g. don't re-open the log file or
+  // reconfigure the cache), as there is the potential this method can be
+  // called more than once.
+  //
+  // By default, this method will also prepare all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param config_options Controls how the object is prepared. Also contains
+  //     a Logger and Env that can be used to initialize this object.
+  // @return OK If the object was successfully initialized.
+  // @return InvalidArgument If this object could not be successfully
+  //     initialized.
+  virtual Status PrepareOptions(const ConfigOptions& config_options);
+
+  // Checks to see if the settings are valid for this object.
+  // This method checks to see if the input DBOptions and ColumnFamilyOptions
+  // are valid for the settings of this object. For example, an Env might not
+  // support certain mmap modes or a TableFactory might require certain
+  // settings.
+  //
+  // By default, this method will also validate all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param db_opts The DBOptions to validate
+  // @param cf_opts The ColumnFamilyOptions to validate
+  // @return OK if the options are valid
+  // @return InvalidArgument If the arguments are not valid for the options
+  //     of the current object.
+  virtual Status ValidateOptions(const DBOptions& db_opts,
+                                 const ColumnFamilyOptions& cf_opts) const;
+
+  // Splits the input opt_value into the ID field and the remaining options.
+  // The input opt_value can be in the form of "name" or
+  // "name=value [;name=value]". The first form uses the "name" as an id with
+  // no options. The latter form converts the input into a map of name=value
+  // pairs and sets "id" to the "id" value from the map.
+  // @param opt_value The value to split into id and options
+  // @param id The id field from the opt_value
+  // @param options The remaining name/value pairs from the opt_value
+  // @param default_id If specified and there is no id field in the map, this
+  //     value is returned as the ID
+  // @return OK if the value was converted to a map successfully and an ID was
+  //     found.
+  // @return InvalidArgument if the value could not be converted to a map, or
+  //     there is no id property in the map.
+  static Status GetOptionsMap(
+      const std::string& opt_value, const std::string& default_id,
+      std::string* id, std::unordered_map<std::string, std::string>* options);
+
+ protected:
+  // Returns the raw pointer for the associated named option.
+  // The name is typically the name of an option registered via
+  // RegisterOptions. Classes may override this method to provide further
+  // specialization (such as returning a sub-option).
+  //
+  // The default implementation looks at the registered options. If the
+  // input name matches that of a registered option, the pointer registered
+  // with that name is returned.
+  // e.g., RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns
+  // "my_ptr".
+  virtual const void* GetOptionsPtr(const std::string& name) const;
+
+  // Method for allowing options to be configured outside of the normal
+  // registered options framework. Classes may override this method if they
+  // wish to support non-standard options implementations (such as configuring
+  // themselves from constant or simple ":"-separated strings).
+  //
+  // The default implementation does nothing and returns OK.
+  virtual Status ParseStringOptions(const ConfigOptions& config_options,
+                                    const std::string& opts_str);
+
+  // Internal method to configure an object from a map of name-value options.
+  // This method uses the input config_options to drive the configuration of
+  // the options in opt_map. Any option name that cannot be found from the
+  // input set will be returned in "unused".
+  //
+  // Classes may override this method to extend the functionality if required.
+  // @param config_options Controls how the options are configured and errors
+  //     handled.
+  // @param opts_map The set of options to configure
+  // @param unused Any options from opt_map that were not configured.
+  // @returns a Status based on the rules outlined in ConfigureFromMap
+  virtual Status ConfigureOptions(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      std::unordered_map<std::string, std::string>* unused);
+
+#ifndef ROCKSDB_LITE
+  // Method that configures the specific opt_name from opt_value.
+  // By default, this method calls opt_info.ParseOption with the
+  // input parameters.
+  // Classes may override this method to extend the functionality, or
+  // change the returned Status.
+  virtual Status ParseOption(const ConfigOptions& config_options,
+                             const OptionTypeInfo& opt_info,
+                             const std::string& opt_name,
+                             const std::string& opt_value, void* opt_ptr);
+
+  // Internal method to see if the single option name/info matches for this
+  // and that. Classes may override this method to change its behavior.
+  // @param config_options Controls how the options are being matched
+  // @param opt_info The OptionTypeInfo registered for this option name
+  //     that controls what field is matched (offset) and how (type).
+  // @param name The name associated with this opt_info.
+  // @param this_ptr The base pointer to compare to. This is the object
+  //     registered for this OptionTypeInfo.
+  // @param that_ptr The other pointer to compare to. This is the object
+  //     registered for this OptionTypeInfo.
+  // @param bad_name If the match fails, the name of the option that failed to
+  //     match.
+  virtual bool OptionsAreEqual(const ConfigOptions& config_options,
+                               const OptionTypeInfo& opt_info,
+                               const std::string& name,
+                               const void* const this_ptr,
+                               const void* const that_ptr,
+                               std::string* bad_name) const;
+#endif
+#ifndef ROCKSDB_LITE
+  // Internal method to serialize options (ToString).
+  // Classes may override this method to change its behavior.
+  virtual std::string SerializeOptions(const ConfigOptions& config_options,
+                                       const std::string& header) const;
+#endif  // ROCKSDB_LITE
+
+  // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt).
+  virtual std::string GetOptionName(const std::string& long_name) const;
+
+  // Registers the input name with the options and associated map.
+  // When classes register their options in this manner, most of the
+  // functionality (excluding unknown options and validate/prepare) is
+  // implemented by the base class.
+  //
+  // This method should be called in the class constructor to register the
+  // option set for this object. For example, to register the options
+  // associated with the BlockBasedTableFactory, the constructor calls this
+  // method passing in:
+  // - the name of the options ("BlockBasedTableOptions");
+  // - the options object (the BlockBasedTableOptions object for this object);
+  // - the options type map for the BlockBasedTableOptions.
+  // This registration allows the Configurable class to process the option
+  // values associated with the BlockBasedTableOptions without further code in
+  // the derived class.
+  //
+  // @param name The name of this set of options (@see GetOptionsPtr)
+  // @param opt_ptr Pointer to the options to associate with this name
+  // @param opt_map Options map that controls how this option is configured.
+  template <typename T>
+  void RegisterOptions(
+      T* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map) {
+    RegisterOptions(T::kName(), opt_ptr, opt_map);
+  }
+  void RegisterOptions(
+      const std::string& name, void* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map);
+
+  // Returns true if there are registered options for this Configurable object
+  inline bool HasRegisteredOptions() const { return !options_.empty(); }
+
+ private:
+  // Contains the collection of options (name, opt_ptr, opt_map) associated
+  // with this object. This collection is typically set in the constructor of
+  // the Configurable option via RegisterOptions().
+  std::vector<RegisteredOptions> options_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h
new file mode 100644
index 000000000..921ec221b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/convenience.h
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Env;
+class Logger;
+class ObjectRegistry;
+
+struct ColumnFamilyOptions;
+struct DBOptions;
+struct Options;
+
+// ConfigOptions contains the parameters/controls for
+// comparing objects and converting to/from strings.
+// These settings control how the methods
+// treat errors (e.g. ignore_unknown_objects), the format
+// of the serialization (e.g. delimiter), and how to compare
+// options (sanity_level).
+struct ConfigOptions {
+  // Constructs a new ConfigOptions with a new object registry.
+  // This method should only be used when a DBOptions is not available,
+  // else registry settings may be lost.
+  ConfigOptions();
+
+  // Constructs a new ConfigOptions using the settings from
+  // the input DBOptions. Currently constructs a new object registry.
+  explicit ConfigOptions(const DBOptions&);
+
+  // This enum defines the RocksDB options sanity level.
+  enum SanityLevel : unsigned char {
+    kSanityLevelNone = 0x01,  // Performs no sanity check at all.
+    // Performs minimum check to ensure the RocksDB instance can be
+    // opened without corrupting / mis-interpreting the data.
+    kSanityLevelLooselyCompatible = 0x02,
+    // Perform exact match sanity check.
+    kSanityLevelExactMatch = 0xFF,
+  };
+
+  enum Depth {
+    kDepthDefault,   // Traverse nested options that are not flagged as "shallow"
+    kDepthShallow,   // Do not traverse into any nested options
+    kDepthDetailed,  // Traverse nested options, overriding the options shallow
+                     // setting
+  };
+
+  // When true, any unused options will be ignored and OK will be returned
+  bool ignore_unknown_options = false;
+
+  // When true, any unsupported options will be ignored and OK will be returned
+  bool ignore_unsupported_options = true;
+
+  // If the strings are escaped (old-style?)
+  bool input_strings_escaped = true;
+
+  // Whether or not to invoke PrepareOptions after configure is called.
+  bool invoke_prepare_options = true;
+
+  // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not.
+  // When "mutable_options_only=false", all options are evaluated.
+ // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). + bool mutable_options_only = false; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // Controls how to traverse options during print/match stages + Depth depth = Depth::kDepthDefault; + + // Controls how options are serialized + // Controls how pedantic the comparison must be for equivalency + SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch; + // `file_readahead_size` is used for readahead for the option file. + size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); + +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr<ObjectRegistry> registry; +#endif + + bool IsShallow() const { return depth == Depth::kDepthShallow; } + bool IsDetailed() const { return depth == Depth::kDepthDetailed; } + + bool IsCheckDisabled() const { + return sanity_level == SanityLevel::kSanityLevelNone; + } + + bool IsCheckEnabled(SanityLevel level) const { + return (level > SanityLevel::kSanityLevelNone && level <= sanity_level); + } +}; + +#ifndef ROCKSDB_LITE + +// The following set of functions provide a way to construct RocksDB Options +// from a string or a string-to-string map. Here is the general rule of +// setting option values from strings by type. Some RocksDB types are also +// supported in these APIs. Please refer to the comment of the function itself +// to find more information about how to config those RocksDB types. +// +// * Strings: +// Strings will be used as values directly without any truncating or +// trimming. +// +// * Booleans: +// - "true" or "1" => true +// - "false" or "0" => false. +// [Example]: +// - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or +// - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString. +// +// * Integers: +// Integers are converted directly from string, in addition to the following +// units that we support: +// - 'k' or 'K' => 2^10 +// - 'm' or 'M' => 2^20 +// - 'g' or 'G' => 2^30 +// - 't' or 'T' => 2^40 // only for unsigned int with sufficient bits. +// [Example]: +// - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or +// - "arena_block_size=19G" in GetColumnFamilyOptionsFromString. +// +// * Doubles / Floating Points: +// Doubles / Floating Points are converted directly from string. Note that +// currently we do not support units. +// [Example]: +// - {"memtable_prefix_bloom_size_ratio", "0.1"} in +// GetColumnFamilyOptionsFromMap, or +// - "memtable_prefix_bloom_size_ratio=0.1" in +// GetColumnFamilyOptionsFromString. +// * Array / Vectors: +// An array is specified by a list of values, where ':' is used as +// the delimiter to separate each value. +// [Example]: +// - {"compression_per_level", "kNoCompression:kSnappyCompression"} +// in GetColumnFamilyOptionsFromMap, or +// - "compression_per_level=kNoCompression:kSnappyCompression" in +// GetColumnFamilyOptionsFromMapString +// * Enums: +// The valid values of each enum are identical to the names of its constants. +// [Example]: +// - CompressionType: valid values are "kNoCompression", +// "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ... 
+//   - CompactionStyle: valid values are "kCompactionStyleLevel",
+//     "kCompactionStyleUniversal", "kCompactionStyleFIFO", and
+//     "kCompactionStyleNone".
+//
+
+// Take a default ColumnFamilyOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// ColumnFamilyOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in ColumnFamilyOptions:
+//
+// * table_factory:
+//   table_factory can be configured using our custom nested-option syntax.
+//
+//   {option_a=value_a; option_b=value_b; option_c=value_c; ... }
+//
+//   A nested option is enclosed by two curly braces, within which there are
+//   multiple option assignments. Each assignment is of the form
+//   "variable_name=value;".
+//
+//   Currently we support the following types of TableFactory:
+//   - BlockBasedTableFactory:
+//     Use name "block_based_table_factory" to initialize table_factory with
+//     BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be
+//     configured using the nested-option syntax.
+//     [Example]:
+//     * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"}
+//       is equivalent to assigning table_factory with a BlockBasedTableFactory
+//       that has a 1M LRU block-cache with a block size equal to 4k:
+//         ColumnFamilyOptions cf_opt;
+//         BlockBasedTableOptions blk_opt;
+//         blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024);
+//         blk_opt.block_size = 4 * 1024;
+//         cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt));
+//   - PlainTableFactory:
+//     Use name "plain_table_factory" to initialize table_factory with
+//     PlainTableFactory. Its PlainTableFactoryOptions can be configured using
+//     the nested-option syntax.
+//     [Example]:
+//     * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"}
+//
+// * memtable_factory:
+//   Use "memtable" to config memtable_factory. Here are the supported
+//   memtable factories:
+//   - SkipList:
+//     Pass "skip_list:<lookahead>" to config memtable to use SkipList,
+//     or simply "skip_list" to use the default SkipList.
+//     [Example]:
+//     * {"memtable", "skip_list:5"} is equivalent to setting
+//       memtable to SkipListFactory(5).
+//   - PrefixHash:
+//     Pass "prefix_hash:<hash_bucket_count>" to config memtable
+//     to use PrefixHash, or simply "prefix_hash" to use the default
+//     PrefixHash.
+//     [Example]:
+//     * {"memtable", "prefix_hash:1000"} is equivalent to setting
+//       memtable to NewHashSkipListRepFactory(hash_bucket_count).
+//   - HashLinkedList:
+//     Pass "hash_linkedlist:<hash_bucket_count>" to config memtable
+//     to use HashLinkedList, or simply "hash_linkedlist" to use the default
+//     HashLinkedList.
+//     [Example]:
+//     * {"memtable", "hash_linkedlist:1000"} is equivalent to
+//       setting memtable to NewHashLinkListRepFactory(1000).
+//   - VectorRepFactory:
+//     Pass "vector:<count>" to config memtable to use VectorRepFactory,
+//     or simply "vector" to use the default Vector memtable.
+//     [Example]:
+//     * {"memtable", "vector:1024"} is equivalent to setting memtable
+//       to VectorRepFactory(1024).
+//
+// * compression_opts:
+//   Use "compression_opts" to config compression_opts. The value format
+//   is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>".
+//   [Example]:
+//   * {"compression_opts", "4:5:6:7"} is equivalent to setting:
+//       ColumnFamilyOptions cf_opt;
+//       cf_opt.compression_opts.window_bits = 4;
+//       cf_opt.compression_opts.level = 5;
+//       cf_opt.compression_opts.strategy = 6;
+//       cf_opt.compression_opts.max_dict_bytes = 7;
+//
+// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+//     should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     values for this option
+// @return Status::InvalidArgument means one of the option values is not
+//     valid for this option.
+Status GetColumnFamilyOptionsFromMap(
+    const ConfigOptions& config_options,
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a default DBOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// DBOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in DBOptions:
+//
+// * rate_limiter_bytes_per_sec:
+//   RateLimiter can be configured directly by specifying its bytes_per_sec.
+//   [Example]:
+//   - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to
+//     passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec.
+//
+// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+//     should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means one (or more) of the option names in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     values for this option
+// @return Status::InvalidArgument means one of the option values is not
+//     valid for this option.
+Status GetDBOptionsFromMap(
+    const ConfigOptions& cfg_options, const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options);
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a default BlockBasedTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// BlockBasedTableOptions "new_table_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+//   We currently only support the following FilterPolicy in the convenience
+//   functions:
+//   - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+//     to specify BloomFilter. The above string is equivalent to calling
+//     NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+//     [Example]:
+//     - Pass {"filter_policy", "bloomfilter:4:true"} in
+//       GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+//       per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+//   We currently only support LRU cache in the GetOptions API. The LRU
+//   cache can be set by directly specifying its size.
+//   [Example]:
+//   - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+//     equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.
+// Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetBlockBasedTableOptionsFromMap(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options,
+    bool input_strings_escaped = false, bool ignore_unknown_options = false);
+
+// Take a default PlainTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// PlainTableOptions "new_table_options".
+//
+// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetPlainTableOptionsFromMap(
+    const ConfigOptions& config_options, const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromMap(
+    const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a string representation of option names and values, apply them into the
+// base_options, and return the new options as a result. The string has the
+// following format:
+//   "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+//   "max_write_buffer_number=2"
+//
+// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
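To make the nested-option string format concrete, here is a small usage sketch; the option values chosen are arbitrary, and the `kZSTD` setting assumes a ZSTD-enabled build:

#include <cassert>
#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

void ConfigureFromStringExample() {
  ROCKSDB_NAMESPACE::ConfigOptions config_options;
  config_options.ignore_unknown_options = false;  // fail fast on typos

  ROCKSDB_NAMESPACE::ColumnFamilyOptions base, configured;
  // Plain options plus a nested block-based table factory configuration.
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString(
          config_options, base,
          "write_buffer_size=64M;compression=kZSTD;"
          "block_based_table_factory={block_size=16k;"
          "cache_index_and_filter_blocks=true}",
          &configured);
  assert(s.ok());
}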
+Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+                              const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options);
+
+Status GetDBOptionsFromString(const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options);
+
+Status GetStringFromDBOptions(const ConfigOptions& config_options,
+                              const DBOptions& db_options,
+                              std::string* opts_str);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+                              const DBOptions& db_options,
+                              const std::string& delimiter = "; ");
+
+Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& cf_options,
+                                        std::string* opts_str);
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+                                        const ColumnFamilyOptions& cf_options,
+                                        const std::string& delimiter = "; ");
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type);
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromString(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+                                      const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options);
+
+Status GetMemTableRepFactoryFromString(
+    const std::string& opts_str,
+    std::unique_ptr<MemTableRepFactory>* new_mem_factory);
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+Status GetOptionsFromString(const ConfigOptions& config_options,
+                            const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map);
+
+// Request stopping background work; if wait is true, wait until it's done.
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+
+// Delete files which are entirely in the given range.
+// Could leave some keys in the range which are in files which are not
+// entirely in the range. Also leaves L0 files regardless of whether they're
+// in the range.
+// Snapshots before the delete might not see the data in the given range.
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end = true);
+
+// Delete files in multiple ranges at once.
+// Deleting files in a lot of ranges one at a time can be slow; use this API
+// for better performance in that case.
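For instance, dropping two obsolete key ranges in one call might look like the following sketch; the key bounds are illustrative, and `DeleteFilesInRanges()` itself is declared just below:

#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

void DropObsoleteRanges(ROCKSDB_NAMESPACE::DB* db) {
  using ROCKSDB_NAMESPACE::RangePtr;
  using ROCKSDB_NAMESPACE::Slice;

  // A null bound would mean "unbounded" on that side of a range.
  Slice a_begin("logs-2019-"), a_end("logs-2020-");
  Slice b_begin("tmp-"), b_end("tmp.");
  RangePtr ranges[] = {RangePtr(&a_begin, &a_end), RangePtr(&b_begin, &b_end)};

  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DeleteFilesInRanges(
      db, db->DefaultColumnFamily(), ranges, 2, /*include_end=*/false);
  assert(s.ok());  // L0 files and partially-overlapping files may remain
}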
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n,
+                           bool include_end = true);
+
+// Verify the checksum of an SST file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path);
+
+// Verify the checksum of an SST file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path,
+                             const SequenceNumber& largest_seqno = 0);
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/customizable.h b/src/rocksdb/include/rocksdb/customizable.h
new file mode 100644
index 000000000..92f7504ae
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/customizable.h
@@ -0,0 +1,233 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * Customizable is a base class used by RocksDB that describes a
+ * standard way of configuring and creating objects. Customizable objects
+ * are configurable objects that can be created from an ObjectRegistry.
+ *
+ * Customizable classes are used when there are multiple potential
+ * implementations of a class for use by RocksDB (e.g. Table, Cache,
+ * MergeOperator, etc). The abstract base class is expected to define a method
+ * declaring its type and a factory method for creating one of these, such as:
+ *   static const char *Type() { return "Table"; }
+ *   static Status CreateFromString(const ConfigOptions& options,
+ *                                  const std::string& id,
+ *                                  std::shared_ptr<TableFactory>* result);
+ * The "Type" string is expected to be unique (no two base classes are the
+ * same type). This factory is expected, based on the options and id, to
+ * create and return the appropriate derived type of the customizable class
+ * (e.g. BlockBasedTableFactory, PlainTableFactory, etc). For extension
+ * developers, helper classes and methods are provided for writing this
+ * factory.
+ *
+ * Instances of a Customizable class need to define:
+ * - A "static const char *kClassName()" method. This method defines the name
+ *   of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the
+ *   CheckedCast method.
+ * - The Name() of the object. This name is used when creating and saving
+ *   instances of this class. Typically this name will be the same as
+ *   kClassName().
+ *
+ * Additionally, Customizable classes should register any options used to
+ * configure themselves with the Configurable subsystem.
+ *
+ * When a Customizable is being created, the "name" property specifies
+ * the name of the instance being created.
+ * For custom objects, their configuration and name can be specified by:
+ *   [prop]={name=X;option1=value1[; option2=value2...]}
+ *
+ *   [prop].name=X
+ *   [prop].option1 = value1
+ *
+ *   [prop].name=X
+ *   X.option1=value1
+ */
+class Customizable : public Configurable {
+ public:
+  ~Customizable() override {}
+
+  // Returns the name of this class of Customizable
+  virtual const char* Name() const = 0;
+
+  // Returns an identifier for this Customizable.
+  // This could be its name or something more complex (like its URL/pattern).
+  // Used for pretty printing.
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
+  // This is typically determined by whether the input name matches the
+  // name of this object.
+  // This method is typically used in conjunction with CheckedCast to find the
+  // derived class instance from its base. For example, if you have an Env
+  // and want the "Default" env, you would call IsInstanceOf("Default") to get
+  // the default implementation. This method should be used when you need a
+  // specific derivative or implementation of a class.
+  //
+  // Intermediary caches (such as SharedCache) may wish to override this method
+  // to check for the intermediary name (SharedCache). Classes with multiple
+  // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override
+  // this method.
+  //
+  // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a".
+  // Wrapped classes that have an Inner "has-a" should not be returned.
+  //
+  // @param name The name of the instance to find.
+  // Returns true if the class is an instance of the input name.
+  virtual bool IsInstanceOf(const std::string& name) const {
+    if (name.empty()) {
+      return false;
+    } else if (name == Name()) {
+      return true;
+    } else {
+      const char* nickname = NickName();
+      if (nickname != nullptr && name == nickname) {
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    const void* ptr = Configurable::GetOptionsPtr(name);
+    if (ptr != nullptr) {
+      return ptr;
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->GetOptionsPtr(name);
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Returns the named instance of the Customizable as a T*, or nullptr if not
+  // found. This method uses IsInstanceOf/Inner to find the appropriate class
+  // instance and then casts it to the expected return type.
+  template <typename T>
+  const T* CheckedCast() const {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<const T*>(this);
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  template <typename T>
+  T* CheckedCast() {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<T*>(this);
+    } else {
+      auto inner = const_cast<Customizable*>(Inner());
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Checks to see if this Customizable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //     the name of the option that triggered the match failure.
+  // @return True if the objects match, false otherwise.
+  // @see Configurable::AreEquivalent for more details
+  bool AreEquivalent(const ConfigOptions& config_options,
+                     const Configurable* other,
+                     std::string* mismatch) const override;
+#ifndef ROCKSDB_LITE
+  // Gets the value of the option associated with the input name
+  // @see Configurable::GetOption for more details
+  Status GetOption(const ConfigOptions& config_options, const std::string& name,
+                   std::string* value) const override;
+#endif // ROCKSDB_LITE
+  // Helper method for parsing the opt_value into the corresponding options
+  // for use in potentially creating a new Customizable object (this method is
+  // primarily a support method for LoadSharedObject et al for new
+  // Customizable objects). The opt_value may be either name-value pairs
+  // separated by ";" (a=b; c=d), or a simple name (a). In order to create a
+  // new Customizable, the ID is determined by:
+  // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this
+  //   name;
+  // - Otherwise, if there is a "id=value", the id is set to "value"
+  // - Otherwise, if the input customizable is not null, custom->GetId is used
+  // - Otherwise, an error is returned.
+  //
+  // If the opt_value is name-value pairs, these pairs will be returned in
+  // options (without the id pair). If the ID being returned matches the ID of
+  // the input custom object, then the options from the input object will also
+  // be added to the returned options.
+  //
+  // This method returns non-OK if the ID could not be found, or if the
+  // opt_value could not be parsed into name-value pairs.
+  static Status GetOptionsMap(
+      const ConfigOptions& config_options, const Customizable* custom,
+      const std::string& opt_value, std::string* id,
+      std::unordered_map<std::string, std::string>* options);
+
+  // Helper method to configure a new object with the supplied options.
+  // If the object is not null and invoke_prepare_options=true, the object
+  // will be configured and prepared.
+  // Returns success if the object is properly configured and (optionally)
+  // prepared.
+  // Returns InvalidArgument if the object is nullptr and there are options in
+  // the map.
+  // Otherwise, returns the result of the ConfigureFromMap or PrepareOptions
+  // call.
+  static Status ConfigureNewObject(
+      const ConfigOptions& config_options, Customizable* object,
+      const std::unordered_map<std::string, std::string>& options);
+
+  // Returns the inner class when a Customizable implements a has-a (wrapped)
+  // relationship. Derived classes that implement a has-a must override this
+  // method in order for CheckedCast to function properly.
+  virtual const Customizable* Inner() const { return nullptr; }
+
+ protected:
+  // Generates an ID specific to this instance of the customizable.
+  // The unique ID is of the form <name>:<addr>#pid, where:
+  // - name is the Name() of this object;
+  // - addr is the memory address of this object;
+  // - pid is the ID of this process.
+  // Note that if obj1 and obj2 have the same unique IDs, they must be the
+  // same object. However, if an object is deleted and recreated, it may have
+  // the same unique ID as a predecessor.
+  //
+  // This method is useful for objects (especially ManagedObjects) that
+  // wish to generate an ID that is specific for this instance and wish to
+  // override the GetId() method.
+  std::string GenerateIndividualId() const;
+
+  // Some classes have both a class name (e.g. PutOperator) and a nickname
+  // (e.g. put). Classes can override this method to return a nickname.
+  // Nicknames can be used by IsInstanceOf and during object creation.
+  virtual const char* NickName() const { return ""; }
+  // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+  std::string GetOptionName(const std::string& long_name) const override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& options,
+                               const std::string& prefix) const override;
+#endif // ROCKSDB_LITE
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/data_structure.h b/src/rocksdb/include/rocksdb/data_structure.h
new file mode 100644
index 000000000..f868a6be5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/data_structure.h
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is a data structure specifically designed as a "Set" for a
+// pretty small scale of Enum structure. For now, it can support up
+// to 64 elements, and it is expandable in the future.
+template <typename ENUM_TYPE, ENUM_TYPE MAX_VALUE>
+class SmallEnumSet {
+ public:
+  SmallEnumSet() : state_(0) {}
+
+  ~SmallEnumSet() {}
+
+  // Adds the input enum to the "Set". Returns true if the value was not
+  // already present (i.e., the internal scalar state changed); otherwise,
+  // returns false.
+  bool Add(const ENUM_TYPE value) {
+    static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+    assert(value >= 0 && value <= MAX_VALUE);
+    uint64_t old_state = state_;
+    uint64_t tmp = 1;
+    state_ |= (tmp << value);
+    return old_state != state_;
+  }
+
+  // Return true if the input enum is contained in the "Set".
+  bool Contains(const ENUM_TYPE value) {
+    static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+    assert(value >= 0 && value <= MAX_VALUE);
+    uint64_t tmp = 1;
+    return state_ & (tmp << value);
+  }
+
+ private:
+  uint64_t state_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
new file mode 100644
index 000000000..26c07c19f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -0,0 +1,1859 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/block_cache_trace_writer.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+#include "rocksdb/wide_columns.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ColumnFamilyOptions;
+struct CompactionOptions;
+struct CompactRangeOptions;
+struct DBOptions;
+struct ExternalSstFileInfo;
+struct FlushOptions;
+struct Options;
+struct ReadOptions;
+struct TableProperties;
+struct WriteOptions;
+#ifdef ROCKSDB_LITE
+class CompactionJobInfo;
+#endif
+class Env;
+class EventListener;
+class FileSystem;
+#ifndef ROCKSDB_LITE
+class Replayer;
+#endif
+class StatsHistoryIterator;
+#ifndef ROCKSDB_LITE
+class TraceReader;
+class TraceWriter;
+#endif
+class WriteBatch;
+
+extern const std::string kDefaultColumnFamilyName;
+extern const std::string kPersistentStatsColumnFamilyName;
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+  // Returns the name of the column family associated with the current handle.
+  virtual const std::string& GetName() const = 0;
+  // Returns the ID of the column family associated with the current handle.
+  virtual uint32_t GetID() const = 0;
+  // Fills "*desc" with the up-to-date descriptor of the column family
+  // associated with this handle. Since it fills "*desc" with the up-to-date
+  // information, this call might internally lock and release DB mutex to
+  // access the up-to-date CF options. In addition, all the pointer-typed
+  // options cannot be referenced beyond the lifetime of the original options.
+  //
+  // Note that this function is not supported in RocksDBLite.
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
+  // Returns the comparator of the column family associated with the
+  // current handle.
+  virtual const Comparator* GetComparator() const = 0;
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+// A range of keys
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+struct RangePtr {
+  const Slice* start;
+  const Slice* limit;
+
+  RangePtr() : start(nullptr), limit(nullptr) {}
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order should
+// be the same in the three vectors and is guaranteed by the caller.
+// Note that we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family = nullptr;
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  Temperature file_temperature = Temperature::kUnknown;
+};
+
+struct GetMergeOperandsOptions {
+  int expected_max_number_of_operands = 0;
+};
+
+// A collection of table properties objects, where
+//   key: is the table's file name.
+//   value: the table properties object of the given table.
+using TablePropertiesCollection =
+    std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+// A DB is a persistent, versioned ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+// DB is an abstract base class with one primary implementation (DBImpl)
+// and a number of wrapper implementations.
+class DB {
+ public:
+  // Open the database with the specified "name" for reads and writes.
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error, including
+  // if the DB is already open (read-write) by another DB object. (This
+  // guarantee depends on options.env->LockFile(), which might not provide
+  // this guarantee in a custom Env implementation.)
+  //
+  // Caller must delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options, const std::string& name,
+                     DB** dbptr);
+
+  // Open DB with column families.
+  // db_options specifies database-specific options.
+  // column_families is the vector of all column families in the database,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies().
+  //
+  // The default column family name is 'default' and it's stored
+  // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_families[i].
+  // Before deleting the DB, you have to close all column families by calling
+  // DestroyColumnFamilyHandle() with all the handles.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // OpenForReadOnly() creates a read-only instance that supports reads alone.
+  //
+  // All DB interfaces that modify data, like put/delete, will return an
+  // error. Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return an error.
+  //
+  // While a given DB can be simultaneously opened via OpenForReadOnly
+  // by any number of readers, if a DB is simultaneously opened by Open
+  // and OpenForReadOnly, the read-only instance has undefined behavior
+  // (though can often succeed if quickly closed) and the read-write
+  // instance is unaffected. See also OpenAsSecondary.
+
+  // Open the database for read only.
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
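+  //
+  // A minimal usage sketch (the database path is hypothetical):
+  //
+  //   DB* db = nullptr;
+  //   Options options;
+  //   Status s = DB::OpenForReadOnly(options, "/tmp/testdb", &db);
+  //   // ... read via db->Get(...) ...
+  //   delete db;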
+  static Status OpenForReadOnly(const Options& options, const std::string& name,
+                                DB** dbptr,
+                                bool error_if_wal_file_exists = false);
+
+  // Open the database for read only with column families.
+  //
+  // When opening DB with read only, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify the default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_wal_file_exists = false);
+
+  // OpenAsSecondary() creates a secondary instance that supports read-only
+  // operations and supports dynamic catch-up with the primary (through a
+  // call to TryCatchUpWithPrimary()).
+  //
+  // All DB interfaces that modify data, like put/delete, will return an
+  // error. Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return an error.
+  //
+  // Multiple secondary instances can co-exist at the same time.
+  //
+
+  // Open DB as secondary instance
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(const Options& options, const std::string& name,
+                                const std::string& secondary_path, DB** dbptr);
+
+  // Open DB as secondary instance with specified column families
+  //
+  // When opening DB in secondary mode, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify the default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+  //
+  // Column families created by the primary after the secondary instance starts
+  // are currently ignored by the secondary instance. Column families opened
+  // by the secondary and dropped by the primary will be dropped by the
+  // secondary as well (on the next invocation of TryCatchUpWithPrimary()).
+  // However, the user of the secondary instance can still access the data of
+  // such a dropped column family as long as they do not destroy the
+  // corresponding column family handle.
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The column_families argument specifies a list of column families to open.
+  // If the default column family is not specified or if any specified column
+  // family does not exist, the function returns a non-OK status.
+  // The handles argument is an out-arg corresponding to the opened database
+  // column family handles.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use. Before deleting the dbptr, the user should also
+  // delete the pointers stored in the handles vector.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(
+      const DBOptions& db_options, const std::string& name,
+      const std::string& secondary_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // Open DB and run the compaction.
+  // It's a read-only operation; the result won't be installed to the DB, but
+  // will be output to the `output_directory`. The API should only be used with
+  // `options.CompactionService` to run compaction triggered by
+  // `CompactionService`.
+  static Status OpenAndCompact(
+      const std::string& name, const std::string& output_directory,
+      const std::string& input, std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  static Status OpenAndCompact(
+      const OpenAndCompactOptions& options, const std::string& name,
+      const std::string& output_directory, const std::string& input,
+      std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  // Experimental and subject to change
+  // Open DB and trim data newer than the specified timestamp.
+  // The trim_ts argument specifies the user-defined timestamp trim bound.
+  // This API should only be used for recovery of timestamp-enabled column
+  // families. If some input column families do not support timestamp, nothing
+  // will happen to them. The data with timestamp > trim_ts
+  // will be removed after this API returns successfully.
+  static Status OpenAndTrimHistory(
+      const DBOptions& db_options, const std::string& dbname,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      std::string trim_ts);
+
+  virtual Status Resume() { return Status::NotSupported(); }
+
+  // Close the DB by releasing resources, closing files etc. This should be
+  // called before calling the destructor so that the caller can get back a
+  // status in case there are any errors. This will not fsync the WAL files.
+  // If syncing is required, the caller must first call SyncWAL(), or Write()
+  // using an empty write batch with WriteOptions.sync=true.
+  // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there are
+  // unreleased snapshots in the system. In this case, users can release
+  // the unreleased snapshots and call Close() again, expecting it to succeed.
+  // For other statuses, re-calling Close() will be a no-op and return the
+  // original close status. If the return status is NotSupported(), then the
+  // DB implementation does cleanup in the destructor.
+  virtual Status Close() { return Status::NotSupported(); }
+
+  // ListColumnFamilies will open the DB specified by the argument name
+  // and return the list of all column families in that DB
+  // through the column_families argument. The ordering of
+  // column families in column_families is unspecified.
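+  //
+  // A minimal usage sketch (the database path is hypothetical):
+  //
+  //   std::vector<std::string> families;
+  //   Status s =
+  //       DB::ListColumnFamilies(DBOptions(), "/tmp/testdb", &families);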
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  // Abstract class ctor
+  DB() {}
+  // No copying allowed
+  DB(const DB&) = delete;
+  void operator=(const DB&) = delete;
+
+  virtual ~DB();
+
+  // Create a column family and return its handle through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Bulk create column families with the same column family options.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain the column family handles that were successfully created, with
+  // size equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Bulk create column families.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain the column family handles that were successfully created, with
+  // size equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Drop a column family specified by the column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Bulk drop column families. This call only records drop records in the
+  // manifest and prevents the column families from flushing and compacting.
+  // In case of error, the request may succeed partially. The user may call
+  // ListColumnFamilies to check the result.
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families);
+
+  // Release and deallocate a column family handle. A column family is only
+  // removed once it is dropped (DropColumnFamily) and all handles have been
+  // destroyed (DestroyColumnFamilyHandle). Use this method to destroy
+  // column family handles (except for DefaultColumnFamily()!) before closing
+  // a DB.
+  virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // If "key" already exists, it will be overwritten.
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& ts, const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, ts, value);
+  }
+
+  // Set the database entry for "key" in the column family specified by
+  // "column_family" to the wide-column entity defined by "columns". If the
+  // key already exists in the column family, it will be overwritten.
+  //
+  // Returns OK on success, and a non-OK status on error.
+  virtual Status PutEntity(const WriteOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           const WideColumns& columns);
+
+  // Remove the database entry (if any) for "key". Returns OK on
+  // success, and a non-OK status on error. It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status Delete(const WriteOptions& options, const Slice& key,
+                        const Slice& ts) {
+    return Delete(options, DefaultColumnFamily(), key, ts);
+  }
+
+  // Remove the database entry for "key". Requires that the key exists
+  // and was not overwritten. Returns OK on success, and a non-OK status
+  // on error. It is not an error if "key" did not exist in the database.
+  //
+  // If a key is overwritten (by calling Put() multiple times), then the result
+  // of calling SingleDelete() on this key is undefined. SingleDelete() only
+  // behaves correctly if there has been only one Put() for this key since the
+  // previous call to SingleDelete() for this key.
+  //
+  // This feature is currently an experimental performance optimization
+  // for a very specific workload. It is up to the caller to ensure that
+  // SingleDelete is only used for a key that is not deleted using Delete() or
+  // written using Merge(). Mixing SingleDelete operations with Deletes and
+  // Merges can result in undefined behavior.
+  //
+  // Note: consider setting options.sync = true.
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& ts) = 0;
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+    return SingleDelete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key,
+                              const Slice& ts) {
+    return SingleDelete(options, DefaultColumnFamily(), key, ts);
+  }
+
+  // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
+  // including "begin_key" and excluding "end_key". Returns OK on success, and
+  // a non-OK status on error. It is not an error if the database does not
+  // contain any existing data in the range ["begin_key", "end_key").
+  //
+  // If "end_key" comes before "begin_key" according to the user's comparator,
+  // a `Status::InvalidArgument` is returned.
+  //
+  // This feature is now usable in production, with the following caveats:
+  // 1) Accumulating too many range tombstones in the memtable will degrade
+  //    read performance; this can be avoided by manually flushing
+  //    occasionally.
+  // 2) Limiting the maximum number of open files in the presence of range
+  //    tombstones can degrade read performance. To avoid this problem, set
+  //    max_open_files to -1 whenever possible.
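+  //
+  // A minimal usage sketch (the key bounds are hypothetical):
+  //
+  //   Status s = db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(),
+  //                              Slice("begin"), Slice("end"));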
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key);
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key,
+                             const Slice& ts);
+
+  // Merge the database entry for "key" with "value". Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation are
+  // determined by the user-provided merge_operator when opening the DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*ts*/,
+                       const Slice& /*value*/);
+
+  // Apply the specified updates to the database.
+  // If `updates` contains no update, the WAL will still be synced if
+  // options.sync=true.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return the corresponding value in "*value". If the entry is a plain
+  // key-value, return the value as-is; if it is a wide-column entity, return
+  // the value of its default anonymous column (see kDefaultWideColumnName) if
+  // any, or an empty value otherwise.
+  //
+  // If timestamp is enabled and a non-null timestamp pointer is passed in,
+  // the timestamp is returned.
+  //
+  // Returns OK on success. Returns NotFound and an empty value in "*value" if
+  // there is no entry for "key". Returns some other non-OK status on error.
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Get() methods that return a timestamp. Derived DB classes don't need to
+  // worry about this group of methods if they don't care about the timestamp
+  // feature.
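+  //
+  // A minimal usage sketch (assumes the column family was created with
+  // user-defined timestamps enabled):
+  //
+  //   std::string value, ts;
+  //   Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(),
+  //                      Slice("key"), &value, &ts);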
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value, std::string* timestamp) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val, timestamp);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, PinnableSlice* /*value*/,
+                     std::string* /*timestamp*/) {
+    return Status::NotSupported(
+        "Get() that returns timestamp is not implemented.");
+  }
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value, std::string* timestamp) {
+    return Get(options, DefaultColumnFamily(), key, value, timestamp);
+  }
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return it as a wide-column entity in "*columns". If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value.
+  //
+  // Returns OK on success. Returns NotFound and an empty wide-column entity in
+  // "*columns" if there is no entry for "key". Returns some other non-OK
+  // status on error.
+  virtual Status GetEntity(const ReadOptions& /* options */,
+                           ColumnFamilyHandle* /* column_family */,
+                           const Slice& /* key */,
+                           PinnableWideColumns* /* columns */) {
+    return Status::NotSupported("GetEntity not supported");
+  }
+
+  // Populates the `merge_operands` array with all the merge operands in the DB
+  // for `key`. The `merge_operands` array will be populated in the order of
+  // insertion. The number of entries populated in `merge_operands` will be
+  // assigned to `*number_of_operands`.
+  //
+  // If the number of merge operands in the DB for `key` is greater than
+  // `merge_operands_options.expected_max_number_of_operands`,
+  // `merge_operands` is not populated and the return value is
+  // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned
+  // the number of merge operands found in the DB for `key`.
+  //
+  // `merge_operands` - Points to an array of at least
+  // merge_operands_options.expected_max_number_of_operands elements; the
+  // caller is responsible for allocating it.
+  //
+  // The caller should delete or `Reset()` the `merge_operands` entries when
+  // they are no longer needed. All `merge_operands` entries must be destroyed
+  // or `Reset()` before this DB is closed or destroyed.
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* merge_operands,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) = 0;
+
+  // Consistent Get of many keys across column families without the need
+  // for an explicit snapshot. NOTE: the implementation of this MultiGet API
+  // does not have the performance benefits of the void-returning MultiGet
+  // functions.
+  //
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values);
+  }
+
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+      const std::vector<Slice>& keys, std::vector<std::string>* /*values*/,
+      std::vector<std::string>* /*timestamps*/) {
+    return std::vector<Status>(
+        keys.size(), Status::NotSupported(
+                         "MultiGet() returning timestamps not implemented."));
+  }
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values,
+                                       std::vector<std::string>* timestamps) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values, timestamps);
+  }
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block-based
+  // table format with full filters is supported. Other table formats such
+  // as plain table, block-based table with block-based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+  //                 passed to the API are restricted to a single column family
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified.
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block-based
+  // table format with full filters is supported. Other table formats such
+  // as plain table, block-based table with block-based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_families - Pointer to C style array of ColumnFamilyHandle* with
+  //                   num_keys elements; keys[i] belongs to column_families[i]
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified.
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false; otherwise it returns true. If the caller wants to obtain
+  // the value when the key is found in memory, a bool for 'value_found' must
+  // be passed. 'value_found' will be true on return if the value has been set
+  // properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // The default implementation here returns true and sets 'value_found' to
+  // false.
+  virtual bool KeyMayExist(const ReadOptions& /*options*/,
+                           ColumnFamilyHandle* /*column_family*/,
+                           const Slice& /*key*/, std::string* /*value*/,
+                           std::string* /*timestamp*/,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, column_family, key, value,
+                       /*timestamp=*/nullptr, value_found);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, std::string* timestamp,
+                           bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp,
+                       value_found);
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
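+  //
+  // A minimal usage sketch:
+  //
+  //   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
+  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  //     // examine it->key() and it->value()
+  //   }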
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    return NewIterator(options, DefaultColumnFamily());
+  }
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted.
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state. Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state. The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshots (e.g., inplace_update_support enabled).
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot. The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+#ifndef ROCKSDB_LITE
+  // Contains all valid property arguments for GetProperty() or
+  // GetMapProperty(). Each is a "string" property for retrieval with
+  // GetProperty() unless noted as a "map" property, for GetMapProperty().
+  //
+  // NOTE: Property names cannot end in numbers since those are interpreted as
+  // arguments, e.g., see kNumFilesAtLevelPrefix.
+  struct Properties {
+    // "rocksdb.num-files-at-level<N>" - returns string containing the number
+    //      of files at level <N>, where <N> is an ASCII representation of a
+    //      level number (e.g., "0").
+    static const std::string kNumFilesAtLevelPrefix;
+
+    // "rocksdb.compression-ratio-at-level<N>" - returns string containing the
+    //      compression ratio of data at level <N>, where <N> is an ASCII
+    //      representation of a level number (e.g., "0"). Here, compression
+    //      ratio is defined as uncompressed data size / compressed file size.
+    //      Returns "-1.0" if no open files at level <N>.
+    static const std::string kCompressionRatioAtLevelPrefix;
+
+    // "rocksdb.stats" - returns a multi-line string containing the data
+    //      described by kCFStats followed by the data described by kDBStats.
+    static const std::string kStats;
+
+    // "rocksdb.sstables" - returns a multi-line string summarizing current
+    //      SST files.
+    static const std::string kSSTables;
+
+    // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram"
+    //      and "rocksdb.cf-file-histogram" as a "map" property.
+    static const std::string kCFStats;
+
+    // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
+    //      general column family stats per-level over db's lifetime ("L<n>"),
+    //      aggregated over db's lifetime ("Sum"), and aggregated over the
+    //      interval since the last retrieval ("Int").
+    static const std::string kCFStatsNoFileHistogram;
+
+    // "rocksdb.cf-file-histogram" - reports the number of file reads at each
+    //      level, as well as the latency histogram of single requests.
+    static const std::string kCFFileHistogram;
+
+    // "rocksdb.dbstats" - As a string property, returns a multi-line string
+    //      with general database stats, both cumulative (over the db's
+    //      lifetime) and interval (since the last retrieval of kDBStats).
+    //      As a map property, returns cumulative stats only and does not
+    //      update the baseline for the interval stats.
+    static const std::string kDBStats;
+
+    // "rocksdb.levelstats" - returns multi-line string containing the number
+    //      of files per level and total size of each level (MB).
+    static const std::string kLevelStats;
+
+    // "rocksdb.block-cache-entry-stats" - returns a multi-line string or
+    //      map with statistics on block cache usage. See
+    //      `BlockCacheEntryStatsMapKeys` for a structured representation of
+    //      the keys available in the map form.
+    static const std::string kBlockCacheEntryStats;
+
+    // "rocksdb.fast-block-cache-entry-stats" - same as above, but returns
+    //      stale values more frequently to reduce overhead and latency.
+    static const std::string kFastBlockCacheEntryStats;
+
+    // "rocksdb.num-immutable-mem-table" - returns number of immutable
+    //      memtables that have not yet been flushed.
+    static const std::string kNumImmutableMemTable;
+
+    // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
+    //      memtables that have already been flushed.
+    static const std::string kNumImmutableMemTableFlushed;
+
+    // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
+    //      pending; otherwise, returns 0.
+    static const std::string kMemTableFlushPending;
+
+    // "rocksdb.num-running-flushes" - returns the number of currently running
+    //      flushes.
+    static const std::string kNumRunningFlushes;
+
+    // "rocksdb.compaction-pending" - returns 1 if at least one compaction is
+    //      pending; otherwise, returns 0.
+    static const std::string kCompactionPending;
+
+    // "rocksdb.num-running-compactions" - returns the number of currently
+    //      running compactions.
+    static const std::string kNumRunningCompactions;
+
+    // "rocksdb.background-errors" - returns accumulated number of background
+    //      errors.
+    static const std::string kBackgroundErrors;
+
+    // "rocksdb.cur-size-active-mem-table" - returns approximate size of
+    //      active memtable (bytes).
+    static const std::string kCurSizeActiveMemTable;
+
+    // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
+    //      and unflushed immutable memtables (bytes).
+    static const std::string kCurSizeAllMemTables;
+
+    // "rocksdb.size-all-mem-tables" - returns approximate size of active,
+    //      unflushed immutable, and pinned immutable memtables (bytes).
+    static const std::string kSizeAllMemTables;
+
+    // "rocksdb.num-entries-active-mem-table" - returns total number of
+    //      entries in the active memtable.
+    static const std::string kNumEntriesActiveMemTable;
+
+    // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
+    //      in the unflushed immutable memtables.
+    static const std::string kNumEntriesImmMemTables;
+
+    // "rocksdb.num-deletes-active-mem-table" - returns total number of delete
+    //      entries in the active memtable.
+    static const std::string kNumDeletesActiveMemTable;
+
+    // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
+    //      entries in the unflushed immutable memtables.
+    static const std::string kNumDeletesImmMemTables;
+
+    // "rocksdb.estimate-num-keys" - returns estimated number of total keys in
+    //      the active and unflushed immutable memtables and storage.
+    static const std::string kEstimateNumKeys;
+
+    // "rocksdb.estimate-table-readers-mem" - returns estimated memory used
+    //      for reading SST tables, excluding memory used in block cache
+    //      (e.g., filter and index blocks).
+    static const std::string kEstimateTableReadersMem;
+
+    // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
+    //      files is enabled; otherwise, returns a non-zero number.
+    //      This name may be misleading because true (non-zero) means
+    //      disabled, but we keep the name for backward compatibility.
+    static const std::string kIsFileDeletionsEnabled;
+
+    // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
+    //      database.
+    static const std::string kNumSnapshots;
+
+    // "rocksdb.oldest-snapshot-time" - returns number representing unix
+    //      timestamp of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotTime;
+
+    // "rocksdb.oldest-snapshot-sequence" - returns number representing
+    //      sequence number of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotSequence;
+
+    // "rocksdb.num-live-versions" - returns number of live versions. `Version`
+    //      is an internal data structure. See version_set.h for details. More
+    //      live versions often mean more SST files are held from being deleted
+    //      by iterators or unfinished compactions.
+    static const std::string kNumLiveVersions;
+
+    // "rocksdb.current-super-version-number" - returns the number of the
+    //      current LSM version. It is a uint64_t integer, incremented after
+    //      any change to the LSM tree. The number is not preserved after
+    //      restarting the DB. After a DB restart, it will start from 0 again.
+    static const std::string kCurrentSuperVersionNumber;
+
+    // "rocksdb.estimate-live-data-size" - returns an estimate of the amount
+    //      of live data in bytes. For BlobDB, it also includes the exact
+    //      value of live bytes in the blob files of the version.
+    static const std::string kEstimateLiveDataSize;
+
+    // "rocksdb.min-log-number-to-keep" - return the minimum log number of the
+    //      log files that should be kept.
+    static const std::string kMinLogNumberToKeep;
+
+    // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
+    //      number for an obsolete SST to be kept. The max value of `uint64_t`
+    //      will be returned if all obsolete files can be deleted.
+    static const std::string kMinObsoleteSstNumberToKeep;
+
+    // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
+    //      files.
+    //      WARNING: may slow down online queries if there are too many files.
+    static const std::string kTotalSstFilesSize;
+
+    // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
+    //      files belonging to the latest LSM tree.
+    static const std::string kLiveSstFilesSize;
+
+    // "rocksdb.live_sst_files_size_at_temperature" - returns total size
+    //      (bytes) of the SST files at a given file temperature.
+    static const std::string kLiveSstFilesSizeAtTemperature;
+
+    // "rocksdb.base-level" - returns the level to which L0 data will be
+    //      compacted.
+    static const std::string kBaseLevel;
+
+    // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
+    //      number of bytes compaction needs to rewrite to get all levels down
+    //      to under target size. Only valid for level-based compaction.
+    static const std::string kEstimatePendingCompactionBytes;
+
+    // "rocksdb.aggregated-table-properties" - returns a string or map
+    //      representation of the aggregated table properties of the target
+    //      column family. Only properties that make sense for aggregation
+    //      are included.
+    static const std::string kAggregatedTableProperties;
+
+    // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
+    //      one but only returns the aggregated table properties of the
+    //      specified level "N" at the target column family.
+  static const std::string kAggregatedTablePropertiesAtLevel;
+
+  // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
+  // write rate. 0 means no delay.
+  static const std::string kActualDelayedWriteRate;
+
+  // "rocksdb.is-write-stopped" - returns 1 if writes have been stopped.
+  static const std::string kIsWriteStopped;
+
+  // "rocksdb.estimate-oldest-key-time" - returns an estimate of the oldest
+  // key timestamp in the DB. Currently only available for FIFO compaction
+  // with compaction_options_fifo.allow_compaction = false.
+  static const std::string kEstimateOldestKeyTime;
+
+  // "rocksdb.block-cache-capacity" - returns block cache capacity.
+  static const std::string kBlockCacheCapacity;
+
+  // "rocksdb.block-cache-usage" - returns the memory size for the entries
+  // residing in block cache.
+  static const std::string kBlockCacheUsage;
+
+  // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
+  // entries being pinned.
+  static const std::string kBlockCachePinnedUsage;
+
+  // "rocksdb.options-statistics" - returns multi-line string
+  // of options.statistics
+  static const std::string kOptionsStatistics;
+
+  // "rocksdb.num-blob-files" - returns number of blob files in the current
+  // version.
+  static const std::string kNumBlobFiles;
+
+  // "rocksdb.blob-stats" - returns the total number and size of all blob
+  // files, and total amount of garbage (bytes) in the blob files in
+  // the current version.
+  static const std::string kBlobStats;
+
+  // "rocksdb.total-blob-file-size" - returns the total size of all blob
+  // files over all versions.
+  static const std::string kTotalBlobFileSize;
+
+  // "rocksdb.live-blob-file-size" - returns the total size of all blob
+  // files in the current version.
+  static const std::string kLiveBlobFileSize;
+
+  // "rocksdb.live-blob-file-garbage-size" - returns the total amount of
+  // garbage in the blob files in the current version.
+  static const std::string kLiveBlobFileGarbageSize;
+
+  // "rocksdb.blob-cache-capacity" - returns blob cache capacity.
+  static const std::string kBlobCacheCapacity;
+
+  // "rocksdb.blob-cache-usage" - returns the memory size for the entries
+  // residing in blob cache.
+  static const std::string kBlobCacheUsage;
+
+  // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
+  // entries being pinned in blob cache.
+  static const std::string kBlobCachePinnedUsage;
+  };
+#endif /* ROCKSDB_LITE */
+
+  // DB implementations export properties about their state via this method.
+  // If "property" is a valid "string" property understood by this DB
+  // implementation (see Properties struct above for valid options), fills
+  // "*value" with its current value and returns true. Otherwise, returns
+  // false.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Like GetProperty but for valid "map" properties. (Some properties can be
+  // accessed as either "string" properties or "map" properties.)
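+  //
+  // EXAMPLE (an illustrative sketch, not part of this header; assumes `db`
+  // is an open DB*):
+  //   std::string stats;
+  //   if (db->GetProperty(DB::Properties::kDBStats, &stats)) { /* ... */ }
+  //   std::map<std::string, std::string> entry_stats;
+  //   if (db->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+  //                          &entry_stats)) { /* ... */ }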
+  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property,
+                              std::map<std::string, std::string>* value) = 0;
+  virtual bool GetMapProperty(const Slice& property,
+                              std::map<std::string, std::string>* value) {
+    return GetMapProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Similar to GetProperty(), but only works for a subset of properties
+  // whose return value is an integer. Returns the value as an integer.
+  // Supported properties:
+  //  "rocksdb.num-immutable-mem-table"
+  //  "rocksdb.mem-table-flush-pending"
+  //  "rocksdb.compaction-pending"
+  //  "rocksdb.background-errors"
+  //  "rocksdb.cur-size-active-mem-table"
+  //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys"
+  //  "rocksdb.estimate-table-readers-mem"
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions"
+  //  "rocksdb.current-super-version-number"
+  //  "rocksdb.estimate-live-data-size"
+  //  "rocksdb.min-log-number-to-keep"
+  //  "rocksdb.min-obsolete-sst-number-to-keep"
+  //  "rocksdb.total-sst-files-size"
+  //  "rocksdb.live-sst-files-size"
+  //  "rocksdb.base-level"
+  //  "rocksdb.estimate-pending-compaction-bytes"
+  //  "rocksdb.num-running-compactions"
+  //  "rocksdb.num-running-flushes"
+  //  "rocksdb.actual-delayed-write-rate"
+  //  "rocksdb.is-write-stopped"
+  //  "rocksdb.estimate-oldest-key-time"
+  //  "rocksdb.block-cache-capacity"
+  //  "rocksdb.block-cache-usage"
+  //  "rocksdb.block-cache-pinned-usage"
+  //
+  //  Properties dedicated for BlobDB:
+  //  "rocksdb.num-blob-files"
+  //  "rocksdb.total-blob-file-size"
+  //  "rocksdb.live-blob-file-size"
+  //  "rocksdb.blob-cache-capacity"
+  //  "rocksdb.blob-cache-usage"
+  //  "rocksdb.blob-cache-pinned-usage"
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) = 0;
+  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+    return GetIntProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Reset internal stats for DB and all column families.
+  // Note this doesn't reset options.statistics as it is not owned by
+  // DB.
+  virtual Status ResetStats() {
+    return Status::NotSupported("Not implemented");
+  }
+
+  // Same as GetIntProperty(), but this one returns the aggregated int
+  // property from all column families.
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* value) = 0;
+
+  // Flags for DB::GetApproximateSizes that specify whether memtable stats,
+  // file stats approximation, or both should be included.
+  enum class SizeApproximationFlags : uint8_t {
+    NONE = 0,
+    INCLUDE_MEMTABLES = 1 << 0,
+    INCLUDE_FILES = 1 << 1
+  };
+
+  // For each i in [0,n-1], store in "sizes[i]" the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)"
+  // in a single column family.
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes) = 0;
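+
+  // EXAMPLE (an illustrative sketch, not part of this header; `db`, `cfh`,
+  // and the key bounds are assumptions):
+  //   Range r("key000", "key999");
+  //   uint64_t size = 0;
+  //   SizeApproximationOptions opts;
+  //   opts.include_memtables = true;
+  //   opts.include_files = true;
+  //   db->GetApproximateSizes(opts, cfh, &r, 1, &size);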
+
+  // Simpler versions of the GetApproximateSizes() method above.
+  // The include_flags argument must be of type DB::SizeApproximationFlags
+  // and cannot be NONE.
+  virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes,
+                                     SizeApproximationFlags include_flags =
+                                         SizeApproximationFlags::INCLUDE_FILES);
+
+  virtual Status GetApproximateSizes(
+      const Range* ranges, int n, uint64_t* sizes,
+      SizeApproximationFlags include_flags =
+          SizeApproximationFlags::INCLUDE_FILES) {
+    return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes,
+                               include_flags);
+  }
+
+  // The method is similar to GetApproximateSizes, except that it
+  // returns the approximate number of records in the memtables.
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) = 0;
+  virtual void GetApproximateMemTableStats(const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) {
+    GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
+  }
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be a superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data. This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  // This call blocks until the operation completes successfully, fails,
+  // or is aborted (Status::Incomplete). See DisableManualCompaction.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(options, nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size after
+  // compaction is reduced, that level might not be appropriate for hosting
+  // all the files. In this case, the client can set options.change_level to
+  // true, to move the files back to the minimum level capable of holding the
+  // data set or a given level (specified by non-negative
+  // options.target_level).
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) = 0;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              const Slice* begin, const Slice* end) {
+    return CompactRange(options, DefaultColumnFamily(), begin, end);
+  }
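+
+  // EXAMPLE (an illustrative sketch, not part of this header; assumes `db`
+  // is an open DB*):
+  //   CompactRangeOptions cro;
+  //   cro.change_level = true;  // move files back to a fitting level
+  //   Status s = db->CompactRange(cro, nullptr, nullptr);  // whole database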
+
+  // Dynamically change column family options or table factory options in a
+  // running DB, for the specified column family. Only options internally
+  // marked as "mutable" can be changed. Options not listed in `opts_map` will
+  // keep their current values. See GetColumnFamilyOptionsFromMap() in
+  // convenience.h for the details of `opts_map`. Not supported in LITE mode.
+  //
+  // USABILITY NOTE: SetOptions is intended only for expert users, and does
+  // not apply the same sanitization to options as the standard DB::Open code
+  // path does. Use with caution.
+  //
+  // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for
+  // reliability, and this is a slow call because a new OPTIONS file is
+  // serialized and persisted for each call. Use only infrequently.
+  //
+  // EXAMPLES:
+  //   s = db->SetOptions(cfh, {{"ttl", "36000"}});
+  //   s = db->SetOptions(cfh, {{"block_based_table_factory",
+  //                             "{prepopulate_block_cache=kDisable;}"}});
+  virtual Status SetOptions(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::unordered_map<std::string, std::string>& /*opts_map*/) {
+    return Status::NotSupported("Not implemented");
+  }
+  // Shortcut for SetOptions on the default column family handle.
+  virtual Status SetOptions(
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return SetOptions(DefaultColumnFamily(), new_options);
+  }
+
+  // Like SetOptions but for DBOptions, including the same caveats for
+  // usability, reliability, and performance. See GetDBOptionsFromMap() (and
+  // GetColumnFamilyOptionsFromMap()) in convenience.h for details on
+  // `opts_map`. Not supported in LITE mode.
+  //
+  // EXAMPLES:
+  //   s = db->SetDBOptions({{"max_subcompactions", "2"}});
+  //   s = db->SetDBOptions({{"stats_dump_period_sec", "0"},
+  //                         {"stats_persist_period_sec", "0"}});
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& new_options) = 0;
+
+  // CompactFiles() inputs a list of files specified by file numbers and
+  // compacts them to the specified level. A small difference compared to
+  // CompactRange() is that CompactFiles() performs the compaction job
+  // using the CURRENT thread, so it is not considered a "background" job.
+  //
+  // @see GetAllColumnFamilyMetaData
+  // @see GetColumnFamilyMetaData
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) = 0;
+
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) {
+    return CompactFiles(compact_options, DefaultColumnFamily(),
+                        input_file_names, output_level, output_path_id,
+                        output_file_names, compaction_job_info);
+  }
+
+  // This function will wait until all currently running background processes
+  // finish. After it returns, no background process will be run until
+  // ContinueBackgroundWork is called, once for each preceding OK-returning
+  // call to PauseBackgroundWork.
+  virtual Status PauseBackgroundWork() = 0;
+  virtual Status ContinueBackgroundWork() = 0;
+
+  // This function will enable automatic compactions for the given column
+  // families if they were previously disabled. The function will first set the
+  // disable_auto_compactions option for each column family to 'false', after
+  // which it will schedule a flush/compaction.
+  //
+  // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
+  // does NOT schedule a flush/compaction afterwards, and only changes the
+  // parameter itself within the column family option.
+  //
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
+
+  // After this function call, CompactRange() or CompactFiles() will not
+  // run compactions and will fail. Calling this function will tell
+  // outstanding manual compactions to abort and will wait for them to finish
+  // or abort before returning.
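+  //
+  // EXAMPLE (an illustrative sketch of pairing the two calls, not part of
+  // this header; assumes `db` is an open DB*):
+  //   db->DisableManualCompaction();   // abort/reject manual compactions
+  //   /* ... critical section, e.g. preparing to close the DB ... */
+  //   db->EnableManualCompaction();    // allow manual compactions again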
+  virtual void DisableManualCompaction() = 0;
+  // Re-enable CompactRange() and CompactFiles() that are disabled by
+  // DisableManualCompaction(). This function must be called as many times
+  // as DisableManualCompaction() has been called in order to re-enable
+  // manual compactions, and must not be called more times than
+  // DisableManualCompaction() has been called.
+  virtual void EnableManualCompaction() = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // A shortcut for GetEnv()->GetFileSystem().get(), possibly cached for
+  // efficiency.
+  virtual FileSystem* GetFileSystem() const;
+
+  // Get DB Options that we use. During the process of opening the
+  // column family, the options provided when calling DB::Open() or
+  // DB::CreateColumnFamily() will have been "sanitized" and transformed
+  // in an implementation-defined manner.
+  virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
+  virtual Options GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  virtual DBOptions GetDBOptions() const = 0;
+
+  // Flush all mem-table data.
+  // Flush a single column family, even when atomic flush is enabled. To flush
+  // multiple column families, use Flush(options, column_families).
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+  // Flushes multiple column families.
+  // If atomic flush is not enabled, Flush(options, column_families) is
+  // equivalent to calling Flush(options, column_family) multiple times.
+  // If atomic flush is enabled, Flush(options, column_families) will flush all
+  // column families specified in 'column_families' up to the latest sequence
+  // number at the time when flush is requested.
+  // Note that RocksDB 5.15 and earlier may not be able to open later versions
+  // with atomic flush enabled.
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) = 0;
+
+  // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
+  // afterwards.
+  virtual Status FlushWAL(bool /*sync*/) {
+    return Status::NotSupported("FlushWAL not implemented");
+  }
+  // Sync the WAL. Note that Write() followed by SyncWAL() is not exactly the
+  // same as Write() with sync=true: in the latter case the changes won't be
+  // visible until the sync is done.
+  // Currently only works if allow_mmap_writes = false in Options.
+  virtual Status SyncWAL() = 0;
+
+  // Lock the WAL. Also flushes the WAL after locking.
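+  //
+  // EXAMPLE (an illustrative sketch of pairing LockWAL() with UnlockWAL(),
+  // not part of this header; assumes `db` is an open DB*):
+  //   Status s = db->LockWAL();
+  //   if (s.ok()) {
+  //     /* ... read a stable view of the WAL, e.g. for a copy ... */
+  //     s = db->UnlockWAL();
+  //   }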
+  virtual Status LockWAL() {
+    return Status::NotSupported("LockWAL not implemented");
+  }
+
+  // Unlock the WAL.
+  virtual Status UnlockWAL() {
+    return Status::NotSupported("UnlockWAL not implemented");
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times has the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Increase the full_history_ts_low of the column family. The new ts_low
+  // value should be newer than the current full_history_ts_low value.
+  // If another thread updates full_history_ts_low concurrently to a higher
+  // timestamp than the requested ts_low, a TryAgain error will be returned.
+  virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                          std::string ts_low) = 0;
+
+  // Get the current full_history_ts_low value.
+  virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                     std::string* ts_low) = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
+
+#ifndef ROCKSDB_LITE
+  // Retrieves the creation time of the oldest file in the DB.
+  // This API only works if max_open_files = -1; otherwise, the
+  // Status returned is Status::NotSupported().
+  // The file creation time is set using the env provided to the DB.
+  // If the DB was created from a very old release, it's possible that
+  // the SST files do not have the file_creation_time property, and even
+  // after moving to a newer release it's possible that some files never got
+  // compacted and lack the file_creation_time property. In both cases,
+  // file_creation_time is considered 0, which means this API will return
+  // creation_time = 0, as there wouldn't be a timestamp lower than 0.
+  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
+
+  // Note: this API is not yet consistent with WritePrepared transactions.
+  //
+  // Sets iter to an iterator that is positioned at a write-batch whose
+  // sequence number range [start_seq, end_seq] covers seq_number. If no such
+  // write-batch exists, then iter is positioned at the next write-batch whose
+  // start_seq > seq_number.
+  //
+  // Returns Status::OK if iterator is valid.
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this API, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalidated
+  // before an update is read.
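+  //
+  // EXAMPLE (an illustrative sketch of tailing the WAL, not part of this
+  // header; `db` and `start_seq` are assumptions):
+  //   std::unique_ptr<TransactionLogIterator> iter;
+  //   Status s = db->GetUpdatesSince(start_seq, &iter);
+  //   while (s.ok() && iter->Valid()) {
+  //     BatchResult batch = iter->GetBatch();
+  //     /* ... inspect batch.sequence and batch.writeBatchPtr ... */
+  //     iter->Next();
+  //   }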
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) = 0;
+
+// Windows API macro interference
+#undef DeleteFile
+  // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+  // operate at the proper level of abstraction for a key-value store, and its
+  // contract/restrictions are poorly documented. For example, it returns
+  // non-OK `Status` for non-bottommost files and files undergoing compaction.
+  // Since we do not plan to maintain it, the contract will likely remain
+  // underspecified until its removal. Any user is encouraged to read the
+  // implementation carefully and migrate away from it when possible.
+  //
+  // Delete the file name from the db directory and update the internal state
+  // to reflect that. Supports deletion of sst and log files only. 'name' must
+  // be a path relative to the db directory, e.g. 000001.sst,
+  // /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Obtains a list of all live table (SST) files and how they fit into the
+  // LSM-trees, such as column family, level, key range, etc.
+  // This builds a de-normalized form of GetAllColumnFamilyMetaData().
+  // For information about all files in a DB, use GetLiveFilesStorageInfo().
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* /*metadata*/) {}
+
+  // Return a list of all table (SST) and blob files checksum info.
+  // Note: This function might be of limited use because it cannot be
+  // synchronized with other "live files" APIs. GetLiveFilesStorageInfo()
+  // is recommended instead.
+  virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0;
+
+  // Get information about all live files that make up a DB, for making
+  // live copies (Checkpoint, backups, etc.) or other storage-related purposes.
+  // If creating a live copy, use DisableFileDeletions() before and
+  // EnableFileDeletions() after to prevent deletions.
+  // For LSM-tree metadata, use Get*MetaData() functions instead.
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) = 0;
+
+  // Obtains the LSM-tree meta data of the specified column family of the DB,
+  // including metadata for each live table (SST) file in that column family.
+  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                                       ColumnFamilyMetaData* /*metadata*/) {}
+
+  // Get the metadata of the default column family.
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+  }
+
+  // Obtains the LSM-tree meta data of all column families of the DB, including
+  // metadata for each live table (SST) file and each blob file in the DB.
+  virtual void GetAllColumnFamilyMetaData(
+      std::vector<ColumnFamilyMetaData>* /*metadata*/) {}
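+
+  // EXAMPLE (an illustrative sketch of the live-copy pattern described
+  // above, not part of this header; assumes `db` is an open DB*):
+  //   db->DisableFileDeletions();
+  //   std::vector<LiveFileStorageInfo> files;
+  //   Status s =
+  //       db->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &files);
+  //   /* ... copy the listed files to the backup location ... */
+  //   db->EnableFileDeletions(/*force=*/false);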
+
+  // Retrieve the list of all files in the database except WAL files. The
+  // files are relative to the dbname (or db_paths/cf_paths), not absolute
+  // paths. (Not recommended with db_paths/cf_paths because that information
+  // is not returned.) Despite being relative paths, the file names begin
+  // with "/".
+  // The valid size of the manifest file is returned in manifest_file_size.
+  // The manifest file is an ever growing file, but only the portion specified
+  // by manifest_file_size is valid for this snapshot. Setting flush_memtable
+  // to true does Flush before recording the live files (unless DB is
+  // read-only). Setting flush_memtable to false is useful when we don't want
+  // to wait for flush, which may have to wait for compaction to complete,
+  // taking an indeterminate time.
+  //
+  // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
+  // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended
+  // instead, because it ensures a single consistent view of all files is
+  // captured in one call.
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all WAL files with earliest file first.
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Retrieve information about the current WAL file
+  //
+  // Note that the log might have rolled after this call in which case
+  // the current_log_file would not point to the current log file.
+  //
+  // Additionally, for the sake of optimization current_log_file->StartSequence
+  // would always be set to 0
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) = 0;
+
+  // IngestExternalFile() will load a list of external SST files (1) into the
+  // DB. Two primary modes are supported:
+  // - Duplicate keys in the new files will overwrite existing keys (default)
+  // - Duplicate keys will be skipped (set ingest_behind=true)
+  // In the first mode we will try to find the lowest possible level that
+  // the file can fit in, and ingest the file into this level (2). A file that
+  // has a key range that overlaps with the memtable key range will require us
+  // to Flush the memtable first before ingesting the file.
+  // In the second mode we will always ingest in the bottommost level (see
+  // docs to IngestExternalFileOptions::ingest_behind).
+  //
+  // (1) External SST files can be created using SstFileWriter
+  // (2) We will try to ingest the files to the lowest possible level
+  //     even if the file compression doesn't match the level compression
+  // (3) If IngestExternalFileOptions->ingest_behind is set to true,
+  //     we always ingest at the bottommost level, which should be reserved
+  //     for this purpose (see DBOptions::allow_ingest_behind flag).
+  // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
+  //     true, then this method can return Status::TryAgain() indicating that
+  //     the files cannot be ingested to the bottommost level, and it is the
+  //     user's responsibility to clear the bottommost level in the overlapping
+  //     range before re-attempting the ingestion.
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) = 0;
+
+  virtual Status IngestExternalFile(
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) {
+    return IngestExternalFile(DefaultColumnFamily(), external_files, options);
+  }
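+
+  // EXAMPLE (an illustrative sketch, not part of this header; `db` and the
+  // file path are assumptions):
+  //   SstFileWriter writer(EnvOptions(), Options());
+  //   writer.Open("/tmp/external.sst");   // keys must be added in order
+  //   writer.Put("key1", "value1");
+  //   writer.Finish();
+  //   Status s = db->IngestExternalFile({"/tmp/external.sst"},
+  //                                     IngestExternalFileOptions());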
+
+  // IngestExternalFiles() will ingest files for multiple column families, and
+  // record the result atomically to the MANIFEST.
+  // If this function returns OK, all column families' ingestion must succeed.
+  // If this function returns non-OK, or the process crashes, then none of the
+  // files will be ingested into the database after recovery.
+  // Note that it is possible for the application to observe a mixed state
+  // during the execution of this function. If the user performs range scans
+  // over the column families with iterators, an iterator on one column family
+  // may return ingested data, while an iterator on another column family
+  // returns old data. Users can use a snapshot for a consistent view of data.
+  // If your db ingests multiple SST files using this API, i.e. args.size()
+  // > 1, then RocksDB 5.15 and earlier will not be able to open it.
+  //
+  // REQUIRES: each arg corresponds to a different column family: namely, for
+  // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) = 0;
+
+  // CreateColumnFamilyWithImport() will create a new column family with
+  // column_family_name and import external SST files specified in metadata
+  // into this column family.
+  // (1) External SST files can be created using SstFileWriter.
+  // (2) External SST files can be exported from a particular column family in
+  //     an existing DB using Checkpoint::ExportColumnFamily.
+  // The option in import_options specifies whether the external files are
+  // copied or moved (default is copy). When the option specifies copy,
+  // managing files at external_file_path is the caller's responsibility.
+  // When it specifies move, the call makes a best effort to delete the
+  // specified files at external_file_path on successful return, logging any
+  // failure to delete rather than returning it in Status. Files are not
+  // modified on any error return, and a best effort is made to remove any
+  // newly-created files.
+  // On error return, the column family handle returned will be nullptr.
+  // The column family will be present on successful return and will not be
+  // present on error return. The column family may be present on any crash
+  // during this call.
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options,
+      const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) = 0;
+
+  // Verify the checksums of files in db. Currently the whole-file checksums
+  // of table files are checked.
+  virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
+    return Status::NotSupported("File verification not supported");
+  }
+
+  // Verify the block checksums of files in db. The block checksums of table
+  // files are checked.
+  virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
+
+  virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+#endif  // ROCKSDB_LITE
+
+  // Sets `identity` to the unique ID read from the IDENTITY file during the
+  // opening of the database.
+  // Returns Status::OK if the identity could be set properly.
+  virtual Status GetDbIdentity(std::string& identity) const = 0;
+
+  // Return a unique identifier for each DB object that is opened.
+  // This DB session ID should be unique among all open DB instances on all
+  // hosts, and should be unique among re-openings of the same or other DBs.
+  // (Two open DBs can have the same identity from the other function,
+  // GetDbIdentity, when one is physically copied from the other.)
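+  //
+  // EXAMPLE (an illustrative sketch, not part of this header; assumes `db`
+  // is an open DB*):
+  //   std::string id, session;
+  //   db->GetDbIdentity(id);        // stable across re-openings of this DB
+  //   db->GetDbSessionId(session);  // unique to this opening of the DB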
+  virtual Status GetDbSessionId(std::string& session_id) const = 0;
+
+  // Returns the default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) = 0;
+
+  virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
+                                     const Slice* /*begin*/,
+                                     const Slice* /*end*/) {
+    return Status::NotSupported("SuggestCompactRange() is not implemented.");
+  }
+
+  virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
+                           int /*target_level*/) {
+    return Status::NotSupported("PromoteL0() is not implemented.");
+  }
+
+  // Trace DB operations. Use EndTrace() to stop tracing.
+  virtual Status StartTrace(const TraceOptions& /*options*/,
+                            std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartTrace() is not implemented.");
+  }
+
+  virtual Status EndTrace() {
+    return Status::NotSupported("EndTrace() is not implemented.");
+  }
+
+  // IO Tracing operations. Use EndIOTrace() to stop tracing.
+  virtual Status StartIOTrace(const TraceOptions& /*options*/,
+                              std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartIOTrace() is not implemented.");
+  }
+
+  virtual Status EndIOTrace() {
+    return Status::NotSupported("EndIOTrace() is not implemented.");
+  }
+
+  // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
+  virtual Status StartBlockCacheTrace(
+      const TraceOptions& /*trace_options*/,
+      std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& /*options*/,
+      std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status EndBlockCacheTrace() {
+    return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
+  }
+
+  // Create a default trace replayer.
+  virtual Status NewDefaultReplayer(
+      const std::vector<ColumnFamilyHandle*>& /*handles*/,
+      std::unique_ptr<TraceReader>&& /*reader*/,
+      std::unique_ptr<Replayer>* /*replayer*/) {
+    return Status::NotSupported("NewDefaultReplayer() is not implemented.");
+  }
+
+#endif  // ROCKSDB_LITE
+
+  // Needed for StackableDB
+  virtual DB* GetRootDB() { return this; }
+
+  // Given a window [start_time, end_time), set up a StatsHistoryIterator
+  // to access stats history. Note the start_time and end_time are epoch
+  // time measured in seconds, and end_time is an exclusive bound.
+  virtual Status GetStatsHistory(
+      uint64_t /*start_time*/, uint64_t /*end_time*/,
+      std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
+    return Status::NotSupported("GetStatsHistory() is not implemented.");
+  }
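+
+  // EXAMPLE (an illustrative sketch, not part of this header; assumes `db`
+  // was opened with stats persistence enabled, e.g. a non-zero
+  // stats_persist_period_sec):
+  //   std::unique_ptr<StatsHistoryIterator> it;
+  //   if (db->GetStatsHistory(0, UINT64_MAX, &it).ok()) {
+  //     for (; it != nullptr && it->Valid(); it->Next()) {
+  //       uint64_t when = it->GetStatsTime();
+  //       const std::map<std::string, uint64_t>& stats = it->GetStatsMap();
+  //       /* ... */
+  //     }
+  //   }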
+
+#ifndef ROCKSDB_LITE
+  // Make the secondary instance catch up with the primary by tailing and
+  // replaying the MANIFEST and WAL of the primary.
+  // Column families created by the primary after the secondary instance starts
+  // will be ignored unless the secondary instance closes and restarts with the
+  // newly created column families.
+  // Column families that existed before the secondary instance started and
+  // were dropped by the primary afterwards will be marked as dropped. However,
+  // as long as the secondary instance does not delete the corresponding column
+  // family handles, the data of the column family is still accessible to the
+  // secondary.
+  virtual Status TryCatchUpWithPrimary() {
+    return Status::NotSupported("Supported only by secondary instance");
+  }
+#endif  // !ROCKSDB_LITE
+};
+
+// Overloaded operators for enum class SizeApproximationFlags.
+inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) &
+                                                 static_cast<uint8_t>(rhs));
+}
+inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) |
+                                                 static_cast<uint8_t>(rhs));
+}
+
+inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                      const Range* ranges, int n,
+                                      uint64_t* sizes,
+                                      SizeApproximationFlags include_flags) {
+  SizeApproximationOptions options;
+  options.include_memtables =
+      ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
+       SizeApproximationFlags::NONE);
+  options.include_files =
+      ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
+       SizeApproximationFlags::NONE);
+  return GetApproximateSizes(options, column_family, ranges, n, sizes);
+}
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options,
+                 const std::vector<ColumnFamilyDescriptor>& column_families =
+                     std::vector<ColumnFamilyDescriptor>());
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+//
+// With this API, we will warn and skip data associated with column families
+// not specified in column_families.
+//
+// @param column_families Descriptors for known column families
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families);
+
+// @param unknown_cf_opts Options for column families encountered during the
+//                        repair that were not specified in column_families.
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts);
+
+// @param options These options will be used for the database and for ALL
+//                column families encountered during the repair
+Status RepairDB(const std::string& dbname, const Options& options);
+
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h
new file mode 100644
index 000000000..17f4e6bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_bench_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h
new file mode 100644
index 000000000..b7d4766a2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct DumpOptions {
+  // Database that will be dumped
+  std::string db_path;
+  // File location that will contain dump output
+  std::string dump_location;
+  // Don't include db information header in the dump
+  bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+  bool Run(const DumpOptions& dump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+
+struct UndumpOptions {
+  // Database that we will load the dumped file into
+  std::string db_path;
+  // File location of the dumped file that will be loaded
+  std::string dump_location;
+  // Compact the db after loading the dumped file
+  bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+  bool Run(const UndumpOptions& undump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h
new file mode 100644
index 000000000..7d3d42c9d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_stress_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_stress_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
new file mode 100644
index 000000000..bef60a212
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -0,0 +1,1893 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
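+//
+// EXAMPLE (an illustrative sketch, not part of this header; the path and the
+// use of the default Env are assumptions):
+//   rocksdb::Options options;
+//   options.env = rocksdb::Env::Default();  // or a custom Env subclass
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);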
+ +#pragma once + +#include <stdint.h> + +#include <cstdarg> +#include <functional> +#include <limits> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/customizable.h" +#include "rocksdb/functor_wrapper.h" +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ + __attribute__((__format__(__printf__, format_param, dots_param))) +#else +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) +#endif + +namespace ROCKSDB_NAMESPACE { + +class DynamicLibrary; +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +struct DataVerificationInfo; +class WritableFile; +class RandomRWFile; +class MemoryMappedFileBuffer; +class Directory; +struct DBOptions; +struct ImmutableDBOptions; +struct MutableDBOptions; +class RateLimiter; +class ThreadStatusUpdater; +struct ThreadStatus; +class FileSystem; +class SystemClock; +struct ConfigOptions; + +const size_t kDefaultPageSize = 4 * 1024; + +enum class CpuPriority { + kIdle = 0, + kLow = 1, + kNormal = 2, + kHigh = 3, +}; + +// Options while opening a file to read/write +struct EnvOptions { + // Construct with default Options + EnvOptions(); + + // Construct from Options + explicit EnvOptions(const DBOptions& options); + + // If true, then use mmap to read data. + // Not recommended for 32-bit OS. + bool use_mmap_reads = false; + + // If true, then use mmap to write data + bool use_mmap_writes = true; + + // If true, then use O_DIRECT for reading data + bool use_direct_reads = false; + + // If true, then use O_DIRECT for writing data + bool use_direct_writes = false; + + // If false, fallocate() calls are bypassed + bool allow_fallocate = true; + + // If true, set the FD_CLOEXEC on open fd. + bool set_fd_cloexec = true; + + // Allows OS to incrementally sync files to disk while they are being + // written, in the background. Issue one request for every bytes_per_sync + // written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync = 0; + + // When true, guarantees the file has at most `bytes_per_sync` bytes submitted + // for writeback at any given time. + // + // - If `sync_file_range` is supported it achieves this by waiting for any + // prior `sync_file_range`s to finish before proceeding. In this way, + // processing (compression, etc.) can proceed uninhibited in the gap + // between `sync_file_range`s, and we block only when I/O falls behind. + // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism + // always blocks, thus preventing the interleaving of I/O and processing. + // + // Note: Enabling this option does not provide any additional persistence + // guarantees, as it may use `sync_file_range`, which does not write out + // metadata. + // + // Default: false + bool strict_bytes_per_sync = false; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. This option will + // improve the performance in workloads where you sync the data on every + // write. 
By default, we set it to true for MANIFEST writes and false for + // WAL writes + bool fallocate_with_keep_size = true; + + // See DBOptions doc + size_t compaction_readahead_size = 0; + + // See DBOptions doc + size_t random_access_max_buffer_size = 0; + + // See DBOptions doc + size_t writable_file_max_buffer_size = 1024 * 1024; + + // If not nullptr, write rate limiting is enabled for flush and compaction + RateLimiter* rate_limiter = nullptr; +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Env : public Customizable { + public: + static const char* kDefaultName() { return "DefaultEnv"; } + struct FileAttributes { + // File name + std::string name; + + // Size of file in bytes + uint64_t size_bytes; + }; + + Env(); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr<FileSystem>& fs); + Env(const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& clock); + // No copying allowed + Env(const Env&) = delete; + void operator=(const Env&) = delete; + + ~Env() override; + + static const char* Type() { return "Environment"; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + const char* Name() const override { return ""; } + + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. + static Status LoadEnv(const std::string& value, Env** result); + + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. + static Status LoadEnv(const std::string& value, Env** result, + std::shared_ptr<Env>* guard); + + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. + // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr<Env>* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. + static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr<Env>* guard); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. 
+ // + // The result of Default() belongs to rocksdb and must never be deleted. + static Env* Default(); + + // See FileSystem::RegisterDbPaths. + virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) { + return Status::OK(); + } + // See FileSystem::UnregisterDbPaths. + virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) { + return Status::OK(); + } + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr<SequentialFile>* result, + const EnvOptions& options) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr<RandomAccessFile>* result, + const EnvOptions& options) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + WLTH_NOT_SET = 0, // No hint information set + WLTH_NONE, // No hints about write life time + WLTH_SHORT, // Data written has a short life time + WLTH_MEDIUM, // Data written has a medium life time + WLTH_LONG, // Data written has a long life time + WLTH_EXTREME, // Data written has an extremely long life time + }; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) = 0; + + // Create an object that writes to a file with the specified name. + // `WritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status ReopenWritableFile(const std::string& /*fname*/, + std::unique_ptr<WritableFile>* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("Env::ReopenWritableFile() not supported."); + } + + // Reuse an existing file by renaming it and opening it as writable. + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options); + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. 
+  virtual Status NewRandomRWFile(const std::string& /*fname*/,
+                                 std::unique_ptr<RandomRWFile>* /*result*/,
+                                 const EnvOptions& /*options*/) {
+    return Status::NotSupported("RandomRWFile is not implemented in this Env");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place
+  // updates only, i.e., no appends). On success, stores a raw buffer covering
+  // the whole file in `*result`. The file must exist prior to this call.
+  virtual Status NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return Status::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this Env");
+  }
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to
+  //                  determine whether this file exists, or if the path is
+  //                  invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Store in *result the attributes of the children of the specified
+  // directory. In case the implementation lists the directory prior to
+  // iterating the files and files are concurrently deleted, the deleted
+  // files will be omitted from the result.
+  // The name attributes are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildrenFileAttributes(const std::string& dir,
+                                           std::vector<FileAttributes>* result);
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) {
+    return Status::NotSupported("Truncate is not supported for this Env");
+  }
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates the specified directory if it is missing. Returns OK if the
+  // directory exists, or if it was successfully created.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  // Many implementations of this function will only delete a directory if it
+  // is empty.
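+  //
+  // EXAMPLE (an illustrative sketch of emptying a directory first, not part
+  // of this header; `env` and `dir` are assumptions):
+  //   std::vector<std::string> children;
+  //   if (env->GetChildren(dir, &children).ok()) {
+  //     for (const std::string& child : children) {
+  //       env->DeleteFile(dir + "/" + child);  // per-file status ignored here
+  //     }
+  //   }
+  //   Status s = env->DeleteDir(dir);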
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Hard link file src to target.
+  virtual Status LinkFile(const std::string& /*src*/,
+                          const std::string& /*target*/) {
+    return Status::NotSupported("LinkFile is not supported for this Env");
+  }
+
+  virtual Status NumFileLinks(const std::string& /*fname*/,
+                              uint64_t* /*count*/) {
+    return Status::NotSupported(
+        "Getting number of file links is not supported for this Env");
+  }
+
+  virtual Status AreFilesSame(const std::string& /*first*/,
+                              const std::string& /*second*/, bool* /*res*/) {
+    return Status::NotSupported("AreFilesSame is not supported for this Env");
+  }
+
+  // Lock the specified file. Used to prevent concurrent access to
+  // the same db by multiple processes. On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK. The caller should call
+  // UnlockFile(*lock) to release the lock. If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure. I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  // Opens `lib_name` as a dynamic library.
+  // If the 'search_path' is specified, breaks the path into its components
+  // based on the appropriate platform separator (";" or ":") and looks for
+  // the library in those directories. If 'search_path' is not specified, uses
+  // the default library path search mechanism (such as LD_LIBRARY_PATH). On
+  // success, stores a dynamic library in `*result`.
+  virtual Status LoadLibrary(const std::string& /*lib_name*/,
+                             const std::string& /*search_path */,
+                             std::shared_ptr<DynamicLibrary>* /*result*/) {
+    return Status::NotSupported("LoadLibrary is not implemented in this Env");
+  }
+
+  // Priority for scheduling job in thread pool
+  enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL };
+
+  static std::string PriorityToString(Priority priority);
+
+  // Priority for requesting bytes in rate limiter scheduler
+  enum IOPriority {
+    IO_LOW = 0,
+    IO_MID = 1,
+    IO_HIGH = 2,
+    IO_USER = 3,
+    IO_TOTAL = 4
+  };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+  //
+  // "function" may run in an unspecified thread. Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  // When the UnSchedule function is called, the unschedFunction
+  // registered at the time of Schedule is invoked with arg as a parameter.
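+  //
+  // EXAMPLE (an illustrative sketch, not part of this header; assumes `env`
+  // is an Env*, e.g. Env::Default()):
+  //   static void DoWork(void* arg) { /* ... */ }
+  //   env->Schedule(&DoWork, nullptr, Env::Priority::LOW);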
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW, void* tag = nullptr,
+                        void (*unschedFunction)(void* arg) = nullptr) = 0;
+
+  // Arrange to remove jobs for given arg from the queue_ if they are not
+  // already scheduled. Caller is expected to have exclusive lock on arg.
+  virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; }
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Start a new thread, invoking "function(args...)" within the new thread.
+  // When "function(args...)" returns, the thread will be destroyed.
+  template <typename FunctionT, typename... Args>
+  void StartThreadTyped(FunctionT function, Args&&... args) {
+    using FWType = FunctorWrapper<Args...>;
+    StartThread(
+        [](void* arg) {
+          auto* functor = static_cast<FWType*>(arg);
+          functor->invoke();
+          delete functor;
+        },
+        new FWType(std::function<void(Args...)>(function),
+                   std::forward<Args>(args)...));
+  }
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Reserve available background threads in the specified thread pool.
+  virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Release a specific number of reserved threads from the specified thread
+  // pool.
+  virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Get thread pool queue length for specific thread pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Creates and returns a default logger (an instance of EnvLogger) for
+  // storing informational messages. Derived classes can override this to
+  // provide a custom logger.
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result);
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time (e.g., in GenericRateLimiter and other
+  // places), so a port needs to return system time in order to work.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+  // 0 indicates not supported.
+  virtual uint64_t NowCPUNanos() { return 0; }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name as a null-terminated string iff the string
+  // length is < len. The hostname should otherwise be truncated to len.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the current hostname from the given env as a std::string in result.
+  // The result may be truncated if the hostname is too long.
+  virtual Status GetHostNameString(std::string* result);
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // Default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+  virtual int GetBackgroundThreads(Priority pri = LOW) = 0;
+
+  virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) {
+    return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported.");
+  }
+
+  // Enlarge number of background worker threads of a specific thread pool
+  // for this environment if it is smaller than specified. 'LOW' is the default
+  // pool.
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+  // Lower IO priority for threads from the specified pool.
+  virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {}
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/,
+                                            CpuPriority /*pri*/) {
+    return Status::NotSupported(
+        "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported");
+  }
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {}
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a human-readable unique ID that can be used to identify a DB.
+  // In built-in implementations, this is an RFC-4122 UUID string, but might
+  // not be in all implementations. Overriding is not recommended.
+  // NOTE: this has not been validated for use in cryptography
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogRead will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for reading log files.
+  virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const;
+
+  // OptimizeForManifestRead will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for reading manifest
+  // files.
+  virtual EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                         const DBOptions& db_options) const;
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a
+  // copy of the EnvOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // writing table files.
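+  //
+  // Illustrative sketch (hypothetical `immutable_db_options` variable):
+  // deriving tuned options from a base EnvOptions.
+  //
+  //   EnvOptions base;
+  //   EnvOptions write_opts =
+  //       env->OptimizeForCompactionTableWrite(base, immutable_db_options);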
+  virtual EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // reading table files.
+  virtual EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for
+  // reading blob files.
+  virtual EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // Returns the status of all threads that belong to the current Env.
+  virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
+    return Status::NotSupported("Env::GetThreadList() not supported.");
+  }
+
+  // Returns the pointer to ThreadStatusUpdater. This function will be
+  // used in RocksDB internally to update thread status and supports
+  // GetThreadList().
+  virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+    return thread_status_updater_;
+  }
+
+  // Returns the ID of the current thread.
+  virtual uint64_t GetThreadID() const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+
+  // Get the amount of free disk space.
+  virtual Status GetFreeSpace(const std::string& /*path*/,
+                              uint64_t* /*diskfree*/) {
+    return Status::NotSupported("Env::GetFreeSpace() not supported.");
+  }
+
+  // Check whether the specified path is a directory.
+  virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) {
+    return Status::NotSupported("Env::IsDirectory() not supported.");
+  }
+
+  virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {}
+
+  // Get the FileSystem implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env.
+  const std::shared_ptr<FileSystem>& GetFileSystem() const;
+
+  // Get the SystemClock implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env.
+  const std::shared_ptr<SystemClock>& GetSystemClock() const;
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ protected:
+  // The pointer to an internal structure that will update the
+  // status of each thread.
+  ThreadStatusUpdater* thread_status_updater_;
+
+  // Pointer to the underlying FileSystem implementation
+  std::shared_ptr<FileSystem> file_system_;
+
+  // Pointer to the underlying SystemClock implementation
+  std::shared_ptr<SystemClock> system_clock_;
+
+ private:
+  static const size_t kMaxHostNameLen = 256;
+};
+
+// The factory function to construct a ThreadStatusUpdater. Any Env
+// that supports GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() {}
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine. Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "SequentialFile::InvalidateCache not supported.");
+  }
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                Slice* /*result*/, char* /*scratch*/) {
+    return Status::NotSupported(
+        "SequentialFile::PositionedRead() not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead
+struct ReadRequest {
+  // File offset in bytes
+  uint64_t offset;
+
+  // Length to read in bytes. `result` only returns fewer bytes if end of file
+  // is hit (or `status` is not OK).
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer.
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes
+  Slice result;
+
+  // Status of read
+  Status status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() {}
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) {
+    return Status::OK();
+  }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e., it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the function's return Status is not OK, the statuses
+  // of individual requests are ignored, and the returned status applies to
+  // all read requests. The function's return status is only meant for errors
+  // that occur before specific read requests are even processed.
+  virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest& req = reqs[i];
+      req.status = Read(req.offset, req.len, &req.result, req.scratch);
+    }
+    return Status::OK();
+  }
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, that two
+  // unique IDs cannot be made equal to each other by adding arbitrary bytes
+  // to one of them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "RandomAccessFile::InvalidateCache not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
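+//
+// Illustrative sketch (error handling abbreviated): creating a file through
+// the Env and appending to it.
+//
+//   std::unique_ptr<WritableFile> file;
+//   Status s = env->NewWritableFile("/path/to/file", &file, EnvOptions());
+//   if (s.ok()) s = file->Append(Slice("some data"));
+//   if (s.ok()) s = file->Sync();
+//   if (s.ok()) s = file->Close();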
+class WritableFile {
+ public:
+  WritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit WritableFile(const EnvOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+  // No copying allowed
+  WritableFile(const WritableFile&) = delete;
+  void operator=(const WritableFile&) = delete;
+
+  virtual ~WritableFile();
+
+  // Append data to the end of the file.
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+  virtual Status Append(const Slice& data) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status Append(const Slice& data,
+                        const DataVerificationInfo& /* verification_info */) {
+    return Append(data);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence have to always start at the beginning
+  // of a sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after
+  // the write. A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // back to the nearest sector/page and rewrite the portion of it with
+  // whatever we need to add. We need to keep track of where we stopped
+  // writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have
+  // to pad with zeros for the last write and trim the file when closing
+  // according to the position we kept in the previous step.
+  //
+  // PositionedAppend() requires an aligned buffer to be passed in. The
+  // alignment required is queried via GetRequiredBufferAlignment().
+  virtual Status PositionedAppend(const Slice& /* data */,
+                                  uint64_t /* offset */) {
+    return Status::NotSupported(
+        "WritableFile::PositionedAppend() not supported.");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const DataVerificationInfo& /* verification_info */) {
+    return Status::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing.
+  // It is not always possible to keep track of the file size due to
+  // whole-page writes. The behavior is undefined if called with other writes
+  // to follow.
+  virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); }
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() { return Sync(); }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used
+   * as a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() { return 0; }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file. If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported("WritableFile::InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing unless strict_bytes_per_sync_ is set,
+  // in which case it falls back to a full Sync().
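+  //
+  // Illustrative sketch (hypothetical values): ask the OS to start flushing
+  // the first megabyte without waiting for completion.
+  //
+  //   file->RangeSync(/*offset=*/0, /*nbytes=*/1024 * 1024);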
+  virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
+    if (strict_bytes_per_sync_) {
+      return Sync();
+    }
+    return Status::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      // TODO: Don't ignore errors from allocate
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) {
+    return Status::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&) = delete;
+  RandomRWFile& operator=(const RandomRWFile&) = delete;
+
+  virtual ~RandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O.
+  // If true, you must pass an aligned buffer to Write().
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // *result; the provided `scratch` must be at least `n` bytes in size.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  virtual Status Flush() = 0;
+
+  virtual Status Sync() = 0;
+
+  virtual Status Fsync() { return Sync(); }
+
+  virtual Status Close() = 0;
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
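+//
+// Illustrative sketch (error handling elided): obtaining the raw mapping via
+// Env::NewMemoryMappedFileBuffer and updating bytes in place.
+//
+//   std::unique_ptr<MemoryMappedFileBuffer> mmap;
+//   Status s = env->NewMemoryMappedFileBuffer("/path/to/existing/file", &mmap);
+//   if (s.ok() && mmap->GetLen() > 0) {
+//     static_cast<char*>(mmap->GetBase())[0] = 'x';  // in-place update only
+//   }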
+class MemoryMappedFileBuffer {
+ public:
+  MemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~MemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice, so copying is disallowed. The class
+  // could be made movable if desired.
+  MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete;
+  MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual Status Fsync() = 0;
+  // Close directory.
+  virtual Status Close() { return Status::NotSupported("Close"); }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+enum InfoLogLevel : unsigned char {
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  HEADER_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Logger {
+ public:
+  size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)();
+
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : closed_(false), log_level_(log_level) {}
+  // No copying allowed
+  Logger(const Logger&) = delete;
+  void operator=(const Logger&) = delete;
+
+  virtual ~Logger();
+
+  // Close the log file. Must be called before destructor. If the return
+  // status is NotSupported(), it means the implementation does cleanup in
+  // the destructor.
+  virtual Status Close();
+
+  // Write a header to the log file with the specified format.
+  // It is recommended that you log all header information at the start of the
+  // application, but this is not enforced.
+  virtual void LogHeader(const char* format, va_list ap) {
+    // Default implementation does a simple INFO level log write.
+    // Please override as per the logger class requirement.
+    Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+  }
+
+  // Write an entry to the log file with the specified format.
+  //
+  // Users who override the `Logv()` overload taking `InfoLogLevel` do not need
+  // to implement this, unless they explicitly invoke it in
+  // `Logv(InfoLogLevel, ...)`.
+  virtual void Logv(const char* /* format */, va_list /* ap */) {
+    assert(false);
+  }
+
+  // Write an entry to the log file with the specified log level
+  // and format. Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  virtual void Logv(const InfoLogLevel log_level, const char* format,
+                    va_list ap);
+
+  virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+  // If you're adding methods here, remember to add them to LoggerWrapper too.
+
+ protected:
+  virtual Status CloseImpl();
+  bool closed_;
+
+ private:
+  InfoLogLevel log_level_;
+};
+
+// Identifies a locked file. Except in custom Env/Filesystem implementations,
+// the lifetime of a FileLock object should be managed only by LockFile() and
+// UnlockFile().
+class FileLock {
+ public:
+  FileLock() {}
+  virtual ~FileLock();
+
+ private:
+  // No copying allowed
+  FileLock(const FileLock&) = delete;
+  void operator=(const FileLock&) = delete;
+};
+
+class DynamicLibrary {
+ public:
+  virtual ~DynamicLibrary() {}
+
+  // Returns the name of the dynamic library.
+  virtual const char* Name() const = 0;
+
+  // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the status of loading the symbol.
+  template <typename T>
+  Status LoadFunction(const std::string& sym_name, std::function<T>* function) {
+    assert(nullptr != function);
+    void* ptr = nullptr;
+    Status s = LoadSymbol(sym_name, &ptr);
+    *function = reinterpret_cast<T*>(ptr);
+    return s;
+  }
+  // Loads and returns the symbol for sym_name from the library.
+  virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
+};
+
+extern void LogFlush(const std::shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// a set of log functions with different log levels.
+extern void Header(const std::shared_ptr<Logger>& info_log, const char* format,
+                   ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// a set of log functions with different log levels.
+extern void Header(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A utility routine: write "data" to the named file.
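+//
+// Illustrative sketch (hypothetical file name): round-tripping a string
+// through the two utility routines below.
+//
+//   Status s = WriteStringToFile(Env::Default(), Slice("hello"),
+//                                "/tmp/example_file", /*should_sync=*/true);
+//   std::string contents;
+//   if (s.ok()) {
+//     s = ReadFileToString(Env::Default(), "/tmp/example_file", &contents);
+//   }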
+extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync = false); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public +// ROCKSDB_NAMESPACE::SequentialFileWrapper { +// public: +// MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target): +// ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {} +// Status Read(size_t n, Slice* result, char* scratch) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result, +// scratch); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // The Target struct allows an Env to be stored as a raw (Env*) or + // std::shared_ptr<Env>. By using this struct, the wrapping/calling + // class does not need to worry about the ownership/lifetime of the + // wrapped target env. If the guard is set, then the Env will point + // to the guard.get(). + struct Target { + Env* env; // The raw Env + std::shared_ptr<Env> guard; // The guarded Env + + // Creates a Target without assuming ownership of the target Env + explicit Target(Env* t) : env(t) {} + + // Creates a Target from the guarded env, assuming ownership + explicit Target(std::unique_ptr<Env>&& t) : guard(t.release()) { + env = guard.get(); + } + + // Creates a Target from the guarded env, assuming ownership + explicit Target(const std::shared_ptr<Env>& t) : guard(t) { + env = guard.get(); + } + + // Makes sure the raw Env is not nullptr + void Prepare() { + if (guard.get() != nullptr) { + env = guard.get(); + } else if (env == nullptr) { + env = Env::Default(); + } + } + }; + + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t); + explicit EnvWrapper(std::unique_ptr<Env>&& t); + explicit EnvWrapper(const std::shared_ptr<Env>& t); + ~EnvWrapper() override; + + // Return the target to which this Env forwards all calls + Env* target() const { return target_.env; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. 
+ const char* Name() const override { return target_.env->Name(); } + + // The following text is boilerplate that forwards all methods to target() + Status RegisterDbPaths(const std::vector<std::string>& paths) override { + return target_.env->RegisterDbPaths(paths); + } + + Status UnregisterDbPaths(const std::vector<std::string>& paths) override { + return target_.env->UnregisterDbPaths(paths); + } + + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& options) override { + return target_.env->NewSequentialFile(f, r, options); + } + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr<RandomAccessFile>* r, + const EnvOptions& options) override { + return target_.env->NewRandomAccessFile(f, r, options); + } + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override { + return target_.env->NewWritableFile(f, r, options); + } + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) override { + return target_.env->ReopenWritableFile(fname, result, options); + } + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override { + return target_.env->ReuseWritableFile(fname, old_fname, r, options); + } + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& options) override { + return target_.env->NewRandomRWFile(fname, result, options); + } + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return target_.env->NewMemoryMappedFileBuffer(fname, result); + } + Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) override { + return target_.env->NewDirectory(name, result); + } + Status FileExists(const std::string& f) override { + return target_.env->FileExists(f); + } + Status GetChildren(const std::string& dir, + std::vector<std::string>* r) override { + return target_.env->GetChildren(dir, r); + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector<FileAttributes>* result) override { + return target_.env->GetChildrenFileAttributes(dir, result); + } + Status DeleteFile(const std::string& f) override { + return target_.env->DeleteFile(f); + } + Status Truncate(const std::string& fname, size_t size) override { + return target_.env->Truncate(fname, size); + } + Status CreateDir(const std::string& d) override { + return target_.env->CreateDir(d); + } + Status CreateDirIfMissing(const std::string& d) override { + return target_.env->CreateDirIfMissing(d); + } + Status DeleteDir(const std::string& d) override { + return target_.env->DeleteDir(d); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + return target_.env->GetFileSize(f, s); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + return target_.env->GetFileModificationTime(fname, file_mtime); + } + + Status RenameFile(const std::string& s, const std::string& t) override { + return target_.env->RenameFile(s, t); + } + + Status LinkFile(const std::string& s, const std::string& t) override { + return target_.env->LinkFile(s, t); + } + + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + return target_.env->NumFileLinks(fname, count); + } + + Status AreFilesSame(const std::string& first, 
const std::string& second, + bool* res) override { + return target_.env->AreFilesSame(first, second, res); + } + + Status LockFile(const std::string& f, FileLock** l) override { + return target_.env->LockFile(f, l); + } + + Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); } + + Status IsDirectory(const std::string& path, bool* is_dir) override { + return target_.env->IsDirectory(path, is_dir); + } + + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr<DynamicLibrary>* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); + } + + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); + } + + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); + } + + void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); + } + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); + } + + int ReserveThreads(int threads_to_be_reserved, Priority pri) override { + return target_.env->ReserveThreads(threads_to_be_reserved, pri); + } + + int ReleaseThreads(int threads_to_be_released, Priority pri) override { + return target_.env->ReleaseThreads(threads_to_be_released, pri); + } + + Status GetTestDirectory(std::string* path) override { + return target_.env->GetTestDirectory(path); + } + Status NewLogger(const std::string& fname, + std::shared_ptr<Logger>* result) override { + return target_.env->NewLogger(fname, result); + } + uint64_t NowMicros() override { return target_.env->NowMicros(); } + uint64_t NowNanos() override { return target_.env->NowNanos(); } + uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); } + + void SleepForMicroseconds(int micros) override { + target_.env->SleepForMicroseconds(micros); + } + Status GetHostName(char* name, uint64_t len) override { + return target_.env->GetHostName(name, len); + } + Status GetCurrentTime(int64_t* unix_time) override { + return target_.env->GetCurrentTime(unix_time); + } + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + return target_.env->GetAbsolutePath(db_path, output_path); + } + void SetBackgroundThreads(int num, Priority pri) override { + return target_.env->SetBackgroundThreads(num, pri); + } + int GetBackgroundThreads(Priority pri) override { + return target_.env->GetBackgroundThreads(pri); + } + + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); + } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + target_.env->LowerThreadPoolIOPriority(pool); + } + + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); + } + + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); + } + + std::string TimeToString(uint64_t time) override { + return target_.env->TimeToString(time); + } + + Status GetThreadList(std::vector<ThreadStatus>* thread_list) override { + return 
target_.env->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_.env->GetThreadStatusUpdater(); + } + + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } + + std::string GenerateUniqueId() override { + return target_.env->GenerateUniqueId(); + } + + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return target_.env->OptimizeForLogRead(env_options); + } + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return target_.env->OptimizeForManifestRead(env_options); + } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return target_.env->OptimizeForLogWrite(env_options, db_options); + } + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return target_.env->OptimizeForManifestWrite(env_options); + } + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_.env->OptimizeForCompactionTableWrite(env_options, + immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForCompactionTableRead(env_options, db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForBlobFileRead(env_options, db_options); + } + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + return target_.env->GetFreeSpace(path, diskfree); + } + void SanitizeEnvOptions(EnvOptions* env_opts) const override { + target_.env->SanitizeEnvOptions(env_opts); + } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + + private: + Target target_; +}; + +class SequentialFileWrapper : public SequentialFile { + public: + explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + return target_->Read(n, result, scratch); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return target_->PositionedRead(offset, n, result, scratch); + } + + private: + SequentialFile* target_; +}; + +class RandomAccessFileWrapper : public RandomAccessFile { + public: + explicit RandomAccessFileWrapper(RandomAccessFile* target) + : target_(target) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + return target_->MultiRead(reqs, num_reqs); + } + Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return 
target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + RandomAccessFile* target_; +}; + +class WritableFileWrapper : public WritableFile { + public: + explicit WritableFileWrapper(WritableFile* t) : target_(t) {} + + Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + return target_->PositionedAppend(data, offset); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } + Status Truncate(uint64_t size) override { return target_->Truncate(size); } + Status Close() override { return target_->Close(); } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetIOPriority(Env::IOPriority pri) override { + target_->SetIOPriority(pri); + } + + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { return target_->GetFileSize(); } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + return target_->RangeSync(offset, nbytes); + } + + void PrepareWrite(size_t offset, size_t len) override { + target_->PrepareWrite(offset, len); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + return target_->Allocate(offset, len); + } + + private: + WritableFile* target_; +}; + +class RandomRWFileWrapper : public RandomRWFile { + public: + explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + return target_->Write(offset, data); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const 
override { + return target_->Read(offset, n, result, scratch); + } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + + private: + RandomRWFile* target_; +}; + +class DirectoryWrapper : public Directory { + public: + explicit DirectoryWrapper(Directory* target) : target_(target) {} + + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + Directory* target_; +}; + +class LoggerWrapper : public Logger { + public: + explicit LoggerWrapper(Logger* target) : target_(target) {} + + Status Close() override { return target_->Close(); } + void LogHeader(const char* format, va_list ap) override { + return target_->LogHeader(format, ap); + } + void Logv(const char* format, va_list ap) override { + return target_->Logv(format, ap); + } + void Logv(const InfoLogLevel log_level, const char* format, + va_list ap) override { + return target_->Logv(log_level, format, ap); + } + size_t GetLogFileSize() const override { return target_->GetLogFileSize(); } + void Flush() override { return target_->Flush(); } + InfoLogLevel GetInfoLogLevel() const override { + return target_->GetInfoLogLevel(); + } + void SetInfoLogLevel(const InfoLogLevel log_level) override { + return target_->SetInfoLogLevel(log_level); + } + + private: + Logger* target_; +}; + +// Returns a new environment that stores its data in memory and delegates +// all non-file-storage tasks to base_env. The caller must delete the result +// when it is no longer needed. +// *base_env must remain live while the result is in use. +Env* NewMemEnv(Env* base_env); + +// Returns a new environment that measures function call times for filesystem +// operations, reporting results to variables in PerfContext. +// This is a factory method for TimedEnv defined in utilities/env_timed.cc. +Env* NewTimedEnv(Env* base_env); + +// Returns an instance of logger that can be used for storing informational +// messages. +// This is a factory method for EnvLogger declared in logging/env_logging.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr<Logger>* result); + +// Creates a new Env based on Env::Default() but modified to use the specified +// FileSystem. +std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h new file mode 100644 index 000000000..282db6ed4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/env_encryption.h @@ -0,0 +1,465 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include <string> + +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class EncryptionProvider; + +struct ConfigOptions; + +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from disk. 
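+//
+// Illustrative sketch: wrapping the default Env with the test-only ROT13
+// cipher and a CTR provider (see BlockCipher::NewROT13Cipher and
+// EncryptionProvider::NewCTRProvider below).
+//
+//   std::shared_ptr<BlockCipher> cipher = BlockCipher::NewROT13Cipher(32);
+//   std::shared_ptr<EncryptionProvider> provider =
+//       EncryptionProvider::NewCTRProvider(cipher);
+//   Env* encrypted_env = NewEncryptedEnv(Env::Default(), provider);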
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider);
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider);
+
+// BlockAccessCipherStream is the base class for any cipher stream that
+// supports random access at block level (without requiring data from other
+// blocks). E.g. CTR (Counter operation mode) supports this requirement.
+class BlockAccessCipherStream {
+ public:
+  virtual ~BlockAccessCipherStream() {}
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+  // Decrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  virtual void AllocateScratch(std::string&) = 0;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize().
+  virtual Status EncryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize().
+  virtual Status DecryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+};
+
+// BlockCipher
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class BlockCipher : public Customizable {
+ public:
+  virtual ~BlockCipher() {}
+
+  // Creates a new BlockCipher from the input config_options and value.
+  // The value describes the type of cipher (and potentially optional
+  // configuration parameters) used to create this cipher.
+  // For example, if the value is "ROT13", a ROT13BlockCipher is created.
+  //
+  // @param config_options Options to control how this cipher is created
+  //                       and initialized.
+  // @param value The value might be:
+  //   - ROT13         Create a ROT13 Cipher
+  //   - ROT13:nn      Create a ROT13 Cipher with block size of nn
+  // @param result The new cipher object
+  // @return OK if the cipher was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<BlockCipher>* result);
+
+  static const char* Type() { return "BlockCipher"; }
+  // Short-cut method to create a ROT13 BlockCipher.
+  // This cipher is only suitable for test purposes and should not be used in
+  // production!!!
+  static std::shared_ptr<BlockCipher> NewROT13Cipher(size_t block_size);
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Encrypt(char* data) = 0;
+
+  // Decrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Decrypt(char* data) = 0;
+};
+
+// The encryption provider is used to create a cipher stream for a specific
+// file.
+// The returned cipher stream will be used for actual
+// encryption/decryption actions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class EncryptionProvider : public Customizable {
+ public:
+  virtual ~EncryptionProvider(){};
+
+  // Creates a new EncryptionProvider from the input config_options and value.
+  // The value describes the type of provider (and potentially optional
+  // configuration parameters) used to create this provider.
+  // For example, if the value is "CTR", a CTREncryptionProvider will be
+  // created. If the value ends with "://test" (e.g., "CTR://test"), the
+  // provider will be initialized in "TEST" mode prior to being returned.
+  //
+  // @param config_options Options to control how this provider is created
+  //                       and initialized.
+  // @param value The value might be:
+  //   - CTR          Create a CTR provider
+  //   - CTR://test   Create a CTR provider and initialize it for tests.
+  // @param result The new provider object
+  // @return OK if the provider was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<EncryptionProvider>* result);
+
+  static const char* Type() { return "EncryptionProvider"; }
+
+  // Short-cut method to create a CTR-provider
+  static std::shared_ptr<EncryptionProvider> NewCTRProvider(
+      const std::shared_ptr<BlockCipher>& cipher);
+
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file and used for storing encryption options. For optimal performance, the
+  // prefix length should be a multiple of the page size.
+  virtual size_t GetPrefixLength() const = 0;
+
+  // CreateNewPrefix initializes an allocated block of prefix memory
+  // for a new file.
+  virtual Status CreateNewPrefix(const std::string& fname, char* prefix,
+                                 size_t prefixLength) const = 0;
+
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param descriptor Descriptor for this key.
+  // @param cipher The cryptographic key to use
+  // @param len The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  //         otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+
+  // CreateCipherStream creates a block access cipher stream for a file, given
+  // its name and options.
+  virtual Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) = 0;
+
+  // Returns a string representing an encryption marker prefix for this
+  // provider. If a marker is provided, this marker can be used to tell whether
+  // or not a file is encrypted by this provider. The marker will also be part
+  // of any encryption prefix for this provider.
+  virtual std::string GetMarker() const { return ""; }
+};
+
+class EncryptedSequentialFile : public FSSequentialFile {
+ protected:
+  std::unique_ptr<FSSequentialFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  uint64_t offset_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. The given underlying sequential file is assumed to be at
+  // offset == prefixLength.
+  EncryptedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                          std::unique_ptr<BlockAccessCipherStream>&& s,
+                          size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        offset_(prefixLength),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine. Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Skip(uint64_t n) override;
+
+  // Indicates to the upper layers whether the current SequentialFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class EncryptedRandomAccessFile : public FSRandomAccessFile {
+ protected:
+  std::unique_ptr<FSRandomAccessFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+                            std::unique_ptr<BlockAccessCipherStream>&& s,
+                            size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
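+  //
+  // A plausible implementation shape (an illustrative sketch only; the actual
+  // definition lives in env_encryption.cc): shift the read past the
+  // encryption prefix, then decrypt in place using the shifted offset so
+  // block alignment is preserved:
+  //
+  //   offset += prefixLength_;  // skip the encryption prefix
+  //   IOStatus s = file_->Read(offset, n, options, result, scratch, dbg);
+  //   if (s.ok() && result->size() > 0) {
+  //     // decrypt in place (Status-to-IOStatus conversion omitted)
+  //     stream_->Decrypt(offset, const_cast<char*>(result->data()),
+  //                      result->size());
+  //   }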
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  void Hint(AccessPattern pattern) override;
+
+  // Indicates to the upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class EncryptedWritableFile : public FSWritableFile {
+ protected:
+  std::unique_ptr<FSWritableFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. Prefix is assumed to be written already.
+  EncryptedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+
+  using FSWritableFile::PositionedAppend;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  bool IsSyncThreadSafe() const override;
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+  // with other writes to follow.
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  void SetPreallocationBlockSize(size_t size) override;
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override;
+
+  // Pre-allocates space for a file.
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+// A file abstraction for random reading and writing.
+class EncryptedRandomRWFile : public FSRandomRWFile {
+ protected:
+  std::unique_ptr<FSRandomRWFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Indicates if the class makes use of direct I/O.
+  // If true, you must pass aligned buffers to Write().
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` buffer must be at least `n` bytes.
+  // Returns Status::OK() on success.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class EncryptedFileSystem : public FileSystemWrapper {
+ public:
+  explicit EncryptedFileSystem(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param descriptor Descriptor for this key.
+  // @param cipher The cryptographic key to use
+  // @param len The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  //         otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+  static const char* kClassName() { return "EncryptedFileSystem"; }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return FileSystemWrapper::IsInstanceOf(name);
+    }
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h
new file mode 100644
index 000000000..b59395255
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/experimental.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level, skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above conditions is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+                 int target_level = 1);
+
+struct UpdateManifestForFilesStateOptions {
+  // When true, read current file temperatures from FileSystem and update in
+  // DB manifest when a temperature other than Unknown is reported and
+  // inconsistent with manifest.
+  bool update_temperatures = true;
+
+  // TODO: new_checksums: to update files to latest file checksum algorithm
+};
+
+// Utility for updating manifest of DB directory (not open) for current state
+// of files on filesystem. See UpdateManifestForFilesStateOptions.
+//
+// To minimize interference with ongoing DB operations, only the following
+// guarantee is provided, assuming no IO error is encountered:
+// * Only files live in DB at start AND end of call to
+//   UpdateManifestForFilesState() are guaranteed to be updated (as needed) in
+//   manifest.
+//   * For example, new files after start of call to
+//     UpdateManifestForFilesState() might not be updated, but that is not
+//     typically required to achieve goal of manifest consistency/completeness
+//     (because current DB configuration would ensure new files get the desired
+//     consistent metadata).
+Status UpdateManifestForFilesState(
+    const DBOptions& db_opts, const std::string& db_name,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    const UpdateManifestForFilesStateOptions& opts = {});
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h
new file mode 100644
index 000000000..758bae4ac
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_checksum.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The unknown file checksum.
+constexpr char kUnknownFileChecksum[] = "";
+// The unknown sst file checksum function name.
+constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// The standard DB file checksum function name.
+// This is the name of the checksum function returned by
+// GetFileChecksumGenCrc32cFactory();
+constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+
+struct FileChecksumGenContext {
+  std::string file_name;
+  // The name of the requested checksum generator.
+  // Checksum factories may use or ignore requested_checksum_func_name,
+  // and checksum factories written before this field was available are still
+  // compatible.
+  std::string requested_checksum_func_name;
+};
+
+// FileChecksumGenerator is the class that generates the checksum value
+// for each file when the file is written to the file system.
+// Implementations may assume that
+// * Finalize is called at most once during the life of the object
+// * All calls to Update come before Finalize
+// * All calls to GetChecksum come after Finalize
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenerator {
+ public:
+  virtual ~FileChecksumGenerator() {}
+
+  // Update the current result after processing the data. For different
+  // checksum functions, intermediate results may be stored and used in Update
+  // to incorporate the new data.
+  virtual void Update(const char* data, size_t n) = 0;
+
+  // Generate the final result once no further data will be added.
+  virtual void Finalize() = 0;
+
+  // Get the checksum.
+  // The result should not be the empty string and may
+  // include arbitrary bytes, including non-printable characters.
+  virtual std::string GetChecksum() const = 0;
+
+  // Returns a name that identifies the current file checksum function.
+  virtual const char* Name() const = 0;
+};
+
+// Create the FileChecksumGenerator object for each SST file.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenFactory : public Customizable {
+ public:
+  ~FileChecksumGenFactory() override {}
+  static const char* Type() { return "FileChecksumGenFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<FileChecksumGenFactory>* result);
+
+  // Create a new FileChecksumGenerator.
+  virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) = 0;
+
+  // Return the name of this FileChecksumGenFactory.
+  const char* Name() const override = 0;
+};
+
+// FileChecksumList stores the checksum information of a list of files (e.g.,
+// SST files). The FileChecksumList can be used to store the checksum
+// information of all SST files obtained from the MANIFEST, i.e., the checksum
+// information of all valid SST files of a DB instance. It can also be used to
+// store the checksum information of a list of SST files to be ingested.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumList {
+ public:
+  virtual ~FileChecksumList() {}
+
+  // Clear the previously stored file checksum information.
+  virtual void reset() = 0;
+
+  // Get the number of checksums in the checksum list.
+  virtual size_t size() const = 0;
+
+  // Return all the stored file checksum information, as parallel vectors.
+  // File numbers are the keys; for each file, the corresponding checksum
+  // value and checksum function name are returned.
+  virtual Status GetAllFileChecksums(
+      std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+      std::vector<std::string>* checksum_func_names) = 0;
+
+  // Given the file_number, searches whether its file checksum information is
+  // stored.
+  virtual Status SearchOneFileChecksum(uint64_t file_number,
+                                       std::string* checksum,
+                                       std::string* checksum_func_name) = 0;
+
+  // Insert the checksum information of one file to the FileChecksumList.
+  virtual Status InsertOneFileChecksum(
+      uint64_t file_number, const std::string& checksum,
+      const std::string& checksum_func_name) = 0;
+
+  // Remove the checksum information of one SST file.
+  virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0;
+};
+
+// Create a new file checksum list.
+extern FileChecksumList* NewFileChecksumList();
+
+// Return a shared_ptr of the builtin Crc32c based file checksum generator
+// factory object, which can be shared to create the Crc32c based checksum
+// generator object.
+// Note: this implementation is compatible with many other crc32c checksum
+// implementations and uses big-endian encoding of the result, unlike most
+// other crc32c checksums in RocksDB, which alter the result with
+// crc32c::Mask and use little-endian encoding.
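+//
+// For illustration, a minimal sketch of enabling whole-file checksums in a
+// DB via the public DBOptions::file_checksum_gen_factory option:
+//
+//   Options options;
+//   options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+//   // SST files written from now on carry a crc32c whole-file checksum
+//   // that is recorded in the MANIFEST.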
+extern std::shared_ptr<FileChecksumGenFactory>
+GetFileChecksumGenCrc32cFactory();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h
new file mode 100644
index 000000000..91ad47218
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_system.h
@@ -0,0 +1,1849 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A FileSystem is an interface used by the rocksdb implementation to access
+// storage functionality like the filesystem etc. Callers
+// may wish to provide a custom FileSystem object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All FileSystem implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+//
+// WARNING: Since this is a new interface, it is expected that there will be
+// some changes as storage systems are ported over.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <chrono>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileLock;
+class FSDirectory;
+class FSRandomAccessFile;
+class FSRandomRWFile;
+class FSSequentialFile;
+class FSWritableFile;
+class Logger;
+class Slice;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+struct ConfigOptions;
+
+using AccessPattern = RandomAccessFile::AccessPattern;
+using FileAttributes = Env::FileAttributes;
+
+// DEPRECATED
+// Priority of an IO request. This is a hint and does not guarantee any
+// particular QoS.
+// IO_LOW - Typically background reads/writes such as compaction/flush
+// IO_HIGH - Typically user reads/synchronous WAL writes
+enum class IOPriority : uint8_t {
+  kIOLow,
+  kIOHigh,
+  kIOTotal,
+};
+
+// Type of the data being read/written. It can be passed down as a flag
+// for the FileSystem implementation to optionally handle different types in
+// different ways
+enum class IOType : uint8_t {
+  kData,
+  kFilter,
+  kIndex,
+  kMetadata,
+  kWAL,
+  kManifest,
+  kLog,
+  kUnknown,
+  kInvalid,
+};
+
+// Per-request options that can be passed down to the FileSystem
+// implementation. These are hints and are not necessarily guaranteed to be
+// honored. More hints can be added here in the future to indicate things like
+// storage media (HDD/SSD) to be used, replication level etc.
+struct IOOptions {
+  // Timeout for the operation in microseconds
+  std::chrono::microseconds timeout;
+
+  // DEPRECATED
+  // Priority - high or low
+  IOPriority prio;
+
+  // Priority used to charge rate limiter configured in file system level (if
+  // any)
+  // Limitation: right now RocksDB internal does not consider this
+  // rate_limiter_priority
+  Env::IOPriority rate_limiter_priority;
+
+  // Type of data being read/written
+  IOType type;
+
+  // EXPERIMENTAL
+  // An option map that's opaque to RocksDB. It can be used to implement a
+  // custom contract between a FileSystem user and the provider.
+  // This is only useful in cases where a RocksDB user directly uses the
+  // FileSystem or file object for their own purposes, and wants to pass extra
+  // options to APIs such as NewRandomAccessFile and NewWritableFile.
+  std::unordered_map<std::string, std::string> property_bag;
+
+  // Force directory fsync. Some file systems, like btrfs, may skip directory
+  // fsync; set this to force the fsync.
+  bool force_dir_fsync;
+
+  // Can be used by underlying file systems to skip recursing through sub
+  // directories and list only files in GetChildren API.
+  bool do_not_recurse;
+
+  IOOptions() : IOOptions(false) {}
+
+  explicit IOOptions(bool force_dir_fsync_)
+      : timeout(std::chrono::microseconds::zero()),
+        prio(IOPriority::kIOLow),
+        rate_limiter_priority(Env::IO_TOTAL),
+        type(IOType::kUnknown),
+        force_dir_fsync(force_dir_fsync_),
+        do_not_recurse(false) {}
+};
+
+struct DirFsyncOptions {
+  enum FsyncReason : uint8_t {
+    kNewFileSynced,
+    kFileRenamed,
+    kDirRenamed,
+    kFileDeleted,
+    kDefault,
+  } reason;
+
+  std::string renamed_new_name;  // for kFileRenamed
+  // add other options for other FsyncReason
+
+  DirFsyncOptions();
+
+  explicit DirFsyncOptions(std::string file_renamed_new_name);
+
+  explicit DirFsyncOptions(FsyncReason fsync_reason);
+};
+
+// File scope options that control how a file is opened/created and accessed
+// while it's open. We may add more options here in the future such as
+// redundancy level, media to use etc.
+struct FileOptions : EnvOptions {
+  // Embedded IOOptions to control the parameters for any IOs that need
+  // to be issued for the file open/creation
+  IOOptions io_options;
+
+  // EXPERIMENTAL
+  // The feature is in development and is subject to change.
+  // When creating a new file, set the temperature of the file so that
+  // underlying file systems can put it with appropriate storage media and/or
+  // coding.
+  Temperature temperature = Temperature::kUnknown;
+
+  // The checksum type that is used to calculate the checksum value for
+  // handoff during file writes.
+  ChecksumType handoff_checksum_type;
+
+  FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const DBOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const EnvOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const FileOptions& opts)
+      : EnvOptions(opts),
+        io_options(opts.io_options),
+        temperature(opts.temperature),
+        handoff_checksum_type(opts.handoff_checksum_type) {}
+
+  FileOptions& operator=(const FileOptions&) = default;
+};
+
+// A structure to pass back some debugging information from the FileSystem
+// implementation to RocksDB in case of an IO error
+struct IODebugContext {
+  // file_path to be filled in by RocksDB in case of an error
+  std::string file_path;
+
+  // A map of counter names to values - set by the FileSystem implementation
+  std::map<std::string, uint64_t> counters;
+
+  // To be set by the FileSystem implementation
+  std::string msg;
+
+  // To be set by the underlying FileSystem implementation.
+  std::string request_id;
+
+  // In order to log the required information in IO tracing for different
+  // operations, each bit in trace_data indicates which corresponding info from
+  // IODebugContext will be added to the trace. For example, if trace_data = 1,
+  // the bit at position 0 is set, so TraceData::kRequestID (request_id) will
+  // be logged in the trace record.
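+  //
+  // For illustration, a sketch of what a FileSystem implementation might do
+  // when it has a request id to report (my_request_id is a hypothetical
+  // variable):
+  //
+  //   if (dbg != nullptr) {
+  //     dbg->SetRequestId(my_request_id);  // also sets the kRequestID bit
+  //   }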
+  //
+  enum TraceData : char {
+    // The value of each enum represents the bitwise position for
+    // that information in trace_data which will be used by IOTracer for
+    // tracing. Make sure to add them sequentially.
+    kRequestID = 0,
+  };
+  uint64_t trace_data = 0;
+
+  IODebugContext() {}
+
+  void AddCounter(std::string& name, uint64_t value) {
+    counters.emplace(name, value);
+  }
+
+  // Called by underlying file system to set request_id and log request_id in
+  // IOTracing.
+  void SetRequestId(const std::string& _request_id) {
+    request_id = _request_id;
+    trace_data |= (1 << TraceData::kRequestID);
+  }
+
+  std::string ToString() {
+    std::ostringstream ss;
+    ss << file_path << ", ";
+    for (auto counter : counters) {
+      ss << counter.first << " = " << counter.second << ",";
+    }
+    ss << msg;
+    return ss.str();
+  }
+};
+
+// A function pointer type for custom destruction of void pointer passed to
+// ReadAsync API. RocksDB/caller is responsible for deleting the void pointer
+// allocated by FS in ReadAsync API.
+using IOHandleDeleter = std::function<void(void*)>;
+
+// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
+// FSRandomRWFile, and FSDirectory classes define the interface between
+// RocksDB and storage systems, such as Posix filesystems,
+// remote filesystems etc.
+// The interface allows for fine grained control of individual IO operations,
+// such as setting a timeout, prioritization, hints on data placement,
+// different handling based on type of IO etc.
+// This is accomplished by passing an instance of IOOptions to every
+// API call that can potentially perform IO. Additionally, each such API is
+// passed a pointer to a IODebugContext structure that can be used by the
+// storage system to include troubleshooting information. The return values
+// of the APIs are of type IOStatus, which can indicate an error code/sub-code,
+// as well as metadata about the error such as its scope and whether it's
+// retryable.
+// NewCompositeEnv can be used to create an Env with a custom FileSystem for
+// DBOptions::env.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileSystem : public Customizable {
+ public:
+  FileSystem();
+
+  // No copying allowed
+  FileSystem(const FileSystem&) = delete;
+
+  virtual ~FileSystem();
+
+  static const char* Type() { return "FileSystem"; }
+  static const char* kDefaultName() { return "DefaultFileSystem"; }
+
+  // Loads the FileSystem specified by the input value into the result
+  // The CreateFromString alternative should be used; this method may be
+  // deprecated in a future release.
+  static Status Load(const std::string& value,
+                     std::shared_ptr<FileSystem>* result);
+
+  // Loads the FileSystem specified by the input value into the result
+  // @see Customizable for a more detailed description of the parameters and
+  // return codes
+  // @param config_options Controls how the FileSystem is loaded
+  // @param value The name and optional properties describing the file system
+  //       to load.
+  // @param result On success, returns the loaded FileSystem
+  // @return OK if the FileSystem was successfully loaded.
+  // @return not-OK if the load failed.
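+  //
+  // For illustration, a minimal sketch of loading the built-in default file
+  // system by name and wrapping it in an Env:
+  //
+  //   ConfigOptions config_options;
+  //   std::shared_ptr<FileSystem> fs;
+  //   Status s = FileSystem::CreateFromString(
+  //       config_options, FileSystem::kDefaultName(), &fs);
+  //   if (s.ok()) {
+  //     std::unique_ptr<Env> env = NewCompositeEnv(fs);
+  //   }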
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<FileSystem>* result);
+
+  // Return a default FileSystem suitable for the current operating
+  // system.
+  static std::shared_ptr<FileSystem> Default();
+
+  // Handles the event when a new DB or a new ColumnFamily starts using the
+  // specified data paths.
+  //
+  // The data paths might be shared by different DBs or ColumnFamilies,
+  // so RegisterDbPaths might be called with the same data paths.
+  // For example, when CreateColumnFamily is called multiple times with the same
+  // data path, RegisterDbPaths will also be called with the same data path.
+  //
+  // If the return status is ok, then the same paths must eventually be passed
+  // to UnregisterDbPaths;
+  // otherwise this method should have no side effect, and UnregisterDbPaths
+  // does not need to be called for the paths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+  // Handles the event when a DB or a ColumnFamily stops using the specified
+  // data paths.
+  //
+  // It should be called corresponding to each successful RegisterDbPaths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK. If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewSequentialFile(const std::string& fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSSequentialFile>* result,
+                                     IODebugContext* dbg) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK. If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual IOStatus NewRandomAccessFile(
+      const std::string& fname, const FileOptions& file_opts,
+      std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) = 0;
+  // These values match Linux definition
+  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+  enum WriteLifeTimeHint {
+    kWLTHNotSet = 0,  // No hint information set
+    kWLTHNone,        // No hints about write life time
+    kWLTHShort,       // Data written has a short life time
+    kWLTHMedium,      // Data written has a medium life time
+    kWLTHLong,        // Data written has a long life time
+    kWLTHExtreme,     // Data written has an extremely long life time
+  };
+
+  // Create an object that writes to a new file with the specified
+  // name. Deletes any existing file with the same name and creates a
+  // new file. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
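+  //
+  // For illustration, a minimal sketch (the file name is hypothetical):
+  //
+  //   std::unique_ptr<FSWritableFile> file;
+  //   IOStatus s = fs->NewWritableFile("/tmp/example.log", FileOptions(),
+  //                                    &file, /*dbg=*/nullptr);
+  //   if (s.ok()) {
+  //     s = file->Append(Slice("payload"), IOOptions(), /*dbg=*/nullptr);
+  //   }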
+  virtual IOStatus NewWritableFile(const std::string& fname,
+                                   const FileOptions& file_opts,
+                                   std::unique_ptr<FSWritableFile>* result,
+                                   IODebugContext* dbg) = 0;
+
+  // Create an object that writes to a file with the specified name.
+  // `FSWritableFile::Append()`s will append after any existing content. If the
+  // file does not already exist, creates it.
+  //
+  // On success, stores a pointer to the file in *result and returns OK. On
+  // failure stores nullptr in *result and returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus ReopenWritableFile(
+      const std::string& /*fname*/, const FileOptions& /*options*/,
+      std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("ReopenWritableFile");
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  virtual IOStatus ReuseWritableFile(const std::string& fname,
+                                     const std::string& old_fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSWritableFile>* result,
+                                     IODebugContext* dbg);
+
+  // Open `fname` for random read and write. If the file doesn't exist, it
+  // will be created. On success, stores a pointer to the new file in
+  // *result and returns OK. On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
+                                   const FileOptions& /*options*/,
+                                   std::unique_ptr<FSRandomRWFile>* /*result*/,
+                                   IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "RandomRWFile is not implemented in this FileSystem");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place updates
+  // only, i.e., no appends). On success, stores a raw buffer covering the whole
+  // file in `*result`. The file must exist prior to this call.
+  virtual IOStatus NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return IOStatus::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this FileSystem");
+  }
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual IOStatus NewDirectory(const std::string& name,
+                                const IOOptions& io_opts,
+                                std::unique_ptr<FSDirectory>* result,
+                                IODebugContext* dbg) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus FileExists(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                               std::vector<std::string>* result,
+                               IODebugContext* dbg) = 0;
+
+  // Store in *result the attributes of the children of the specified
+  // directory.
+  // In case the implementation lists the directory prior to iterating the
+  // files and files are concurrently deleted, the deleted files will be
+  // omitted from result.
+  // The name attributes are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildrenFileAttributes(
+      const std::string& dir, const IOOptions& options,
+      std::vector<FileAttributes>* result, IODebugContext* dbg) {
+    assert(result != nullptr);
+    std::vector<std::string> child_fnames;
+    IOStatus s = GetChildren(dir, options, &child_fnames, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+    result->resize(child_fnames.size());
+    size_t result_size = 0;
+    for (size_t i = 0; i < child_fnames.size(); ++i) {
+      const std::string path = dir + "/" + child_fnames[i];
+      if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes,
+                            dbg))
+               .ok()) {
+        if (FileExists(path, options, dbg).IsNotFound()) {
+          // The file may have been deleted since we listed the directory
+          continue;
+        }
+        return s;
+      }
+      (*result)[result_size].name = std::move(child_fnames[i]);
+      result_size++;
+    }
+    result->resize(result_size);
+    return IOStatus::OK();
+  }
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+  // Delete the named file.
+  virtual IOStatus DeleteFile(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Truncate is not supported for this FileSystem");
+  }
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual IOStatus CreateDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Creates a directory if it is missing. Returns OK if the directory exists,
+  // or if it was successfully created.
+  virtual IOStatus CreateDirIfMissing(const std::string& dirname,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) = 0;
+
+  // Delete the specified directory.
+  virtual IOStatus DeleteDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual IOStatus GetFileSize(const std::string& fname,
+                               const IOOptions& options, uint64_t* file_size,
+                               IODebugContext* dbg) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual IOStatus GetFileModificationTime(const std::string& fname,
+                                           const IOOptions& options,
+                                           uint64_t* file_mtime,
+                                           IODebugContext* dbg) = 0;
+  // Rename file src to target.
+  virtual IOStatus RenameFile(const std::string& src, const std::string& target,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Hard link file src to target.
+  virtual IOStatus LinkFile(const std::string& /*src*/,
+                            const std::string& /*target*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "LinkFile is not supported for this FileSystem");
+  }
+
+  virtual IOStatus NumFileLinks(const std::string& /*fname*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*count*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Getting number of file links is not supported for this FileSystem");
+  }
+
+  virtual IOStatus AreFilesSame(const std::string& /*first*/,
+                                const std::string& /*second*/,
+                                const IOOptions& /*options*/, bool* /*res*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "AreFilesSame is not supported for this FileSystem");
+  }
+
+  // Lock the specified file. Used to prevent concurrent access to
+  // the same db by multiple processes. On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK. The caller should call
+  // UnlockFile(*lock) to release the lock. If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure. I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                            FileLock** lock, IODebugContext* dbg) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                                    IODebugContext* dbg) = 0;
+
+  // Creates and returns a default logger (an instance of EnvLogger) for
+  // storing informational messages. Derived classes can override to provide a
+  // custom logger.
+  virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                             std::shared_ptr<Logger>* result,
+                             IODebugContext* dbg);
+
+  // Get full directory name for this db.
+  virtual IOStatus GetAbsolutePath(const std::string& db_path,
+                                   const IOOptions& options,
+                                   std::string* output_path,
+                                   IODebugContext* dbg) = 0;
+
+  // Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions
+  // copy constructor
+  virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {}
+
+  // OptimizeForLogRead will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for reading log files.
+  virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
+
+  // OptimizeForManifestRead will create a new FileOptions object that is a copy
+  // of the FileOptions in the parameters, but is optimized for reading manifest
+  // files.
+  virtual FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const;
+
+  // OptimizeForLogWrite will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                          const DBOptions& db_options) const;
+
+  // OptimizeForManifestWrite will create a new FileOptions object that is a
+  // copy of the FileOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // writing table files.
+  virtual FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading table files.
+  virtual FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading blob files.
+  virtual FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+
+  // Get the amount of free disk space
+  virtual IOStatus GetFreeSpace(const std::string& /*path*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*diskfree*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("GetFreeSpace");
+  }
+
+  virtual IOStatus IsDirectory(const std::string& /*path*/,
+                               const IOOptions& options, bool* is_dir,
+                               IODebugContext* /*dbg*/) = 0;
+
+  // EXPERIMENTAL
+  // Poll for completion of read IO requests. The Poll() method should call the
+  // callback functions to indicate completion of read requests.
+  // Underlying FS is required to support Poll API. Poll implementation should
+  // ensure that the callback gets called at IO completion, and return only
+  // after the callback has been called.
+  // If Poll returns partial results for any reads, it is the caller's
+  // responsibility to call Read or ReadAsync in order to get the remaining
+  // bytes.
+  //
+  // Default implementation is to return IOStatus::OK.
+
+  virtual IOStatus Poll(std::vector<void*>& /*io_handles*/,
+                        size_t /*min_completions*/) {
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // Abort the read IO requests submitted asynchronously. Underlying FS is
+  // required to support AbortIO API. AbortIO implementation should ensure that
+  // all the read requests related to io_handles are aborted and that the
+  // callback is not called for these io_handles.
+  //
+  // Default implementation is to return IOStatus::OK.
+  virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
+    return IOStatus::OK();
+  }
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ private:
+  void operator=(const FileSystem&);
+};
+
+// A file abstraction for reading sequentially through a file
+class FSSequentialFile {
+ public:
+  FSSequentialFile() {}
+
+  virtual ~FSSequentialFile() {}
+
+  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+  // written by this routine.
Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // + // REQUIRES: External synchronization + virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual IOStatus Skip(uint64_t n) = 0; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + Slice* /*result*/, char* /*scratch*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedRead"); + } + + // EXPERIMENTAL + // When available, returns the actual temperature for the file. This is + // useful in case some outside process moves a file from one tier to another, + // though the temperature is generally expected not to change while a file is + // open. + virtual Temperature GetTemperature() const { return Temperature::kUnknown; } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. +}; + +// A read IO request structure for use in MultiRead and asynchronous Read APIs. +struct FSReadRequest { + // Input parameter that represents the file offset in bytes. + uint64_t offset; + + // Input parameter that represents the length to read in bytes. `result` only + // returns fewer bytes if end of file is hit (or `status` is not OK). + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer. + // The lifecycle of scratch will be until IO is completed. + // + // In case of asynchronous reads, its an output parameter and it will be + // maintained until callback has been called. Scratch is allocated by RocksDB + // and will be passed to underlying FileSystem. + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + // + // In case of asynchronous reads, this output parameter is set by Async Read + // APIs to point to the data buffer, and + // the number of valid bytes. + // Slice result should point to scratch i.e the data should + // always be read into scratch. 
+  Slice result;
+
+  // Output parameter set by underlying FileSystem that represents status of
+  // read request.
+  IOStatus status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class FSRandomAccessFile {
+ public:
+  FSRandomAccessFile() {}
+
+  virtual ~FSRandomAccessFile() {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read). May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used. If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  // If it's not implemented (default: `NotSupported`), RocksDB will create
+  // an internal prefetch buffer to improve read performance.
+  virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("Prefetch");
+  }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e. it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping but can be in any order. If the function's return Status
+  // is not ok, the status of individual requests will be ignored and the
+  // return status will be assumed for all read requests. The function return
+  // status is only meant for errors that occur before processing individual
+  // read requests.
+  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                             const IOOptions& options, IODebugContext* dbg) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      FSReadRequest& req = reqs[i];
+      req.status =
+          Read(req.offset, req.len, options, &req.result, req.scratch, dbg);
+    }
+    return IOStatus::OK();
+  }
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed };
+
+  virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates to upper layers whether the current RandomAccessFile
+  // implementation uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O.
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // EXPERIMENTAL
+  // This API reads the requested data in FSReadRequest asynchronously. This is
+  // an asynchronous call, i.e. it should return after submitting the request.
+  //
+  // When the read request is completed, the callback function specified in cb
+  // should be called with arguments cb_arg and the result populated in
+  // FSReadRequest, with its result and status fields updated by the
+  // FileSystem. cb_arg should be used by the callback to track the original
+  // request submitted.
+  //
+  // This API should also populate io_handle, which should be used by the
+  // underlying FileSystem to store the context in order to distinguish the
+  // read requests at their side, and provide the custom deletion function in
+  // del_fn. RocksDB guarantees that the del_fn for io_handle will be called
+  // after receiving the callback. Furthermore, RocksDB guarantees that if it
+  // calls the Poll API for this io_handle, del_fn will be called after the
+  // Poll returns. RocksDB is responsible for managing the lifetime of
+  // io_handle.
+  //
+  // req contains the request offset and size passed as input parameters of
+  // the read request; the result and status fields are output parameters set
+  // by the underlying FileSystem. The data should always be read into the
+  // scratch field.
+  //
+  // The default implementation is to read the data synchronously.
+  virtual IOStatus ReadAsync(
+      FSReadRequest& req, const IOOptions& opts,
+      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+      void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) {
+    req.status =
+        Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg);
+    cb(req, cb_arg);
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A data structure that carries data verification information, used together
+// with the data being written to a file.
+struct DataVerificationInfo {
+  // checksum of the data being written.
+  Slice checksum;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
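+//
+// A minimal write path, as an illustrative sketch only (error handling
+// elided; `wf` is assumed to be a file opened via
+// FileSystem::NewWritableFile):
+//
+//   IOStatus s = wf->Append(Slice("payload"), IOOptions(), nullptr /*dbg*/);
+//   if (s.ok()) s = wf->Flush(IOOptions(), nullptr);
+//   if (s.ok()) s = wf->Sync(IOOptions(), nullptr);
+//   if (s.ok()) s = wf->Close(IOOptions(), nullptr);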
+class FSWritableFile {
+ public:
+  FSWritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit FSWritableFile(const FileOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+
+  virtual ~FSWritableFile() {}
+
+  // Append data to the end of the file.
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          IODebugContext* dbg) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates a crc32c based checksum for
+  // file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) {
+    return Append(data, options, dbg);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence have to always start from the start of
+  // the sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionedAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // to the nearest sector/page and rewrite the portion of it with whatever
+  // we need to add. We need to keep track of where we stopped writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have
+  // to pad with zeros for the last write and trim the file when closing
+  // according to the position we kept in the previous step.
+  //
+  // PositionedAppend() requires an aligned buffer to be passed in. The
+  // alignment required is queried via GetRequiredBufferAlignment().
+  virtual IOStatus PositionedAppend(const Slice& /* data */,
+                                    uint64_t /* offset */,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates a crc32c based checksum for
+  // file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
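+  //
+  // As an illustrative sketch, an implementation that honors kCRC32C handoff
+  // might verify the payload before issuing the write; `VerifyCrc32c` below
+  // is an assumed helper, not part of this header:
+  //
+  //   if (!VerifyCrc32c(verification_info.checksum, data)) {
+  //     return IOStatus::Corruption("checksum handoff mismatch");
+  //   }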
+  virtual IOStatus PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const IOOptions& /*options*/,
+      const DataVerificationInfo& /* verification_info */,
+      IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+  // with other writes to follow.
+  virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+  virtual IOStatus Sync(const IOOptions& options,
+                        IODebugContext* dbg) = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  // True if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates to upper layers whether the current WritableFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O.
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used
+   * as a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize(const IOOptions& /*options*/,
+                               IODebugContext* /*dbg*/) {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file. If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // The default implementation does nothing.
+  virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                             const IOOptions& options, IODebugContext* dbg) {
+    if (strict_bytes_per_sync_) {
+      return Sync(options, dbg);
+    }
+    return IOStatus::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs. This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                            IODebugContext* dbg) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks, options, dbg)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  FSWritableFile(const FSWritableFile&);
+  void operator=(const FSWritableFile&);
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class FSRandomRWFile {
+ public:
+  FSRandomRWFile() {}
+
+  virtual ~FSRandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O.
+  // If true, you must pass an aligned buffer to Write().
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate an
+  // aligned buffer for Direct I/O.
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at offset `offset`. Returns Status::OK() on success.
+  // Pass an aligned buffer when use_direct_io() returns true.
+  virtual IOStatus Write(uint64_t offset, const Slice& data,
+                         const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result; the provided `scratch` must be at least `n` bytes.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+
+  // No copying allowed
+  FSRandomRWFile(const FSRandomRWFile&) = delete;
+  FSRandomRWFile& operator=(const FSRandomRWFile&) = delete;
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class FSMemoryMappedFileBuffer {
+ public:
+  FSMemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~FSMemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice, so copying is disallowed. This class
+  // could be made movable if desired.
+  FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete;
+  FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents a collection of files and implements
+// filesystem operations that can be executed on directories.
+class FSDirectory {
+ public:
+  virtual ~FSDirectory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // FsyncWithDirOptions after renaming a file. Depending on the filesystem,
+  // it may fsync the directory or just the renamed file (e.g. btrfs). By
+  // default, it just calls directory fsync.
+  virtual IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& /*dir_fsync_options*/) {
+    return Fsync(options, dbg);
+  }
+
+  // Close directory
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("Close");
+  }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
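+// For example, a FileSystem-level wrapper that overrides a single operation
+// and forwards the rest might look like this (illustrative sketch, using the
+// FileSystemWrapper defined below):
+//
+//   class LoggingFS : public ROCKSDB_NAMESPACE::FileSystemWrapper {
+//    public:
+//     using FileSystemWrapper::FileSystemWrapper;
+//     const char* Name() const override { return "LoggingFS"; }
+//     IOStatus DeleteFile(const std::string& f, const IOOptions& opts,
+//                         IODebugContext* dbg) override {
+//       // ... record `f` somewhere ...
+//       return FileSystemWrapper::DeleteFile(f, opts, dbg);
+//     }
+//   };
+//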
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+//     ROCKSDB_NAMESPACE::FSSequentialFileWrapper {
+//  public:
+//   MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target):
+//     ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {}
+//   IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+//                 char* scratch, IODebugContext* dbg) override {
+//     cout << "Doing a read of size " << n << "!" << endl;
+//     return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options,
+//                                                             result,
+//                                                             scratch, dbg);
+//   }
+//   // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+//     forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+//     rocksdb class. Unless you actually want to override the behavior.
+//     (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of FileSystem that forwards all calls to another
+// FileSystem. May be useful to clients who wish to override just part of the
+// functionality of another FileSystem.
+class FileSystemWrapper : public FileSystem {
+ public:
+  // Initialize a FileSystemWrapper that delegates all calls to *t
+  explicit FileSystemWrapper(const std::shared_ptr<FileSystem>& t);
+  ~FileSystemWrapper() override {}
+
+  // Return the target to which this FileSystem forwards all calls
+  FileSystem* target() const { return target_.get(); }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->NewSequentialFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override {
+    return target_->NewRandomAccessFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override {
+    return target_->NewWritableFile(f, file_opts, r, dbg);
+  }
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    return target_->ReopenWritableFile(fname, file_opts, result, dbg);
+  }
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg);
+  }
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    return target_->NewRandomRWFile(fname, file_opts, result, dbg);
+  }
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return target_->NewMemoryMappedFileBuffer(fname, result);
+  }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override {
+    return target_->NewDirectory(name, io_opts, result, dbg);
+  }
+  IOStatus FileExists(const std::string& f, const
IOOptions& io_opts, + IODebugContext* dbg) override { + return target_->FileExists(f, io_opts, dbg); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector<std::string>* r, + IODebugContext* dbg) override { + return target_->GetChildren(dir, io_opts, r, dbg); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector<FileAttributes>* result, + IODebugContext* dbg) override { + return target_->GetChildrenFileAttributes(dir, options, result, dbg); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteFile(f, options, dbg); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override { + return target_->Truncate(fname, size, options, dbg); + } + IOStatus CreateDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDir(d, options, dbg); + } + IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDirIfMissing(d, options, dbg); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteDir(d, options, dbg); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& options, + uint64_t* s, IODebugContext* dbg) override { + return target_->GetFileSize(f, options, s, dbg); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override { + return target_->GetFileModificationTime(fname, options, file_mtime, dbg); + } + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override { + return target_->GetAbsolutePath(db_path, options, output_path, dbg); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->RenameFile(s, t, options, dbg); + } + + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->LinkFile(s, t, options, dbg); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& options, + uint64_t* count, IODebugContext* dbg) override { + return target_->NumFileLinks(fname, options, count, dbg); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& options, bool* res, + IODebugContext* dbg) override { + return target_->AreFilesSame(first, second, options, res, dbg); + } + + IOStatus LockFile(const std::string& f, const IOOptions& options, + FileLock** l, IODebugContext* dbg) override { + return target_->LockFile(f, options, l, dbg); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& options, + IODebugContext* dbg) override { + return target_->UnlockFile(l, options, dbg); + } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override { + return target_->GetTestDirectory(options, path, dbg); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) override { + return target_->NewLogger(fname, options, result, dbg); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeFileOptions(opts); + } + + 
FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, + uint64_t* diskfree, IODebugContext* dbg) override { + return target_->GetFreeSpace(path, options, diskfree, dbg); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override { + return target_->IsDirectory(path, options, is_dir, dbg); + } + + const Customizable* Inner() const override { return target_.get(); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + + virtual IOStatus Poll(std::vector<void*>& io_handles, + size_t min_completions) override { + return target_->Poll(io_handles, min_completions); + } + + virtual IOStatus AbortIO(std::vector<void*>& io_handles) override { + return target_->AbortIO(io_handles); + } + + protected: + std::shared_ptr<FileSystem> target_; +}; + +class FSSequentialFileWrapper : public FSSequentialFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {} + + FSSequentialFile* target() const { return target_; } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + return target_->Read(n, options, result, scratch, dbg); + } + IOStatus Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override { + return target_->PositionedRead(offset, n, options, result, scratch, dbg); + } + Temperature GetTemperature() const override { + return target_->GetTemperature(); + } + + private: + FSSequentialFile* target_; +}; + +class FSSequentialFileOwnerWrapper : public 
FSSequentialFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSSequentialFileOwnerWrapper(std::unique_ptr<FSSequentialFile>&& t)
+      : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSSequentialFile> guard_;
+};
+
+class FSRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {}
+
+  FSRandomAccessFile* target() const { return target_; }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return target_->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override {
+    return target_->MultiRead(reqs, num_reqs, options, dbg);
+  }
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Prefetch(offset, n, options, dbg);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override {
+    return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  FSRandomAccessFile* target_;
+};
+
+class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSRandomAccessFileOwnerWrapper(
+      std::unique_ptr<FSRandomAccessFile>&& t)
+      : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> guard_;
+};
+
+class FSWritableFileWrapper : public FSWritableFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {}
+
+  FSWritableFile* target() const { return target_; }
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, dbg);
+  }
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, verification_info, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo&
verification_info, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, verification_info, + dbg); + } + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Truncate(size, options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + return target_->GetFileSize(options, dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + return target_->RangeSync(offset, nbytes, options, dbg); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override { + target_->PrepareWrite(offset, len, options, dbg); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Allocate(offset, len, options, dbg); + } + + private: + FSWritableFile* target_; +}; + +class FSWritableFileOwnerWrapper : public FSWritableFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSWritableFileOwnerWrapper(std::unique_ptr<FSWritableFile>&& t) + : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr<FSWritableFile> guard_; +}; + +class FSRandomRWFileWrapper : public FSRandomRWFile { + public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {} + + FSRandomRWFile* target() const { return target_; } + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Write(offset, data, options, dbg); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* 
result, char* scratch, + IODebugContext* dbg) const override { + return target_->Read(offset, n, options, result, scratch, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + Temperature GetTemperature() const override { + return target_->GetTemperature(); + } + + private: + FSRandomRWFile* target_; +}; + +class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomRWFileOwnerWrapper(std::unique_ptr<FSRandomRWFile>&& t) + : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr<FSRandomRWFile> guard_; +}; + +class FSDirectoryWrapper : public FSDirectory { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSDirectoryWrapper(std::unique_ptr<FSDirectory>&& t) + : guard_(std::move(t)) { + target_ = guard_.get(); + } + + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {} + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr<FSDirectory> guard_; + FSDirectory* target_; +}; + +// A utility routine: write "data" to the named file. +extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, + bool should_sync = false); + +// A utility routine: read contents of named file into *data +extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h new file mode 100644 index 000000000..954d15b4a --- /dev/null +++ b/src/rocksdb/include/rocksdb/filter_policy.h @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. 
These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct BlockBasedTableOptions;
+struct ConfigOptions;
+
+// As of RocksDB 7.0, the details of these classes are internal
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// Contextual information passed to BloomFilterPolicy at filter building time.
+// Used in overriding FilterPolicy::GetBuilderWithContext(). References other
+// structs because this is expected to be a temporary, stack-allocated object.
+struct FilterBuildingContext {
+  // This constructor is for internal use only and subject to change.
+  FilterBuildingContext(const BlockBasedTableOptions& table_options);
+
+  // Options for the table being built
+  const BlockBasedTableOptions& table_options;
+
+  // BEGIN from (DB|ColumnFamily)Options in effect at table creation time
+  CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // Number of LSM levels, or -1 if unknown
+  int num_levels = -1;
+
+  // An optional logger for reporting errors, warnings, etc.
+  Logger* info_log = nullptr;
+  // END from (DB|ColumnFamily)Options
+
+  // Name of the column family for the table (or empty string if unknown)
+  // TODO: consider changing to Slice
+  std::string column_family_name;
+
+  // The table level at time of constructing the SST file, or -1 if unknown
+  // or N/A as in SstFileWriter. (The table file could later be used at a
+  // different level.)
+  int level_at_creation = -1;
+
+  // True if known to be going into bottommost sorted run for applicable
+  // key range (which might not even be last level with data). False
+  // otherwise.
+  bool is_bottommost = false;
+
+  // Reason for creating the file with the filter
+  TableFileCreationReason reason = TableFileCreationReason::kMisc;
+};
+
+// Determines what kind of filter (if any) to generate in SST files, and under
+// which conditions. API users can create custom filter policies that
+// defer to other built-in policies (see NewBloomFilterPolicy and
+// NewRibbonFilterPolicy) based on the context provided to
+// GetBuilderWithContext.
+class FilterPolicy : public Customizable {
+ public:
+  virtual ~FilterPolicy();
+  static const char* Type() { return "FilterPolicy"; }
+
+  // The name used for identifying whether a filter on disk is readable
+  // by this FilterPolicy. If this FilterPolicy is part of a family that
+  // can read each other's filters, such as the built-in BloomFilterPolicy
+  // and RibbonFilterPolicy, the CompatibilityName is a shared family name,
+  // while kinds of filters in the family can have distinct Customizable
+  // Names. This function is pure virtual so that wrappers around built-in
+  // policies are prompted to defer to CompatibilityName() of the wrapped
+  // policy, which is important for compatibility.
+  //
+  // For custom filter policies that are not part of a read-compatible
+  // family (rare), implementations may return Name().
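+  //
+  // As an illustrative sketch, a wrapper policy would typically implement
+  // this by deferring to the policy it wraps (`wrapped_` is an assumed
+  // member name, not part of this header):
+  //
+  //   const char* CompatibilityName() const override {
+  //     return wrapped_->CompatibilityName();
+  //   }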
+  virtual const char* CompatibilityName() const = 0;
+
+  // Creates a new FilterPolicy based on the input value string and returns
+  // the result. The value might be an ID, an ID with properties, or an
+  // old-style policy string.
+  // The value describes the FilterPolicy being created.
+  // For BloomFilters, value may be a ":"-delimited value of the form:
+  //   "bloomfilter:[bits_per_key]",
+  // e.g. "bloomfilter:4"
+  // The above string is equivalent to calling NewBloomFilterPolicy(4).
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<const FilterPolicy>* result);
+
+  // Return a new FilterBitsBuilder for constructing full or partitioned
+  // filter blocks, or return nullptr to indicate "no filter". Custom
+  // implementations should defer to a built-in FilterPolicy to get a
+  // new FilterBitsBuilder, but the FilterBuildingContext can be used
+  // to decide which built-in FilterPolicy to defer to.
+  virtual FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const = 0;
+
+  // Return a new FilterBitsReader for full or partitioned filter blocks.
+  // Caller retains ownership of any buffer pointed to by the input Slice.
+  // Custom implementations should defer to GetFilterBitsReader on any
+  // built-in FilterPolicy, which can read filters generated by any other
+  // built-in FilterPolicy.
+  virtual FilterBitsReader* GetFilterBitsReader(
+      const Slice& /*contents*/) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key. See
+// https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+//
+// bits_per_key: average bits allocated per key in bloom filter. A good
+// choice is 9.9, which yields a filter with ~ 1% false positive rate.
+// When format_version < 5, the value will be rounded to the nearest
+// integer. Recommend using no more than three decimal digits after the
+// decimal point, as in 6.667.
+//
+// To avoid configurations that are unlikely to produce good filtering
+// value for the CPU overhead, bits_per_key < 0.5 is rounded down to 0.0,
+// which means "generate no filter", and 0.5 <= bits_per_key < 1.0 is
+// rounded up to 1.0, for a 62% FP rate.
+//
+// The caller is responsible for eventually deleting the result, though
+// this is typically handled automatically with BlockBasedTableOptions:
+//   table_options.filter_policy.reset(NewBloomFilterPolicy(...));
+//
+// As of RocksDB 7.0, the use_block_based_builder parameter is ignored.
+// (The old, inefficient block-based filter is no longer accessible in
+// the public API.)
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys. For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(
+    double bits_per_key, bool IGNORED_use_block_based_builder = false);
+
+// A new Bloom alternative that saves about 30% space compared to
+// Bloom filters, with similar query times but roughly 3-4x CPU time
+// and 3x temporary space usage during construction.
For example, if
+// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same
+// 0.95% FP rate as a Bloom filter but only using about 7 bits per key.
+//
+// The space savings of Ribbon filters make sense for lower (higher
+// numbered; larger; longer-lived) levels of the LSM, whereas the speed of
+// Bloom filters makes sense for the highest levels of the LSM. Setting
+// bloom_before_level allows for this design with Level and Universal
+// compaction styles. For example, bloom_before_level=1 means that Bloom
+// filters will be used in level 0, including flushes, and Ribbon
+// filters elsewhere, including FIFO compaction and external SST files.
+// For this option, memtable flushes are considered level -1 (so that
+// flushes can be distinguished from intra-L0 compaction).
+// bloom_before_level=0 (default) -> Generate Bloom filters only for
+// flushes under Level and Universal compaction styles.
+// bloom_before_level=-1 -> Always generate Ribbon filters (except in
+// some extreme or exceptional cases).
+//
+// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier
+// versions reading the data will behave as if no filter was used
+// (degraded performance until compaction rebuilds filters). All
+// built-in FilterPolicies (Bloom or Ribbon) are able to read other
+// kinds of built-in filters.
+//
+// Note: the current Ribbon filter schema uses some extra resources
+// when constructing very large filters. For example, for 100 million
+// keys in a single filter (one SST file without partitioned filters),
+// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom.
+// However, the savings in filter space from just ~60 open SST files
+// make up for the additional temporary memory use.
+//
+// Also consider using optimize_filters_for_memory to save filter
+// memory.
+extern const FilterPolicy* NewRibbonFilterPolicy(
+    double bloom_equivalent_bits_per_key, int bloom_before_level = 0);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h
new file mode 100644
index 000000000..7a5dd957e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class BlockBuilder;
+struct ConfigOptions;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine whether the table builder should flush the current data block.
+  virtual bool Update(const Slice& key, const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() {}
+};
+
+class FlushBlockPolicyFactory : public Customizable {
+ public:
+  static const char* Type() { return "FlushBlockPolicyFactory"; }
+
+  // Creates a FlushBlockPolicyFactory based on the input value.
+  // By default, this method can create EveryKey or BySize PolicyFactory,
+  // which take no config_options.
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& value,
+      std::shared_ptr<FlushBlockPolicyFactory>* result);
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() {}
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory();
+
+  static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const override;
+
+  static FlushBlockPolicy* NewFlushBlockPolicy(
+      const uint64_t size, const int deviation,
+      const BlockBuilder& data_block_builder);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/functor_wrapper.h b/src/rocksdb/include/rocksdb/functor_wrapper.h
new file mode 100644
index 000000000..17b021bf7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/functor_wrapper.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+template <std::size_t...>
+struct IndexSequence {};
+
+template <std::size_t N, std::size_t... Next>
+struct IndexSequenceHelper
+    : public IndexSequenceHelper<N - 1U, N - 1U, Next...> {};
+
+template <std::size_t... Next>
+struct IndexSequenceHelper<0U, Next...> {
+  using type = IndexSequence<Next...>;
+};
+
+template <std::size_t N>
+using make_index_sequence = typename IndexSequenceHelper<N>::type;
+
+template <typename Function, typename Tuple, size_t... I>
+void call(Function f, Tuple t, IndexSequence<I...>) {
+  f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+void call(Function f, Tuple t) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  call(f, t, make_index_sequence<size>{});
+}
+}  // namespace detail
+
+template <typename... Args>
+class FunctorWrapper {
+ public:
+  explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+      : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
+
+  void invoke() { detail::call(functor_, args_); }
+
+ private:
+  std::function<void(Args...)> functor_;
+  std::tuple<Args...> args_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h
new file mode 100644
index 000000000..0bf5e939a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/io_status.h
@@ -0,0 +1,244 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// An IOStatus encapsulates the result of an operation. It may indicate +// success, or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on an IOStatus without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same IOStatus must use +// external synchronization. + +#pragma once + +#include <string> + +#include "rocksdb/slice.h" +#ifdef OS_WIN +#include <string.h> +#endif +#include <cstring> + +#include "status.h" + +namespace ROCKSDB_NAMESPACE { + +class IOStatus : public Status { + public: + using Code = Status::Code; + using SubCode = Status::SubCode; + + enum IOErrorScope : unsigned char { + kIOErrorScopeFileSystem, + kIOErrorScopeFile, + kIOErrorScopeRange, + kIOErrorScopeMax, + }; + + // Create a success status. + IOStatus() : IOStatus(kOk, kNone) {} + ~IOStatus() {} + + // Copy the specified status. + IOStatus(const IOStatus& s); + IOStatus& operator=(const IOStatus& s); + IOStatus(IOStatus&& s) noexcept; + IOStatus& operator=(IOStatus&& s) noexcept; + bool operator==(const IOStatus& rhs) const; + bool operator!=(const IOStatus& rhs) const; + + void SetRetryable(bool retryable) { retryable_ = retryable; } + void SetDataLoss(bool data_loss) { data_loss_ = data_loss; } + void SetScope(IOErrorScope scope) { + scope_ = static_cast<unsigned char>(scope); + } + + bool GetRetryable() const { return retryable_; } + bool GetDataLoss() const { return data_loss_; } + IOErrorScope GetScope() const { return static_cast<IOErrorScope>(scope_); } + + // Return a success status. + static IOStatus OK() { return IOStatus(); } + + static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotSupported, msg, msg2); + } + static IOStatus NotSupported(SubCode msg = kNone) { + return IOStatus(kNotSupported, msg); + } + + // Return error status of an appropriate type. 
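+  //
+  // Construction composes with the Status predicates inherited from the base
+  // class, e.g. (illustrative sketch; `fname` is an assumed std::string):
+  //
+  //   IOStatus s = IOStatus::NotFound("no such file: ", fname);
+  //   if (s.IsNotFound()) { /* handle the missing file */ }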
+ static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static IOStatus NotFound(SubCode msg = kNone) { + return IOStatus(kNotFound, msg); + } + + static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kCorruption, msg, msg2); + } + static IOStatus Corruption(SubCode msg = kNone) { + return IOStatus(kCorruption, msg); + } + + static IOStatus InvalidArgument(const Slice& msg, + const Slice& msg2 = Slice()) { + return IOStatus(kInvalidArgument, msg, msg2); + } + static IOStatus InvalidArgument(SubCode msg = kNone) { + return IOStatus(kInvalidArgument, msg); + } + + static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, msg, msg2); + } + static IOStatus IOError(SubCode msg = kNone) { + return IOStatus(kIOError, msg); + } + + static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); } + static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kBusy, msg, msg2); + } + + static IOStatus TimedOut(SubCode msg = kNone) { + return IOStatus(kTimedOut, msg); + } + static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kTimedOut, msg, msg2); + } + + static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); } + static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kNoSpace, msg, msg2); + } + + static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); } + static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kPathNotFound, msg, msg2); + } + + static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); } + static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kIOFenced, msg, msg2); + } + + static IOStatus Aborted(SubCode msg = kNone) { + return IOStatus(kAborted, msg); + } + static IOStatus Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kAborted, msg, msg2); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + // std::string ToString() const; + + private: + friend IOStatus status_to_io_status(Status&&); + + explicit IOStatus(Code _code, SubCode _subcode = kNone) + : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {} + + IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + IOStatus(Code _code, const Slice& msg, const Slice& msg2) + : IOStatus(_code, kNone, msg, msg2) {} +}; + +inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2) + : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) { + assert(code_ != kOk); + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? 
(2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_.reset(result); +} + +inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline IOStatus& IOStatus::operator=(const IOStatus& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + code_ = s.code_; + subcode_ = s.subcode_; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); + } + return *this; +} + +inline IOStatus::IOStatus(IOStatus&& s) noexcept : IOStatus() { + *this = std::move(s); +} + +inline IOStatus& IOStatus::operator=(IOStatus&& s) noexcept { + if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + s.scope_ = kIOErrorScopeFileSystem; + state_ = std::move(s.state_); + } + return *this; +} + +inline bool IOStatus::operator==(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return (code_ == rhs.code_); +} + +inline bool IOStatus::operator!=(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return !(*this == rhs); +} + +inline IOStatus status_to_io_status(Status&& status) { + IOStatus io_s; + Status& s = io_s; + s = std::move(status); + return io_s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h new file mode 100644 index 000000000..559d44c57 --- /dev/null +++ b/src/rocksdb/include/rocksdb/iostats_context.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <stdint.h> + +#include <string> + +#include "rocksdb/perf_level.h" + +// A thread local context for gathering io-stats efficiently and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +namespace ROCKSDB_NAMESPACE { + +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. 
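+//
+// For example, these counters can be read through the thread's stats context
+// (an illustrative sketch, not part of this header):
+//
+//   IOStatsContext* ctx = get_iostats_context();
+//   uint64_t hot_bytes =
+//       ctx->file_io_stats_by_temperature.hot_file_bytes_read;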
+struct FileIOByTemperature {
+  // the number of bytes read from Temperature::kHot files
+  uint64_t hot_file_bytes_read;
+  // the number of bytes read from Temperature::kWarm files
+  uint64_t warm_file_bytes_read;
+  // the number of bytes read from Temperature::kCold files
+  uint64_t cold_file_bytes_read;
+  // total number of reads from Temperature::kHot files
+  uint64_t hot_file_read_count;
+  // total number of reads from Temperature::kWarm files
+  uint64_t warm_file_read_count;
+  // total number of reads from Temperature::kCold files
+  uint64_t cold_file_read_count;
+  // reset all the statistics to 0.
+  void Reset() {
+    hot_file_bytes_read = 0;
+    warm_file_bytes_read = 0;
+    cold_file_bytes_read = 0;
+    hot_file_read_count = 0;
+    warm_file_read_count = 0;
+    cold_file_read_count = 0;
+  }
+};
+
+struct IOStatsContext {
+  // reset all io-stats counters to zero
+  void Reset();
+
+  std::string ToString(bool exclude_zero_counters = false) const;
+
+  // the thread pool id
+  uint64_t thread_pool_id;
+
+  // number of bytes that have been written.
+  uint64_t bytes_written;
+  // number of bytes that have been read.
+  uint64_t bytes_read;
+
+  // time spent in open() and fopen().
+  uint64_t open_nanos;
+  // time spent in fallocate().
+  uint64_t allocate_nanos;
+  // time spent in write() and pwrite().
+  uint64_t write_nanos;
+  // time spent in read() and pread()
+  uint64_t read_nanos;
+  // time spent in sync_file_range().
+  uint64_t range_sync_nanos;
+  // time spent in fsync().
+  uint64_t fsync_nanos;
+  // time spent in preparing write (fallocate etc).
+  uint64_t prepare_write_nanos;
+  // time spent in Logger::Logv().
+  uint64_t logger_nanos;
+  // CPU time spent in write() and pwrite()
+  uint64_t cpu_write_nanos;
+  // CPU time spent in read() and pread()
+  uint64_t cpu_read_nanos;
+
+  FileIOByTemperature file_io_stats_by_temperature;
+
+  // Whether iostats follows PerfLevel is not consistent: the timer counters
+  // follow it, but BackupEngine relies on the plain counters always being
+  // there. This is a backdoor option to disable some counters, so that
+  // existing stats are not polluted by file operations such as logging; set
+  // it to true to turn those counters off.
+  bool disable_iostats = false;
+};
+
+// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
+// non-thread-local IOStatsContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be a no-op.
+// Otherwise, a pointer to a thread-local IOStatsContext object will be
+// returned.
+//
+// This function never returns nullptr.
+IOStatsContext* get_iostats_context();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h
new file mode 100644
index 000000000..9d4c9f73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iterator.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/cleanable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator : public Cleanable {
+ public:
+  Iterator() {}
+  // No copying allowed
+  Iterator(const Iterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  virtual ~Iterator() {}
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid. This method returns true iff the iterator is valid.
+  // Always returns false if !status().ok().
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source. The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source. The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // All Seek*() methods clear any error status() that the iterator had prior to
+  // the call; after the seek, status() indicates only the error (if any) that
+  // happened during the seek, not any past errors.
+  // Target does not contain timestamp.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Position at the last key in the source that is at or before target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  // Target does not contain timestamp.
+  virtual void SeekForPrev(const Slice& target) = 0;
+
+  // Moves to the next entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in the source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of the
+  // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev
+  // operation).
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry. If the entry is a plain key-value,
+  // return the value as-is; if it is a wide-column entity, return the value of
+  // the default anonymous column (see kDefaultWideColumnName) if any, or an
+  // empty value otherwise. The underlying storage for the returned slice is
+  // valid only until the next modification of the iterator (i.e. the next
+  // SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // Return the wide columns for the current entry. If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value.
The underlying storage for the returned + // structure is valid only until the next modification of the iterator (i.e. + // the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation). + // REQUIRES: Valid() + virtual const WideColumns& columns() const { + assert(false); + return kNoWideColumns; + } + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // If supported, renew the iterator to represent the latest state. The + // iterator will be invalidated after the call. Not supported if + // ReadOptions.snapshot is given when creating the iterator. + virtual Status Refresh() { + return Status::NotSupported("Refresh() is not supported"); + } + + // Property "rocksdb.iterator.is-key-pinned": + // If returning "1", this means that the Slice returned by key() is valid + // as long as the iterator is not deleted. + // It is guaranteed to always return "1" if + // - Iterator created with ReadOptions::pin_data = true + // - DB tables were created with + // BlockBasedTableOptions::use_delta_encoding = false. + // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + // Property "rocksdb.iterator.internal-key": + // Get the user-key portion of the internal key at which the iteration + // stopped. + virtual Status GetProperty(std::string prop_name, std::string* prop); + + virtual Slice timestamp() const { + assert(false); + return Slice(); + } +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h new file mode 100644 index 000000000..7408cbc87 --- /dev/null +++ b/src/rocksdb/include/rocksdb/ldb_tool.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// An interface for converting a slice to a readable string +class SliceFormatter { + public: + virtual ~SliceFormatter() {} + virtual std::string Format(const Slice& s) const = 0; +}; + +// Options for customizing ldb tool (beyond the DB Options) +struct LDBOptions { + // Create LDBOptions with default values for all fields + LDBOptions(); + + // Key formatter that converts a slice to a readable string. 
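+  // For example, a custom formatter could render keys as hex (an
+  // illustrative sketch, not part of this header):
+  //
+  //   class HexFormatter : public SliceFormatter {
+  //     std::string Format(const Slice& s) const override {
+  //       return s.ToString(true /* hex */);
+  //     }
+  //   };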
+ // Default: Slice::ToString() + std::shared_ptr<SliceFormatter> key_formatter; + + std::string print_help_header = "ldb - RocksDB Tool"; +}; + +class LDBTool { + public: + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector<ColumnFamilyDescriptor>* column_families = nullptr); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h new file mode 100644 index 000000000..8644fcf3f --- /dev/null +++ b/src/rocksdb/include/rocksdb/listener.h @@ -0,0 +1,847 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#pragma once + +#include <chrono> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/advanced_options.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/io_status.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +using TablePropertiesCollection = + std::unordered_map<std::string, std::shared_ptr<const TableProperties>>; + +class DB; +class ColumnFamilyHandle; +class Status; +struct CompactionJobStats; + +struct FileCreationBriefInfo { + FileCreationBriefInfo() = default; + FileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id) + : db_name(_db_name), + cf_name(_cf_name), + file_path(_file_path), + job_id(_job_id) {} + // the name of the database where the file was created. + std::string db_name; + // the name of the column family where the file was created. + std::string cf_name; + // the path to the created file. + std::string file_path; + // the id of the job (which could be flush or compaction) that + // created the file. + int job_id = 0; +}; + +struct TableFileCreationBriefInfo : public FileCreationBriefInfo { + // reason of creating the table. + TableFileCreationReason reason; +}; + +struct TableFileCreationInfo : public TableFileCreationBriefInfo { + TableFileCreationInfo() = default; + explicit TableFileCreationInfo(TableProperties&& prop) + : table_properties(prop) {} + // the size of the file. + uint64_t file_size; + // Detailed properties of the created file. + TableProperties table_properties; + // The status indicating whether the creation was successful or not. + Status status; + // The checksum of the table file being created + std::string file_checksum; + // The checksum function name of checksum generator used for this table file + std::string file_checksum_func_name; +}; + +struct BlobFileCreationBriefInfo : public FileCreationBriefInfo { + BlobFileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason) + : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id), + reason(_reason) {} + // reason of creating the blob file. 
+  BlobFileCreationReason reason;
+};
+
+struct BlobFileCreationInfo : public BlobFileCreationBriefInfo {
+  BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name,
+                       const std::string& _file_path, int _job_id,
+                       BlobFileCreationReason _reason,
+                       uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+                       Status _status, const std::string& _file_checksum,
+                       const std::string& _file_checksum_func_name)
+      : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id,
+                                  _reason),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        status(_status),
+        file_checksum(_file_checksum),
+        file_checksum_func_name(_file_checksum_func_name) {}
+
+  // the number of blobs in the file.
+  uint64_t total_blob_count;
+  // the total number of bytes in the file.
+  uint64_t total_blob_bytes;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+  // The checksum of the blob file being created.
+  std::string file_checksum;
+  // The checksum function name of checksum generator used for this blob file.
+  std::string file_checksum_func_name;
+};
+
+enum class CompactionReason : int {
+  kUnknown = 0,
+  // [Level] number of L0 files > level0_file_num_compaction_trigger
+  kLevelL0FilesNum,
+  // [Level] total size of level > MaxBytesForLevel()
+  kLevelMaxLevelSize,
+  // [Universal] Compacting for size amplification
+  kUniversalSizeAmplification,
+  // [Universal] Compacting for size ratio
+  kUniversalSizeRatio,
+  // [Universal] number of sorted runs > level0_file_num_compaction_trigger
+  kUniversalSortedRunNum,
+  // [FIFO] total size > max_table_files_size
+  kFIFOMaxSize,
+  // [FIFO] reduce number of files.
+  kFIFOReduceNumFiles,
+  // [FIFO] files with creation time < (current_time - interval)
+  kFIFOTtl,
+  // Manual compaction
+  kManualCompaction,
+  // DB::SuggestCompactRange() marked files for compaction
+  kFilesMarkedForCompaction,
+  // [Level] Automatic compaction within bottommost level to cleanup duplicate
+  // versions of same user key, usually due to a released snapshot.
+  kBottommostFiles,
+  // Compaction based on TTL
+  kTtl,
+  // According to the comments in flush_job.cc, RocksDB treats flush as
+  // a level 0 compaction in internal stats.
+  kFlush,
+  // Compaction caused by external sst file ingestion
+  kExternalSstIngestion,
+  // Compaction due to SST file being too old
+  kPeriodicCompaction,
+  // Compaction in order to move files to temperature
+  kChangeTemperature,
+  // Compaction scheduled to force garbage collection of blob files
+  kForcedBlobGC,
+  // A special TTL compaction for RoundRobin policy, which is basically the
+  // same as kLevelMaxLevelSize, but the goal is to compact TTLed files.
+  kRoundRobinTtl,
+  // total number of compaction reasons, new reasons must be added above this.
+  kNumOfReasons,
+};
+
+enum class FlushReason : int {
+  kOthers = 0x00,
+  kGetLiveFiles = 0x01,
+  kShutDown = 0x02,
+  kExternalFileIngestion = 0x03,
+  kManualCompaction = 0x04,
+  kWriteBufferManager = 0x05,
+  kWriteBufferFull = 0x06,
+  kTest = 0x07,
+  kDeleteFiles = 0x08,
+  kAutoCompaction = 0x09,
+  kManualFlush = 0x0a,
+  kErrorRecovery = 0xb,
+  // When the flush reason is set to kErrorRecoveryRetryFlush, SwitchMemtable
+  // will not be called, to avoid creating many small immutable memtables.
+  kErrorRecoveryRetryFlush = 0xc,
+  kWalFull = 0xd,
+};
+
+// TODO: In the future, BackgroundErrorReason will only be used to indicate
+// why the BG Error is happening (e.g., flush, compaction). We may introduce
+// other data structures to indicate other essential information such as
+// the file type (e.g., Manifest, SST) and special context.
+enum class BackgroundErrorReason {
+  kFlush,
+  kCompaction,
+  kWriteCallback,
+  kMemTable,
+  kManifestWrite,
+  kFlushNoWAL,
+  kManifestWriteNoWAL,
+};
+
+enum class WriteStallCondition {
+  kNormal,
+  kDelayed,
+  kStopped,
+};
+
+struct WriteStallInfo {
+  // the name of the column family
+  std::string cf_name;
+  // state of the write controller
+  struct {
+    WriteStallCondition cur;
+    WriteStallCondition prev;
+  } condition;
+};
+
+#ifndef ROCKSDB_LITE
+
+struct FileDeletionInfo {
+  FileDeletionInfo() = default;
+
+  FileDeletionInfo(const std::string& _db_name, const std::string& _file_path,
+                   int _job_id, Status _status)
+      : db_name(_db_name),
+        file_path(_file_path),
+        job_id(_job_id),
+        status(_status) {}
+  // The name of the database where the file was deleted.
+  std::string db_name;
+  // The path to the deleted file.
+  std::string file_path;
+  // The id of the job which deleted the file.
+  int job_id = 0;
+  // The status indicating whether the deletion was successful or not.
+  Status status;
+};
+
+struct TableFileDeletionInfo : public FileDeletionInfo {};
+
+struct BlobFileDeletionInfo : public FileDeletionInfo {
+  BlobFileDeletionInfo(const std::string& _db_name,
+                       const std::string& _file_path, int _job_id,
+                       Status _status)
+      : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {}
+};
+
+enum class FileOperationType {
+  kRead,
+  kWrite,
+  kTruncate,
+  kClose,
+  kFlush,
+  kSync,
+  kFsync,
+  kRangeSync,
+  kAppend,
+  kPositionedAppend,
+  kOpen
+};
+
+struct FileOperationInfo {
+  using Duration = std::chrono::nanoseconds;
+  using SteadyTimePoint =
+      std::chrono::time_point<std::chrono::steady_clock, Duration>;
+  using SystemTimePoint =
+      std::chrono::time_point<std::chrono::system_clock, Duration>;
+  using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
+  using FinishTimePoint = SteadyTimePoint;
+
+  FileOperationType type;
+  const std::string& path;
+  // RocksDB tries to provide file temperature information, but it is not
+  // guaranteed.
+  Temperature temperature;
+  uint64_t offset;
+  size_t length;
+  const Duration duration;
+  const SystemTimePoint& start_ts;
+  Status status;
+
+  FileOperationInfo(const FileOperationType _type, const std::string& _path,
+                    const StartTimePoint& _start_ts,
+                    const FinishTimePoint& _finish_ts, const Status& _status,
+                    const Temperature _temperature = Temperature::kUnknown)
+      : type(_type),
+        path(_path),
+        temperature(_temperature),
+        duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
+            _finish_ts - _start_ts.second)),
+        start_ts(_start_ts.first),
+        status(_status) {}
+  static StartTimePoint StartNow() {
+    return std::make_pair<SystemTimePoint, SteadyTimePoint>(
+        std::chrono::system_clock::now(), std::chrono::steady_clock::now());
+  }
+  static FinishTimePoint FinishNow() {
+    return std::chrono::steady_clock::now();
+  }
+};
+
+struct BlobFileInfo {
+  BlobFileInfo(const std::string& _blob_file_path,
+               const uint64_t _blob_file_number)
+      : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {}
+
+  std::string blob_file_path;
+  uint64_t blob_file_number;
+};
+
+struct BlobFileAdditionInfo : public BlobFileInfo {
+  BlobFileAdditionInfo(const std::string& _blob_file_path,
+                       const uint64_t _blob_file_number,
+                       const uint64_t _total_blob_count,
+                       const uint64_t _total_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes) {}
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+};
+
+struct BlobFileGarbageInfo : public BlobFileInfo {
+  BlobFileGarbageInfo(const std::string& _blob_file_path,
+                      const uint64_t _blob_file_number,
+                      const uint64_t _garbage_blob_count,
+                      const uint64_t _garbage_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes) {}
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+};
+
+struct FlushJobInfo {
+  // the id of the column family
+  uint32_t cf_id;
+  // the name of the column family
+  std::string cf_name;
+  // the path to the newly created file
+  std::string file_path;
+  // the file number of the newly created file
+  uint64_t file_number;
+  // the oldest blob file referenced by the newly created file
+  uint64_t oldest_blob_file_number;
+  // the id of the thread that completed this flush job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+  // If true, then RocksDB is currently slowing down all writes to prevent
+  // creating too many Level 0 files, as compaction does not seem able to
+  // keep up with the write request rate. This indicates that there are
+  // too many files in Level 0.
+  bool triggered_writes_slowdown;
+  // If true, then RocksDB is currently blocking any writes to prevent
+  // creating more L0 files. This indicates that there are too many
+  // files in level 0. Compactions should try to compact L0 files down
+  // to lower levels as soon as possible.
+  bool triggered_writes_stop;
+  // The smallest sequence number in the newly created file
+  SequenceNumber smallest_seqno;
+  // The largest sequence number in the newly created file
+  SequenceNumber largest_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
+
+  FlushReason flush_reason;
+
+  // Compression algorithm used for blob output files
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during flush in Integrated BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+};
+
+struct CompactionFileInfo {
+  // The level of the file.
+  int level;
+
+  // The file number of the file.
+  uint64_t file_number;
+
+  // The file number of the oldest blob file this SST file references.
+  uint64_t oldest_blob_file_number;
+};
+
+struct SubcompactionJobInfo {
+  ~SubcompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // sub-compaction job id, which is only unique within the same compaction, so
+  // use both 'job_id' and 'subcompaction_job_id' to identify a subcompaction
+  // within an instance.
+  // For a non-subcompaction job, it is set to -1.
+  int subcompaction_job_id;
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+};
+
+struct CompactionJobInfo {
+  ~CompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // The following variables contain information about compaction inputs
+  // and outputs. A file may appear in both the input and output lists
+  // if it was simply moved to a different level. The order of elements
+  // is the same across input_files and input_file_infos; similarly, it is
+  // the same across output_files and output_file_infos.
+
+  // The names of the compaction input files.
+  std::vector<std::string> input_files;
+
+  // Additional information about the compaction input files.
+  std::vector<CompactionFileInfo> input_file_infos;
+
+  // The names of the compaction output files.
+  std::vector<std::string> output_files;
+
+  // Additional information about the compaction output files.
+  std::vector<CompactionFileInfo> output_file_infos;
+
+  // Table properties for input and output tables.
+  // The map is keyed by values from input_files and output_files.
+  TablePropertiesCollection table_properties;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+
+  // Information about blob files deleted during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+};
+
+struct MemTableInfo {
+  // the name of the column family to which memtable belongs
+  std::string cf_name;
+  // Sequence number of the first element that was inserted
+  // into the memtable.
+  SequenceNumber first_seqno;
+  // Sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  SequenceNumber earliest_seqno;
+  // Total number of entries in memtable
+  uint64_t num_entries;
+  // Total number of deletes in memtable
+  uint64_t num_deletes;
+};
+
+struct ExternalFileIngestionInfo {
+  // the name of the column family
+  std::string cf_name;
+  // Path of the file outside the DB
+  std::string external_file_path;
+  // Path of the file inside the DB
+  std::string internal_file_path;
+  // The global sequence number assigned to keys in this file
+  SequenceNumber global_seqno;
+  // Table properties of the table being ingested
+  TableProperties table_properties;
+};
+
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+  // The original error that triggered the recovery
+  Status old_bg_error;
+
+  // The final bg_error after all recovery attempts. Status::OK() means
+  // the recovery was successful and the database is fully operational.
+  Status new_bg_error;
+};
+
+struct IOErrorInfo {
+  IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
+              const std::string& _file_path, size_t _length, uint64_t _offset)
+      : io_status(_io_status),
+        operation(_operation),
+        file_path(_file_path),
+        length(_length),
+        offset(_offset) {}
+
+  IOStatus io_status;
+  FileOperationType operation;
+  std::string file_path;
+  size_t length;
+  uint64_t offset;
+};
+
+// EventListener class contains a set of callback functions that will
+// be called when a specific RocksDB event happens, such as a flush. It can
+// be used as a building block for developing custom features such as a
+// stats collector or an external compaction algorithm.
+//
+// IMPORTANT
+// Because compaction is needed to resolve a "writes stopped" condition,
+// calling or waiting for any blocking DB write function (no_slowdown=false)
+// from a compaction-related listener callback can hang RocksDB. For DB
+// writes from a callback we recommend a WriteBatch and no_slowdown=true,
+// because the WriteBatch can accumulate writes for later in case DB::Write
+// returns Status::Incomplete. Similarly, calling CompactRange or similar
+// could hang by waiting for a background worker that is occupied until the
+// callback returns.
+//
+// Otherwise, callback functions should not run for an extended period of
+// time before the function returns, because this will slow RocksDB.
+//
+// [Threading] All EventListener callbacks will be called on the actual
+// thread involved in that specific event. For example, it is the RocksDB
+// background flush thread performing the actual flush that calls
+// EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlocks and performance issues when using EventListener callbacks
+// in complex ways.
+//
+// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into
+// RocksDB, because RocksDB is not exception-safe. This could cause undefined
+// behavior including data loss, unreported corruption, deadlocks, and more.
+class EventListener : public Customizable {
+ public:
+  static const char* Type() { return "EventListener"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& id,
+                                 std::shared_ptr<EventListener>* result);
+  const char* Name() const override {
+    // Since EventListeners did not have a name previously, we will assume
+    // an empty name. Instances should override this method.
+    return "";
+  }
+  // A callback function to RocksDB which will be called whenever a
+  // registered RocksDB flushes a file. The default implementation is
+  // a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  virtual void OnFlushCompleted(DB* /*db*/,
+                                const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to flush memtables. The default implementation is
+  // a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  virtual void OnFlushBegin(DB* /*db*/,
+                            const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // an SST file is deleted. Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // services and thus only provides string parameters instead
+  // of a pointer to DB. Applications that build logic based
+  // on file creations and deletions are advised to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to compact. The default implementation is
+  // a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a registered RocksDB compacts a file. The default implementation
+  // is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // @param db a pointer to the rocksdb instance which just compacted
+  //   a file.
+  // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+  //   after this function is returned, and must be copied if it is needed
+  //   outside of this function.
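+  //
+  // For example, a listener that records compaction outputs might look like
+  // this (an illustrative sketch; MyListener is hypothetical):
+  //
+  //   class MyListener : public EventListener {
+  //    public:
+  //     void OnCompactionCompleted(DB* /*db*/,
+  //                                const CompactionJobInfo& ci) override {
+  //       // Copy what is needed; 'ci' is released after the callback returns.
+  //       last_output_files_ = ci.output_files;
+  //     }
+  //
+  //    private:
+  //     std::vector<std::string> last_output_files_;
+  //   };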
+  virtual void OnCompactionCompleted(DB* /*db*/,
+                                     const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // sub-compaction begins. If a compaction is split into 2 sub-compactions, it
+  // will trigger one `OnCompactionBegin()` first, then two
+  // `OnSubcompactionBegin()`. If the compaction is not split, it will still
+  // trigger one `OnSubcompactionBegin()`, as internally, compaction is always
+  // handled by sub-compactions. The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // @param si a reference to a SubcompactionJobInfo struct; it contains a
+  //   `subcompaction_job_id` which is only unique within the specified
+  //   compaction (which can be identified by `job_id`). 'si' is released after
+  //   this function is returned, and must be copied if it's needed outside
+  //   this function.
+  //   Note: `table_properties` is not set for sub-compactions; the information
+  //   can be obtained from `OnCompactionBegin()`.
+  virtual void OnSubcompactionBegin(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function to RocksDB which will be called whenever a
+  // sub-compaction completes. The same as `OnSubcompactionBegin()`, if a
+  // compaction is split into 2 sub-compactions, it will be triggered twice. If
+  // a compaction is not split, it will still be triggered once.
+  // The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // @param si a reference to a SubcompactionJobInfo struct; it contains a
+  //   `subcompaction_job_id` which is only unique within the specified
+  //   compaction (which can be identified by `job_id`). 'si' is released after
+  //   this function is returned, and must be copied if it's needed outside
+  //   this function.
+  //   Note: `table_properties` is not set for sub-compactions; the information
+  //   can be obtained from `OnCompactionCompleted()`.
+  virtual void OnSubcompactionCompleted(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // an SST file is created. Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // services and thus only provides string parameters instead
+  // of a pointer to DB. Applications that build logic based
+  // on file creations and deletions are advised to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Historically it was only called if the file was successfully created.
+  // Now it is also called in failure cases. Users can check info.status
+  // to see whether it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // an SST file is created. It will be followed by OnTableFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a memtable is made immutable.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a column family handle is deleted.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  // @param handle is a pointer to the column family handle to be deleted
+  // which will become a dangling pointer after the deletion.
+  virtual void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* /*handle*/) {}
+
+  // A callback function for RocksDB which will be called after an external
+  // file is ingested using IngestExternalFile.
+  //
+  // Note that this function runs on the same thread as
+  // IngestExternalFile(); if this function blocks, IngestExternalFile()
+  // will be blocked from finishing.
+  virtual void OnExternalFileIngested(
+      DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before setting the
+  // background error status to a non-OK value. The new background error status
+  // is provided in `bg_error` and can be modified by the callback. E.g., a
+  // callback can suppress errors by resetting it to Status::OK(), thus
+  // preventing the database from entering read-only mode. We do not provide any
+  // guarantee as to when failed flushes/compactions will be rescheduled if the
+  // user suppresses an error.
+  //
+  // Note that this function can run on the same threads as flush, compaction,
+  // and user writes. So, it is extremely important not to perform heavy
+  // computations or blocking calls in this function.
+  virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
+                                 Status* /* bg_error */) {}
+
+  // A callback function for RocksDB which will be called whenever a change
+  // of superversion triggers a change of the stall conditions.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever a file read
+  // operation finishes.
+  virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file write
+  // operation finishes.
+  virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file flush
+  // operation finishes.
+  virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file sync
+  // operation finishes.
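+  // For example, an override could export the sync latency (an illustrative
+  // sketch, not part of this header):
+  //
+  //   void OnFileSyncFinish(const FileOperationInfo& info) override {
+  //     auto micros = std::chrono::duration_cast<std::chrono::microseconds>(
+  //                       info.duration)
+  //                       .count();
+  //     // report 'micros' together with info.path to a metrics system
+  //   }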
+  virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // rangeSync operation finishes.
+  virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // truncate operation finishes.
+  virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file close
+  // operation finishes.
+  virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
+
+  // If true, the OnFile*Finish functions will be called. If
+  // false, then they won't be called.
+  virtual bool ShouldBeNotifiedOnFileIO() { return false; }
+
+  // A callback function for RocksDB which will be called just before
+  // starting the automatic recovery process for recoverable background
+  // errors, such as NoSpace(). The callback can suppress the automatic
+  // recovery by setting *auto_recovery to false. The database will then
+  // have to be transitioned out of read-only mode by calling DB::Resume().
+  virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+                                    Status /* bg_error */,
+                                    bool* /* auto_recovery */) {}
+
+  // DEPRECATED
+  // A callback function for RocksDB which will be called once the database
+  // is recovered from read-only mode after an error. When this is called, it
+  // means normal writes to the database can be issued and the user can
+  // initiate any further recovery actions needed.
+  virtual void OnErrorRecoveryCompleted(Status old_bg_error) {
+    old_bg_error.PermitUncheckedError();
+  }
+
+  // A callback function for RocksDB which will be called once the recovery
+  // attempt from a background retryable error is completed. The recovery
+  // may have been successful or not. In either case, the callback is called
+  // with the old and new error. If info.new_bg_error is Status::OK(), that
+  // means the recovery succeeded.
+  virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+  }
+
+  // A callback function for RocksDB which will be called before
+  // a blob file is created. It will be followed by OnBlobFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnBlobFileCreationStarted(
+      const BlobFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is created.
+  // It will be called whether the file is successfully created or not. Users
+  // can check info.status to see whether it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is deleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies of the
+  // passed value.
+  virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever an IO error
+  // happens. ShouldBeNotifiedOnFileIO() must return true for this callback to
+  // be invoked.
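+  // For example (an illustrative sketch; the same listener must also override
+  // ShouldBeNotifiedOnFileIO() to return true):
+  //
+  //   bool ShouldBeNotifiedOnFileIO() override { return true; }
+  //   void OnIOError(const IOErrorInfo& info) override {
+  //     if (info.io_status.GetRetryable()) {
+  //       // e.g. count transient errors per info.file_path
+  //     }
+  //   }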
+  virtual void OnIOError(const IOErrorInfo& /*info*/) {}
+
+  ~EventListener() override {}
+};
+
+#else
+
+class EventListener {};
+struct FlushJobInfo {};
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h
new file mode 100644
index 000000000..5cb799e42
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memory_allocator.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator is an interface that a client can implement to supply custom
+// memory allocation and deallocation methods. See rocksdb/cache.h for more
+// information.
+// All methods should be thread-safe.
+class MemoryAllocator : public Customizable {
+ public:
+  static const char* Type() { return "MemoryAllocator"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<MemoryAllocator>* result);
+
+  // Allocate a block of at least size bytes. Has to be thread-safe.
+  virtual void* Allocate(size_t size) = 0;
+
+  // Deallocate a previously allocated block. Has to be thread-safe.
+  virtual void Deallocate(void* p) = 0;
+
+  // Returns the memory size of the block allocated at p. The default
+  // implementation that just returns the original allocation_size is fine.
+  virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const {
+    // default implementation just returns the allocation size
+    return allocation_size;
+  }
+
+  std::string GetId() const override { return GenerateIndividualId(); }
+};
+
+struct JemallocAllocatorOptions {
+  static const char* kName() { return "JemallocAllocatorOptions"; }
+  // Jemalloc tcache caches allocations by size class. For each size class,
+  // it caches between 20 (for large size classes) and 200 (for small size
+  // classes) allocations. To reduce tcache memory usage when the allocator
+  // is accessed by a large number of threads, we can control whether to cache
+  // an allocation by its size.
+  bool limit_tcache_size = false;
+
+  // Lower bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size/4.
+  size_t tcache_size_lower_bound = 1024;
+
+  // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size.
+  size_t tcache_size_upper_bound = 16 * 1024;
+};
+
+// Generates a memory allocator which allocates through Jemalloc and utilizes
+// MADV_DONTDUMP through madvise to exclude cache items from core dumps.
+// Applications can use the allocator with the block cache to exclude block
+// cache usage from core dumps.
+//
+// Implementation details:
+// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
+// allocations of the JemallocNodumpAllocator go through that arena.
+// The allocator hooks the arena's memory allocation and calls madvise() with
+// the MADV_DONTDUMP flag to exclude the piece of memory from core dumps.
+// A side benefit of using a single arena is a reduction of jemalloc
+// metadata for some workloads.
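+//
+// For example, the allocator is typically plugged into the block cache like
+// this (an illustrative sketch, not part of this header):
+//
+//   JemallocAllocatorOptions jopts;
+//   std::shared_ptr<MemoryAllocator> allocator;
+//   Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+//   if (s.ok()) {
+//     LRUCacheOptions cache_opts;
+//     cache_opts.memory_allocator = allocator;
+//     // pass NewLRUCache(cache_opts) to BlockBasedTableOptions::block_cache
+//   }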
+//
+// To mitigate mutex contention caused by using a single arena, jemalloc tcache
+// (thread-local cache) is enabled to cache unused allocations for future use.
+// The tcache normally adds about 0.5MB of extra memory usage per thread. This
+// usage can be reduced by limiting the allocation sizes that are cached.
+extern Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
new file mode 100644
index 000000000..cb5444dca
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -0,0 +1,423 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers and can support
+//     reads that happen while a write is in progress. However, it needn't
+//     support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed a MemTableAllocator object when a new MemTableRep
+// is requested.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//  structured like "prefix:suffix" where iteration within a prefix is
+//  common and iteration across different prefixes is rare. It is backed by
+//  a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+//  vector is sorted. It is intelligent about sorting; once MarkReadOnly()
+//  has been called, the vector will only be sorted once. It is optimized for
+//  random-write-heavy workloads.
+//
+// The last two implementations are designed for situations in which
+// iteration over the entire collection is rare, since doing so requires all
+// the keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <stdexcept>
+#include <unordered_set>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Allocator;
+class LookupKey;
+class SliceTransform;
+class Logger;
+struct DBOptions;
+
+using KeyHandle = void*;
+
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    using DecodedType = ROCKSDB_NAMESPACE::Slice;
+
+    virtual DecodedType decode_key(const char* key) const {
+      // The format of key is frozen and can be treated as a part of the API
+      // contract. Refer to MemTable::Add for details.
+      return GetLengthPrefixedSlice(key);
+    }
+
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b.
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() {}
+  };
+
+  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
+
+  // Allocate a buffer of len bytes for storing a key. The idea is that a
+  // specific memtable representation knows its underlying data structure
+  // better. By allowing it to allocate memory, it can possibly put
+  // correlated data in a consecutive memory area to make processor
+  // prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection, and no concurrent modifications to the table are in progress
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Same as ::Insert
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKey(KeyHandle handle) {
+    Insert(handle);
+    return true;
+  }
+
+  // Same as Insert(), but additionally passes a hint for the insert location
+  // of the key. If hint points to nullptr, a new hint will be populated.
+  // Otherwise the hint will be updated to reflect the last insert location.
+  //
+  // Currently only the skip-list based memtable implements this interface.
+  // Other implementations will fall back to Insert() by default.
+  virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    Insert(handle);
+  }
+
+  // Same as ::InsertWithHint
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
+    InsertWithHint(handle, hint);
+    return true;
+  }
+
+  // Same as ::InsertWithHint, but allows concurrent writes
+  //
+  // If hint points to nullptr, a new hint will be allocated on the heap;
+  // otherwise the hint will be updated to reflect the last insert location.
+  // The hint is owned by the caller and it is the caller's responsibility to
+  // delete the hint later.
+  //
+  // Currently only the skip-list based memtable implements this interface.
+  // Other implementations will fall back to InsertConcurrently() by default.
+  virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    InsertConcurrently(handle);
+  }
+
+  // Same as ::InsertWithHintConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
+    InsertWithHintConcurrently(handle, hint);
+    return true;
+  }
+
+  // Like Insert(handle), but may be called concurrently with other calls
+  // to InsertConcurrently for other handles.
+  virtual void InsertConcurrently(KeyHandle handle);
+
+  // Same as ::InsertConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyConcurrently(KeyHandle handle) {
+    InsertConcurrently(handle);
+    return true;
+  }
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default,
+  // does nothing. After MarkReadOnly() is called, this table rep will
+  // not be written to (i.e., no more calls to Allocate(), Insert(),
+  // or any writes done directly to entries accessed through the iterator).
+  virtual void MarkReadOnly() {}
+
+  // Notify this table rep that it has been flushed to stable storage.
+  // By default, does nothing.
+  //
+  // Invariant: MarkReadOnly() is called before MarkFlushed().
+  // Note that this method, if overridden, should not run for an extended
+  // period of time. Otherwise, RocksDB may be blocked.
+  virtual void MarkFlushed() {}
+
+  // Look up key in the mem table. Starting from the first key in the mem
+  // table whose user_key matches the given k, call callback_func(), with
+  // callback_args directly forwarded as the first parameter and the mem table
+  // key as the second parameter. If the callback returns false, terminate;
+  // otherwise, continue with the next key.
+  //
+  // Get() may terminate once it has visited all the potential keys for
+  // k.user_key(), but it is not required to.
+  //
+  // Default: the default implementation dynamically constructs an iterator,
+  // seeks to the key, and invokes the callback function.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
+                                         const Slice& /*end_key*/) {
+    return 0;
+  }
+
+  // Returns a vector of unique random memtable entries of approximate
+  // size 'target_sample_size' (this size is not strictly enforced).
+  virtual void UniqueRandomSample(const uint64_t num_entries,
+                                  const uint64_t target_sample_size,
+                                  std::unordered_set<const char*>* entries) {
+    (void)num_entries;
+    (void)target_sample_size;
+    (void)entries;
+    assert(false);
+  }
+
+  // Report an approximation of how much memory has been used other than
+  // memory that was allocated through the allocator. Safe to call from any
+  // thread.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() {}
+
+  // Iteration over the contents of the collection.
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // Retreat to the first entry with a key <= target
+    virtual void SeekForPrev(const Slice& internal_key,
+                             const char* memtable_key) = 0;
+
+    virtual void RandomSeek() {}
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  // arena: If not null, the arena is used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
+
+  // Return true if the current MemTableRep supports the merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshots.
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Allocator* allocator_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects.
+class MemTableRepFactory : public Customizable {
+ public:
+  ~MemTableRepFactory() override {}
+
+  static const char* Type() { return "MemTableRepFactory"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::unique_ptr<MemTableRepFactory>* factory);
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<MemTableRepFactory>* factory);
+
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) = 0;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
+      const SliceTransform* slice_transform, Logger* logger,
+      uint32_t /* column_family_id */) {
+    return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
+  }
+
+  const char* Name() const override = 0;
+
+  // Return true if the current MemTableRep supports concurrent inserts.
+  // Default: false
+  virtual bool IsInsertConcurrentlySupported() const { return false; }
+
+  // Return true if the current MemTableRep supports detecting duplicate
+  // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
+  // false if the <key,seq> already exists.
+  // Default: false
+  virtual bool CanHandleDuplicatedKey() const { return false; }
+};
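+
+// [Editor's note] A brief usage sketch, not part of the original header:
+// since MemTableRepFactory is Customizable, a factory can also be built from
+// a string id, such as the "skip_list" and "vector" nicknames declared below.
+//
+//   rocksdb::ConfigOptions config;
+//   std::shared_ptr<rocksdb::MemTableRepFactory> factory;
+//   rocksdb::Status s = rocksdb::MemTableRepFactory::CreateFromString(
+//       config, "skip_list", &factory);
+//   assert(s.ok());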
+
+// This factory uses a skip list to store keys. It is the default.
+//
+// Parameters:
+//   lookahead: If non-zero, each iterator's seek operation will start the
+//     search from the previously visited record (doing at most 'lookahead'
+//     steps). This is an optimization for the access pattern including many
+//     seeks with consecutive keys.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  explicit SkipListFactory(size_t lookahead = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "SkipListFactory"; }
+  static const char* kNickName() { return "skip_list"; }
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override;
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+
+  bool IsInsertConcurrentlySupported() const override { return true; }
+
+  bool CanHandleDuplicatedKey() const override { return true; }
+
+ private:
+  size_t lookahead_;
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying vector will have space for
+//     at least count entries reserved.
+class VectorRepFactory : public MemTableRepFactory {
+  size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "VectorRepFactory"; }
+  static const char* kNickName() { return "vector"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+};
+
+// This factory creates memtables with a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4);
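+
+// [Editor's note] A minimal configuration sketch, not part of the original
+// header, wiring the factories above into Options. The prefix length of 8 is
+// illustrative only; NewHashSkipListRepFactory requires a prefix_extractor.
+//
+//   rocksdb::Options options;
+//   // Default skip list, with an iterator lookahead of 8 steps:
+//   options.memtable_factory.reset(new rocksdb::SkipListFactory(8));
+//   // Or a hash-skip-list memtable, keyed by an 8-byte prefix:
+//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
+//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());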
+
+// This factory creates memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or a skip list if the number of entries inside the bucket exceeds
+// threshold_use_skiplist.
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to
+//                      reserve huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if number of entries in one bucket
+//                                    exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log distribution of number of
+//                                 entries when flushing.
+// @threshold_use_skiplist: a bucket switches to skip list if number of
+//                          entries exceeds this parameter.
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+    int bucket_entries_logging_threshold = 4096,
+    bool if_log_bucket_dist_when_flash = true,
+    uint32_t threshold_use_skiplist = 256);
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h
new file mode 100644
index 000000000..ae795220b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/merge_operator.h
@@ -0,0 +1,265 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...).
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//     two values, and merge them into one value, which is then put back
+//     into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//     operations; one method (FullMergeV2) to merge a Put/Delete value with a
+//     merge operand; and another method (PartialMerge) that merges multiple
+//     operands together. This is especially useful if your key values have
+//     complex structures but you would still like to support client-specific
+//     incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to the rocksdb-merge wiki for more details and example
+// implementations.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class MergeOperator : public Customizable {
+ public:
+  virtual ~MergeOperator() {}
+  static const char* Type() { return "MergeOperator"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& id,
+                                 std::shared_ptr<MergeOperator>* result);
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing: (IN)    null indicates that the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here.
+  //                   The string that new_value is pointing to will be empty.
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because the client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                         const std::deque<std::string>& /*operand_list*/,
+                         std::string* /*new_value*/, Logger* /*logger*/) const {
+    // deprecated, please use FullMergeV2()
+    assert(false);
+    return false;
+  }
+
+  struct MergeOperationInput {
+    // If user-defined timestamp is enabled, `_key` includes timestamp.
+    explicit MergeOperationInput(const Slice& _key,
+                                 const Slice* _existing_value,
+                                 const std::vector<Slice>& _operand_list,
+                                 Logger* _logger)
+        : key(_key),
+          existing_value(_existing_value),
+          operand_list(_operand_list),
+          logger(_logger) {}
+
+    // The key associated with the merge operation.
+    const Slice& key;
+    // The existing value of the current key, nullptr means that the
+    // value doesn't exist.
+    const Slice* existing_value;
+    // A list of operands to apply.
+    const std::vector<Slice>& operand_list;
+    // Logger could be used by client to log any errors that happen during
+    // the merge operation.
+    Logger* logger;
+  };
+
+  struct MergeOperationOutput {
+    explicit MergeOperationOutput(std::string& _new_value,
+                                  Slice& _existing_operand)
+        : new_value(_new_value), existing_operand(_existing_operand) {}
+
+    // Client is responsible for filling the merge result here.
+    std::string& new_value;
+    // If the merge result is one of the existing operands (or existing_value),
+    // client can set this field to the operand (or existing_value) instead of
+    // using new_value.
+    Slice& existing_operand;
+  };
+
+  // This function applies a stack of merge operands in chronological order
+  // on top of an existing value. There are two ways in which this method is
+  // used:
+  //  a) During a Get() operation, it is used to calculate the final value of
+  //     a key.
+  //  b) During compaction, in order to collapse some operands with the base
+  //     value.
+  //
+  // Note: The name of the method is somewhat misleading, as both in the cases
+  // of Get() or compaction it may be called on a subset of operands:
+  // K:    0    +1    +2    +7    +4    +5    2    +1    +2
+  //                               ^
+  //                               |
+  //                           snapshot
+  // In the example above, the Get(K) operation will call FullMerge with a base
+  // value of 2 and operands [+1, +2]. The compaction process might decide to
+  // collapse the beginning of the history up to the snapshot by performing a
+  // full Merge with base value of 0 and operands [+1, +2, +7, +4].
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const;
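+
+  // [Editor's note] A sketch of a FullMergeV2 override, not part of the
+  // original header: a comma-separated string-append operator built on the
+  // structs above. The delimiter handling is illustrative only.
+  //
+  //   bool FullMergeV2(const MergeOperationInput& merge_in,
+  //                    MergeOperationOutput* merge_out) const override {
+  //     merge_out->new_value.clear();
+  //     if (merge_in.existing_value != nullptr) {
+  //       merge_out->new_value.assign(merge_in.existing_value->data(),
+  //                                   merge_in.existing_value->size());
+  //     }
+  //     for (const Slice& operand : merge_in.operand_list) {
+  //       if (!merge_out->new_value.empty()) merge_out->new_value += ',';
+  //       merge_out->new_value.append(operand.data(), operand.size());
+  //     }
+  //     return true;
+  //   }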
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key, left_op), followed by DB::Merge(key, right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility. Any subclass of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more efficient to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMergeV2()
+  // function and return false there. The default implementation of
+  // PartialMerge will always return false.
+  virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+                            const Slice& /*right_operand*/,
+                            std::string* /*new_value*/,
+                            Logger* /*logger*/) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that
+  // is saved into *new_value, and then it should return true. *new_value
+  // should be constructed such that a call to DB::Merge(key, *new_value)
+  // would yield the same result as sequential individual calls to
+  // DB::Merge(key, operand) for each operand in operand_list from front() to
+  // back().
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The PartialMergeMulti function will be called when there are at least two
+  // operands.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands. Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge
+  // which serves as the helper function of the default PartialMergeMulti.
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const override = 0;
+
+  // Determines whether the PartialMerge can be called with just a single
+  // merge operand.
+  // Override and return true for allowing a single operand. PartialMerge
+  // and PartialMergeMulti should be overridden and implemented
+  // correctly to properly handle a single operand.
+  virtual bool AllowSingleOperand() const { return false; }
+
+  // Allows the client to control when to invoke a full merge during Get.
+ // This could be used to limit the number of merge operands that are looked at + // during a point lookup, thereby helping in limiting the number of levels to + // read from. + // Doesn't help with iterators. + // + // Note: the merge operands are passed to this function in the reversed order + // relative to how they were merged (passed to FullMerge or FullMergeV2) + // for performance reasons, see also: + // https://github.com/facebook/rocksdb/issues/3865 + virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const { + return false; + } +}; + +// The simpler, associative merge operator. +class AssociativeMergeOperator : public MergeOperator { + public: + ~AssociativeMergeOperator() override {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // existing_value:(IN) null indicates the key does not exist before this op + // value: (IN) the value to update/merge the existing_value with + // new_value: (OUT) Client is responsible for filling the merge result + // here. The string that new_value is pointing to will be empty. + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. The client should assume that this will be treated + // as an error by the library. + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const = 0; + + private: + // Default implementations of the MergeOperator functions + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& key, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* logger) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h new file mode 100644 index 000000000..0cdffcd5f --- /dev/null +++ b/src/rocksdb/include/rocksdb/metadata.h @@ -0,0 +1,245 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/options.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// Basic identifiers and metadata for a file in a DB. This only includes +// information considered relevant for taking backups, checkpoints, or other +// services relating to DB file storage. +// This is only appropriate for immutable files, such as SST files or all +// files in a backup. See also LiveFileStorageInfo. +struct FileStorageInfo { + // The name of the file within its directory (e.g. "123456.sst") + std::string relative_filename; + // The directory containing the file, without a trailing '/'. This could be + // a DB path, wal_dir, etc. + std::string directory; + + // The id of the file within a single DB. Set to 0 if the file does not have + // a number (e.g. CURRENT) + uint64_t file_number = 0; + // The type of the file as part of a DB. + FileType file_type = kTempFile; + + // File size in bytes. 
See also `trim_to_size`. + uint64_t size = 0; + + // This feature is experimental and subject to change. + Temperature temperature = Temperature::kUnknown; + + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". + std::string file_checksum_func_name; +}; + +// Adds to FileStorageInfo the ability to capture the state of files that +// might change in a running DB. +struct LiveFileStorageInfo : public FileStorageInfo { + // If non-empty, this string represents the "saved" contents of the file + // for the current context. (This field is used for checkpointing CURRENT + // file.) In that case, size == replacement_contents.size() and file on disk + // should be ignored. If empty string, the file on disk should still have + // "saved" contents. (See trim_to_size.) + std::string replacement_contents; + + // If true, the file on disk is allowed to be larger than `size` but only + // the first `size` bytes should be used for the current context. If false, + // the file is corrupt if size on disk does not equal `size`. + bool trim_to_size = false; +}; + +// The metadata that describes an SST file. (Does not need to extend +// LiveFileStorageInfo because SST files are always immutable.) +struct SstFileMetaData : public FileStorageInfo { + SstFileMetaData() { file_type = kTableFile; } + + SstFileMetaData(const std::string& _file_name, uint64_t _file_number, + const std::string& _directory, uint64_t _size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, uint64_t _num_reads_sampled, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + std::string& _file_checksum, + std::string& _file_checksum_func_name) + : smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno), + smallestkey(_smallestkey), + largestkey(_largestkey), + num_reads_sampled(_num_reads_sampled), + being_compacted(_being_compacted), + num_entries(0), + num_deletions(0), + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) { + if (!_file_name.empty()) { + if (_file_name[0] == '/') { + relative_filename = _file_name.substr(1); + name = _file_name; // Deprecated field + } else { + relative_filename = _file_name; + name = std::string("/") + _file_name; // Deprecated field + } + assert(relative_filename.size() + 1 == name.size()); + assert(relative_filename[0] != '/'); + assert(name[0] == '/'); + } + directory = _directory; + db_path = _directory; // Deprecated field + file_number = _file_number; + file_type = kTableFile; + size = _size; + temperature = _temperature; + file_checksum = _file_checksum; + file_checksum_func_name = _file_checksum_func_name; + } + + SequenceNumber smallest_seqno = 0; // Smallest sequence number in file. + SequenceNumber largest_seqno = 0; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. 
+  std::string largestkey;   // Largest user defined key in the file.
+  uint64_t num_reads_sampled = 0;  // How many times the file is read.
+  bool being_compacted =
+      false;  // true if the file is currently being compacted.
+
+  uint64_t num_entries = 0;
+  uint64_t num_deletions = 0;
+
+  uint64_t oldest_blob_file_number = 0;  // The id of the oldest blob file
+                                         // referenced by the file.
+  // An SST file may be generated by compactions whose input files may
+  // in turn be generated by earlier compactions. The creation time of the
+  // oldest SST file that is the compaction ancestor of this file.
+  // The timestamp is provided by SystemClock::GetCurrentTime().
+  // 0 if the information is not available.
+  //
+  // Note: for TTL blob files, it contains the start of the expiration range.
+  uint64_t oldest_ancester_time = 0;
+  // Timestamp when the SST file is created, provided by
+  // SystemClock::GetCurrentTime(). 0 if the information is not available.
+  uint64_t file_creation_time = 0;
+
+  // DEPRECATED: The name of the file within its directory with a
+  // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
+  // instead.
+  std::string name;
+
+  // DEPRECATED: replaced by `directory` in base struct
+  std::string db_path;
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  int level;                       // Level at which this file resides.
+  LiveFileMetaData() : column_family_name(), level(0) {}
+};
+
+// The MetaData that describes a Blob file
+struct BlobMetaData {
+  BlobMetaData()
+      : blob_file_number(0),
+        blob_file_size(0),
+        total_blob_count(0),
+        total_blob_bytes(0),
+        garbage_blob_count(0),
+        garbage_blob_bytes(0) {}
+
+  BlobMetaData(uint64_t _file_number, const std::string& _file_name,
+               const std::string& _file_path, uint64_t _file_size,
+               uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+               uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes,
+               const std::string& _file_checksum,
+               const std::string& _file_checksum_func_name)
+      : blob_file_number(_file_number),
+        blob_file_name(_file_name),
+        blob_file_path(_file_path),
+        blob_file_size(_file_size),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes),
+        checksum_method(_file_checksum_func_name),
+        checksum_value(_file_checksum) {}
+  uint64_t blob_file_number;
+  std::string blob_file_name;
+  std::string blob_file_path;
+  uint64_t blob_file_size;
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+  std::string checksum_method;
+  std::string checksum_value;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+  LevelMetaData(int _level, uint64_t _size,
+                const std::vector<SstFileMetaData>&& _files)
+      : level(_level), size(_size), files(_files) {}
+
+  // The level which this meta data describes.
+  const int level;
+  // The size of this level in bytes, which is equal to the sum of
+  // the file size of its "files".
+  const uint64_t size;
+  // The metadata of all sst files in this level.
+  const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), file_count(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector<LevelMetaData>&& _levels) + : size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector<LevelMetaData> levels; + + // The total size of all blob files + uint64_t blob_file_size = 0; + // The number of blob files in this column family. + size_t blob_file_count = 0; + // The metadata of the blobs in this column family. + std::vector<BlobMetaData> blob_files; +}; + +// Metadata returned as output from ExportColumnFamily() and used as input to +// CreateColumnFamiliesWithImport(). +struct ExportImportFilesMetaData { + std::string db_comparator_name; // Used to safety check at import. + std::vector<LiveFileMetaData> files; // Vector of file metadata. +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h new file mode 100644 index 000000000..7a4d8b5a6 --- /dev/null +++ b/src/rocksdb/include/rocksdb/options.h @@ -0,0 +1,2113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include <limits> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/advanced_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/listener.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" + +#ifdef max +#undef max +#endif + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class CompactionFilter; +class CompactionFilterFactory; +class Comparator; +class ConcurrentTaskLimiter; +class Env; +enum InfoLogLevel : unsigned char; +class SstFileManager; +class FilterPolicy; +class Logger; +class MergeOperator; +class Snapshot; +class MemTableRepFactory; +class RateLimiter; +class Slice; +class Statistics; +class InternalKeyComparator; +class WalFilter; +class FileSystem; + +struct Options; +struct DbPath; + +using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>; + +struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { + // The function recovers options to a previous version. Only 4.6 or later + // versions are supported. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. 
+  ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
+                                   int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in to be used as the block cache.
+  ColumnFamilyOptions* OptimizeForSmallDb(
+      std::shared_ptr<Cache>* cache = nullptr);
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls.
+  //
+  // Not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions. As a starting point for tuning
+  // RocksDB options, use the following two functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during periods
+  // of high write rate.
+  //
+  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
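+
+  // [Editor's note] A common starting configuration, an editorial sketch not
+  // part of the original header; the path and thread count are illustrative.
+  //
+  //   rocksdb::Options options;
+  //   options.IncreaseParallelism(8);
+  //   options.OptimizeLevelStyleCompaction();
+  //   options.create_if_missing = true;
+  //   rocksdb::DB* db = nullptr;
+  //   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);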
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator = BytewiseComparator();
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  std::shared_ptr<MergeOperator> merge_operator = nullptr;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new `CompactionFilter` to be used for different
+  // compaction runs and/or requires a `CompactionFilter` for table file
+  // creations outside of compaction, it can specify compaction_filter_factory
+  // instead of this option. The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // the client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter = nullptr;
+
+  // This is a factory that provides `CompactionFilter` objects which allow
+  // an application to modify/delete a key-value during table file creation.
+  //
+  // Unlike the `compaction_filter` option, which is used when compaction
+  // creates a table file, this factory allows using a `CompactionFilter` when
+  // a table file is created for various reasons. The factory can decide what
+  // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+  // default the decision is to use a `CompactionFilter` for
+  // `TableFileCreationReason::kCompaction` only.
+  //
+  // Each thread of work involving creating table files will create a new
+  // `CompactionFilter` when one is to be used according to the above
+  // `TableFileCreationReason`-based decision. This allows the application to
+  // know about the different ongoing threads of work and makes it unnecessary
+  // for `CompactionFilter` to provide thread-safety.
+  //
+  // Default: nullptr
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time, so you may wish to adjust this parameter to control
+  // memory usage. Also, a larger write buffer will result in a longer
+  // recovery time the next time the database is opened.
+  //
+  // Note that write_buffer_size is enforced per column family.
+  // See db_write_buffer_size for sharing memory across column families.
+  //
+  // Default: 64MB
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t write_buffer_size = 64 << 20;
+
+  // Compress blocks using the specified compression algorithm.
+  //
+  // Default: kSnappyCompression, if it's supported. If snappy is not linked
+  // with the library, the default is kNoCompression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  //
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression. Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  //
+  // If you do not set `compression_opts.level`, or set it to
+  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick
+  // the default corresponding to `compression` as follows:
+  //
+  // - kZSTD: 3
+  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+  // - kLZ4HCCompression: 0
+  // - For all others, we do not specify a compression level
+  //
+  // Dynamically changeable through SetOptions() API
+  CompressionType compression;
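+
+  // [Editor's note] An editorial sketch, not part of the original header,
+  // showing one way to combine the compression options here: ZSTD with a
+  // dictionary, plus a stronger setting for the bottommost level. The values
+  // are illustrative only.
+  //
+  //   rocksdb::ColumnFamilyOptions cf;
+  //   cf.compression = rocksdb::kZSTD;
+  //   cf.compression_opts.max_dict_bytes = 16 * 1024;
+  //   cf.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
+  //   cf.bottommost_compression = rocksdb::kZSTD;
+  //   cf.bottommost_compression_opts.enabled = true;
+  //   cf.bottommost_compression_opts.level = 9;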
+
+  // Compression algorithm that will be used for the bottommost level that
+  // contains files. The behavior for num_levels = 1 is not well defined.
+  // Right now, with num_levels = 1, all compaction outputs will use
+  // bottommost_compression and all flush outputs still use
+  // options.compression, but the behavior is subject to change.
+  //
+  // Default: kDisableCompressionOption (Disabled)
+  CompressionType bottommost_compression = kDisableCompressionOption;
+
+  // Different options for compression algorithms used by
+  // bottommost_compression if it is enabled. To enable it, please see the
+  // definition of CompressionOptions. Behavior for num_levels = 1 is the same
+  // as options.bottommost_compression.
+  CompressionOptions bottommost_compression_opts;
+
+  // Different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // Number of files to trigger level-0 compaction. A value < 0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_file_num_compaction_trigger = 4;
+
+  // If non-nullptr, use the specified function to put keys in contiguous
+  // groups called "prefixes". These prefixes are used to place one
+  // representative entry for the group into the Bloom filter
+  // rather than an entry for each key (see whole_key_filtering).
+  // Under certain conditions, this enables optimizing some range queries
+  // (Iterators) in addition to some point lookups (Get/MultiGet).
+  //
+  // Together `prefix_extractor` and `comparator` must satisfy one essential
+  // property for valid prefix filtering of range queries:
+  //   If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+  //      InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+  //   Then InDomain(k2) and prefix(k2) == prefix(k1)
+  //
+  // In other words, all keys with the same prefix must be in a contiguous
+  // group by comparator order, and cannot be interrupted by keys with no
+  // prefix ("out of domain"). (This makes it valid to conclude that no
+  // entries within some bounds are present if the upper and lower bounds
+  // have a common prefix and no entries with that same prefix are present.)
+  //
+  // Some other properties are recommended but not strictly required. Under
+  // most sensible comparators, the following will need to hold true to
+  // satisfy the essential property above:
+  // * "Prefix is a prefix": key.starts_with(prefix(key))
+  // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+  //   Compare(prefix(k1), prefix(k2)) <= 0
+  //
+  // The next two properties ensure that seeking to a prefix allows
+  // enumerating all entries with that prefix:
+  // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+  // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
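+
+  // [Editor's note] An editorial sketch of prefix-based iteration, not part
+  // of the original header; the 8-byte prefix and key shape are illustrative.
+  //
+  //   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
+  //   rocksdb::ReadOptions ro;
+  //   ro.prefix_same_as_start = true;
+  //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
+  //   for (it->Seek("user0042"); it->Valid(); it->Next()) {
+  //     // Visits only keys that share the 8-byte prefix "user0042".
+  //   }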
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 200MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 200MB, total file size for level-2 will be 2GB,
+  // and total file size for level-3 will be 20GB.
+  //
+  // Default: 256MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_bytes_for_level_base = 256 * 1048576;
+
+  // Deprecated.
+  uint64_t snap_refresh_nanos = 0;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family.
+  //
+  // Dynamically changeable through SetOptions() API
+  bool disable_auto_compactions = false;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a block-based table factory that provides a default
+  // implementation of TableBuilder and TableReader with default
+  // BlockBasedTableOptions.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // A list of paths where SST files for this column family
+  // can be put into, with its target size. Similar to db_paths,
+  // newer data is placed into paths specified earlier in the
+  // vector while older data gradually moves to paths specified
+  // later in the vector.
+  // Note that, if a path is supplied to multiple column
+  // families, it would have files and total size from all
+  // the column families combined. User should provision for the
+  // total size (from all the column families) in such cases.
+  //
+  // If left empty, db_paths will be used.
+  // Default: empty
+  std::vector<DbPath> cf_paths;
+
+  // Compaction concurrent thread limiter for the column family.
+  // If non-nullptr, use given concurrent thread limiter to control
+  // the max outstanding compaction tasks. Limiter can be shared with
+  // multiple column families across db instances.
+  //
+  // Default: nullptr
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
+
+  // If non-nullptr, use the specified factory for a function to determine the
+  // partitioning of sst files. This helps compaction to split the files
+  // on interesting boundaries (key prefixes) to make propagation of sst
+  // files less write amplifying (covering the whole key space).
+  // THE FEATURE IS STILL EXPERIMENTAL
+  //
+  // Default: nullptr
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+enum class WALRecoveryMode : char {
+  // Original LevelDB recovery
+  //
+  // We tolerate the last record in any log to be incomplete due to a crash
+  // while writing it. Zeroed bytes from preallocation are also tolerated in
+  // the trailing data of any log.
+  //
+  // Use case: Applications for which updates, once applied, must not be rolled
+  // back even after a crash-recovery. In this recovery mode, RocksDB
+  // guarantees this as long as `WritableFile::Append()` writes are durable. In
+  // case the user needs the guarantee in more situations (e.g., when
+  // `WritableFile::Append()` writes to page cache, but the user desires this
+  // guarantee in face of power-loss crash-recovery), RocksDB offers various
+  // mechanisms to additionally invoke `WritableFile::Sync()` in order to
+  // strengthen the guarantee.
+  //
+  // This differs from `kPointInTimeRecovery` in that, in case a corruption is
+  // detected during recovery, this mode will refuse to open the DB. Whereas,
+  // `kPointInTimeRecovery` will stop recovery just before the corruption since
+  // that is a valid point-in-time to which to recover.
+  kTolerateCorruptedTailRecords = 0x00,
+  // Recover from clean shutdown
+  // We don't expect to find any corruption in the WAL
+  // Use case: This is ideal for unit tests and rare applications that
+  // require a high consistency guarantee
+  kAbsoluteConsistency = 0x01,
+  // Recover to point-in-time consistency (default)
+  // We stop the WAL playback on discovering WAL inconsistency
+  // Use case: Ideal for systems with a disk controller cache, such as a hard
+  // disk or an SSD without a super capacitor, that may lose recently written
+  // data
+  kPointInTimeRecovery = 0x02,
+  // Recovery after a disaster
+  // We ignore any corruption in the WAL and try to salvage as much data as
+  // possible
+  // Use case: Ideal as a last-ditch effort to recover data, or for systems
+  // that operate on low-grade, unrelated data
+  kSkipAnyCorruptedRecords = 0x03,
+};
+
+struct DbPath {
+  std::string path;
+  uint64_t target_size;  // Target size of total files under the path,
+                         // in bytes.
+
+  DbPath() : target_size(0) {}
+  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
+extern const char* kHostnameForDbHostId;
+
+enum class CompactionServiceJobStatus : char {
+  kSuccess,
+  kFailure,
+  kUseLocal,
+};
+
+struct CompactionServiceJobInfo {
+  std::string db_name;
+  std::string db_id;
+  std::string db_session_id;
+  uint64_t job_id;  // job_id is only unique within the current DB and session;
+                    // restarting the DB will reset the job_id. `db_id` and
+                    // `db_session_id` can help you build a unique id across
+                    // different DBs and sessions.
+
+  Env::Priority priority;
+
+  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+                           std::string db_session_id_, uint64_t job_id_,
+                           Env::Priority priority_)
+      : db_name(std::move(db_name_)),
+        db_id(std::move(db_id_)),
+        db_session_id(std::move(db_session_id_)),
+        job_id(job_id_),
+        priority(priority_) {}
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+  static const char* Type() { return "CompactionService"; }
+
+  // Returns the name of this compaction service.
+  const char* Name() const override = 0;
+
+  // Start the remote compaction with `compaction_service_input`, which can be
+  // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+  // information the user might want to know, which includes `job_id`.
+  virtual CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  // Wait for remote compaction to finish.
+  virtual CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  ~CompactionService() override = default;
+};
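+
+// [Editor's note] A minimal CompactionService stub, an editorial sketch not
+// part of the original header; it declines every job so compactions run
+// locally, which can be a safe default while a remote worker is rolled out.
+//
+//   class LocalFallbackCompactionService : public rocksdb::CompactionService {
+//    public:
+//     const char* Name() const override { return "LocalFallbackService"; }
+//     rocksdb::CompactionServiceJobStatus StartV2(
+//         const rocksdb::CompactionServiceJobInfo& /*info*/,
+//         const std::string& /*compaction_service_input*/) override {
+//       return rocksdb::CompactionServiceJobStatus::kUseLocal;
+//     }
+//   };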
+
+struct DBOptions {
+  // The function recovers options to their values as in version 4.6.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  DBOptions* OldDefaults(int rocksdb_major_version = 4,
+                         int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in, to which the memory used by the
+  // memtables will be charged.
+  DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
+
+#ifndef ROCKSDB_LITE
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that a total of
+  // `total_threads` is used. A good value for `total_threads` is the number
+  // of cores. You almost definitely want to call this function if your system
+  // is bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+#endif  // ROCKSDB_LITE
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing = false;
+
+  // If true, missing column families will be automatically created.
+  // Default: false
+  bool create_missing_column_families = false;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists = false;
+
+  // If true, RocksDB will aggressively check consistency of the data.
+  // Also, if any of the writes to the database fails (Put, Delete, Merge,
+  // Write), the database will switch to read-only mode and fail all other
+  // Write operations.
+  // In most cases you want this to be set to true.
+  // Default: true
+  bool paranoid_checks = true;
+
+  // If true, during memtable flush, RocksDB will validate total entries
+  // read in flush, and compare with counter inserted into it.
+  // The option is here to turn the feature off in case this new validation
+  // feature has a bug.
+  // Default: true
+  bool flush_verify_memtable_count = true;
+
+  // If true, the log numbers and sizes of the synced WALs are tracked
+  // in MANIFEST. During DB recovery, if a synced WAL is missing
+  // from disk, or the WAL's size does not match the recorded size in
+  // MANIFEST, an error will be reported and the recovery will be aborted.
+  //
+  // This is one additional protection against WAL corruption besides the
+  // per-WAL-entry checksum.
+  //
+  // Note that this option does not work with secondary instances.
+  // Currently, only the syncing of closed WALs is tracked. Calling
+  // `DB::SyncWAL()`, etc. or writing with `WriteOptions::sync=true` to sync
+  // the live WAL is not tracked for performance/efficiency reasons.
+  //
+  // Default: false
+  bool track_and_verify_wals_in_manifest = false;
+
+  // If true, verifies the SST unique id between MANIFEST and actual file
+  // each time an SST file is opened. This check ensures an SST file is not
+  // overwritten or misplaced. A corruption error will be reported if a
+  // mismatch is detected, but only when MANIFEST tracks the unique id, which
+  // starts from RocksDB version 7.3. Although the tracked internal unique id
+  // is related to the one returned by GetUniqueIdFromTableProperties, that is
+  // subject to change.
+  // NOTE: verification is currently only done on SST files using block-based
+  // table format.
+  //
+  // Setting to false should only be needed in case of unexpected problems.
+  //
+  // Although an early version of this option opened all SST files for
+  // verification on DB::Open, that is no longer guaranteed. However, as
+  // documented in an above option, if max_open_files is -1, DB will open all
+  // files on DB::Open().
+  //
+  // Default: true
+  bool verify_sst_unique_id_in_manifest = true;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc. In the near
+  // future, support for doing storage operations such as read/write files
+  // through env will be deprecated in favor of file_system (see below)
+  // Default: Env::Default()
+  Env* env = Env::Default();
+
+  // Limits internal file read/write bandwidth:
+  //
+  // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+  // - Compaction requests read and write bandwidth at
+  //   `Env::IOPriority::IO_LOW`
+  // - Reads associated with a `ReadOptions` can be charged at
+  //   `ReadOptions::rate_limiter_priority` (see that option's API doc for
+  //   usage and limitations).
+  // - Writes associated with a `WriteOptions` can be charged at
+  //   `WriteOptions::rate_limiter_priority` (see that option's API doc for
+  //   usage and limitations).
+  //
+  // Rate limiting is disabled if nullptr. If rate limiter is enabled,
+  // bytes_per_sync is set to 1MB by default.
+  //
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> rate_limiter = nullptr;
+
+  // Use to track SST files and control their file deletion rate.
+  //
+  // Features:
+  //  - Throttle the deletion rate of the SST files.
+  //  - Keep track of the total size of all SST files.
+  //  - Set a maximum allowed space limit for SST files that, when reached,
+  //    the DB won't do any further flushes or compactions and will set the
+  //    background error.
+  //  - Can be shared between multiple dbs.
+  // Limitations:
+  //  - Only track and throttle deletes of SST files in
+  //    the first db_path (db_name if db_paths is empty).
+  //
+  // Default: nullptr
+  std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  std::shared_ptr<Logger> info_log = nullptr;
+
+#ifdef NDEBUG
+  InfoLogLevel info_log_level = INFO_LEVEL;
+#else
+  InfoLogLevel info_log_level = DEBUG_LEVEL;
+#endif  // NDEBUG
+
+  // Number of open files that can be used by the DB. You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  //
+  // A high value or -1 for this option can cause high memory usage.
+  // See BlockBasedTableOptions::cache_usage_options to constrain
+  // memory usage in case of block based table format.
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_open_files = -1;
+
+  // If max_open_files is -1, DB will open all files on DB::Open(). You can
+  // use this option to increase the number of threads used to open the files.
+  // Default: 16
+  int max_file_opening_threads = 16;
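+
+  // [Editor's note] An editorial sketch, not part of the original header:
+  // capping background I/O at roughly 100 MB/s. The value is illustrative;
+  // NewGenericRateLimiter is declared in rocksdb/rate_limiter.h.
+  //
+  //   options.rate_limiter.reset(
+  //       rocksdb::NewGenericRateLimiter(100 << 20 /* bytes per second */));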
+  // If set to 0 (default), we will dynamically choose the WAL size limit to
+  // be [sum of all write_buffer_size * max_write_buffer_number] * 4
+  //
+  // For example, with 15 column families, each with
+  //   write_buffer_size = 128 MB
+  //   max_write_buffer_number = 6
+  // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+  //
+  // The RocksDB wiki has some discussion about how the WAL interacts
+  // with memtables and flushing of column families.
+  // https://github.com/facebook/rocksdb/wiki/Column-Families
+  //
+  // This option takes effect only when there is more than one column family;
+  // otherwise, the WAL size is dictated by the write_buffer_size.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t max_total_wal_size = 0;
+
+  // If non-null, then we should collect metrics about database operations
+  std::shared_ptr<Statistics> statistics = nullptr;
+
+  // By default, writes to stable storage use fdatasync (on platforms
+  // where this function is available). If this option is true,
+  // fsync is used instead.
+  //
+  // fsync and fdatasync are equally safe for our purposes and fdatasync is
+  // faster, so it is rarely necessary to set this option. It is provided
+  // as a workaround for kernel/filesystem bugs, such as one that affected
+  // fdatasync with ext4 in kernel versions prior to 3.7.
+  bool use_fsync = false;
+
+  // A list of paths where SST files can be put into, with its target size.
+  // Newer data is placed into paths specified earlier in the vector while
+  // older data gradually moves to paths specified later in the vector.
+  //
+  // For example, if you have a flash device with 10GB allocated for the DB,
+  // as well as a hard drive of 2TB, you should configure it to be:
+  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+  //
+  // The system will try to guarantee data under each path is close to but
+  // not larger than the target size. But current and future file sizes used
+  // in determining where to place a file are based on best-effort
+  // estimation, which means there is a chance that the actual size under the
+  // directory is slightly more than the target size under some workloads.
+  // Users should leave some buffer room for those cases.
+  //
+  // If none of the paths has sufficient room to place a file, the file will
+  // be placed in the last path anyway, regardless of the target size.
+  //
+  // Placing newer data in earlier paths is also best-effort. Users should
+  // expect user files to be placed in higher levels in some extreme cases.
+  //
+  // If left empty, only one path will be used, which is db_name passed when
+  // opening the DB.
+  // Default: empty
+  std::vector<DbPath> db_paths;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non-empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir = "";
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data;
+  // dbname is used as the data dir by default.
+  // If it is non-empty, the log files will be kept in the specified dir.
+  // When destroying the db, all log files in wal_dir and the dir itself
+  // are deleted.
+  std::string wal_dir = "";
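+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of the flash + hard-drive layout described above, with the WAL
+  // kept on the flash device; all paths are placeholders.
+  //
+  //   rocksdb::Options options;
+  //   options.db_paths = {{"/flash_path", 10ull << 30 /* 10GB */},
+  //                       {"/hard_drive", 2ull << 40 /* 2TB */}};
+  //   options.wal_dir = "/flash_path/wal";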
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by the compaction
+  // process will still get automatically deleted on every compaction,
+  // regardless of this setting.
+  //
+  // Default: 6 hours
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
+
+  // Maximum number of concurrent background jobs (compactions and flushes).
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_background_jobs = 2;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions +
+  // max_background_flushes` in the case where the user sets at least one of
+  // `max_background_compactions` or `max_background_flushes` (we replace -1
+  // by 1 in case one option is unset).
+  //
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  //
+  // If you're increasing this, also consider increasing the number of
+  // threads in the LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_background_compactions = -1;
+
+  // This value represents the maximum number of threads that will
+  // concurrently perform a compaction job by breaking it into multiple,
+  // smaller ones that are run simultaneously.
+  // Default: 1 (i.e. no subcompactions)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint32_t max_subcompactions = 1;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions +
+  // max_background_flushes` in the case where the user sets at least one of
+  // `max_background_compactions` or `max_background_flushes`.
+  //
+  // Maximum number of concurrent background memtable flush jobs, submitted
+  // by default to the HIGH priority thread pool. If the HIGH priority thread
+  // pool is configured to have zero threads, flush jobs will share the LOW
+  // priority thread pool with compaction jobs.
+  //
+  // It is important to use both thread pools when the same Env is shared by
+  // multiple db instances. Without a separate pool, long-running compaction
+  // jobs could potentially block memtable flush jobs of other db instances,
+  // leading to unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing the number of
+  // threads in the HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: -1
+  int max_background_flushes = -1;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size = 0;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  // Not supported in ROCKSDB_LITE mode!
+  size_t log_file_time_to_roll = 0;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num = 1000;
+
+  // Recycle log files.
+  // If non-zero, we will reuse previously written log files for new
+  // logs, overwriting the old data.
+  // The value indicates how many such files we will keep around at any point
+  // in time for later use. This is more efficient because the blocks are
+  // already allocated and fdatasync does not need to update the inode after
+  // each write.
+  // Default: 0
+  size_t recycle_log_file_num = 0;
+
+  // The manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is 1GB so that the manifest file can grow, but not
+  // reach the limit of storage capacity.
+  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits = 6;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    than WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds = 0;
+  uint64_t WAL_size_limit_MB = 0;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files. Default is 4MB, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size = 4 * 1024 * 1024;
+
+  // Allow the OS to mmap files for reading sst tables.
+  // Not recommended for 32-bit OS.
+  // When the option is set to true and compression is disabled, the blocks
+  // will not be copied and will be read directly from the mmap-ed memory
+  // area, and the block will not be inserted into the block cache. However,
+  // checksums will still be checked if ReadOptions.verify_checksums is set
+  // to true. This means a checksum check happens every time a block is read,
+  // more often than with the setup where the option is set to false and the
+  // block cache is used. A common use of this option is to run RocksDB on
+  // ramfs, where checksum verification is usually not needed.
+  // Default: false
+  bool allow_mmap_reads = false;
+
+  // Allow the OS to mmap files for writing.
+  // DB::SyncWAL() only works if this is set to false.
+  // Default: false
+  bool allow_mmap_writes = false;
+
+  // Enable direct I/O mode for reads/writes.
+  // These options may or may not improve performance depending on the use
+  // case.
+  //
+  // Files will be opened in "direct I/O" mode,
+  // which means that data read from or written to the disk will not be
+  // cached or buffered. The hardware buffer of the devices may however still
+  // be used. Memory mapped files are not impacted by these parameters.
+
+  // Use O_DIRECT for user and compaction reads.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool use_direct_reads = false;
+
+  // Use O_DIRECT for writes in background flush and compactions.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool use_direct_io_for_flush_and_compaction = false;
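+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of enabling direct I/O for reads and background writes; a larger
+  // compaction readahead (see compaction_readahead_size below) is commonly
+  // configured alongside it.
+  //
+  //   rocksdb::Options options;
+  //   options.use_direct_reads = true;
+  //   options.use_direct_io_for_flush_and_compaction = true;
+  //   options.compaction_readahead_size = 2 << 20;  // 2MB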
+
+  // If false, fallocate() calls are bypassed, which disables file
+  // preallocation. File space preallocation is used to increase the file
+  // write/append performance. By default, RocksDB preallocates space for
+  // WAL, SST and Manifest files; the extra space is truncated when the file
+  // is written.
+  // Warning: if you're using btrfs, we would recommend setting
+  // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+  // allocated space cannot be freed, which could be significant if you have
+  // lots of files. More details about this limitation:
+  // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
+  bool allow_fallocate = true;
+
+  // Disable inheritance of open files by child processes. Default: true
+  bool is_fd_close_on_exec = true;
+
+  // If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  //
+  // Default: 600 (10 min)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  unsigned int stats_dump_period_sec = 600;
+
+  // If not zero, dump rocksdb.stats to RocksDB every
+  // stats_persist_period_sec
+  // Default: 600
+  unsigned int stats_persist_period_sec = 600;
+
+  // If true, automatically persist stats to a hidden column family (column
+  // family name: ___rocksdb_stats_history___) every
+  // stats_persist_period_sec seconds; otherwise, write to an in-memory
+  // struct. Users can query through the `GetStatsHistory` API.
+  // If a user attempts to create a column family with the same name on a DB
+  // which has previously set persist_stats_to_disk to true, the column
+  // family creation will fail, but the hidden column family will survive, as
+  // well as the previously persisted statistics.
+  // When persisting stats to disk, the stat name will be limited to 100
+  // bytes.
+  // Default: false
+  bool persist_stats_to_disk = false;
+
+  // If not zero, periodically take stats snapshots and store in memory; the
+  // memory size for stats snapshots is capped at stats_history_buffer_size
+  // Default: 1MB
+  size_t stats_history_buffer_size = 1024 * 1024;
+
+  // If set to true, will hint the underlying file system that the file
+  // access pattern is random when an sst file is opened.
+  // Default: true
+  bool advise_random_on_open = true;
+
+  // Amount of data to build up in memtables across all column
+  // families before writing to disk.
+  //
+  // This is distinct from write_buffer_size, which enforces a limit
+  // for a single memtable.
+  //
+  // This feature is disabled by default. Specify a non-zero value
+  // to enable it.
+  //
+  // Default: 0 (disabled)
+  size_t db_write_buffer_size = 0;
+
+  // The memory usage of memtables will be reported to this object. The same
+  // object can be passed into multiple DBs and it will track the sum of the
+  // sizes of all the DBs. If the total size of all live memtables of all the
+  // DBs exceeds a limit, a flush will be triggered in the next DB to which
+  // the next write is issued, as long as there is one or more column family
+  // not already flushing.
+  //
+  // If the object is only passed to one DB, the behavior is the same as
+  // db_write_buffer_size. When write_buffer_manager is set, the value set
+  // will override db_write_buffer_size.
+  //
+  // This feature is disabled by default. Specify a non-zero value
+  // to enable it.
+  //
+  // Default: null
+  std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
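+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of capping total memtable memory across two DBs with a shared
+  // WriteBufferManager (declared in rocksdb/write_buffer_manager.h).
+  //
+  //   auto wbm = std::make_shared<rocksdb::WriteBufferManager>(
+  //       512ull << 20);  // 512MB across all DBs sharing this manager
+  //   rocksdb::Options opts1, opts2;
+  //   opts1.write_buffer_manager = wbm;
+  //   opts2.write_buffer_manager = wbm;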
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
+  AccessHint access_hint_on_compaction_start = NORMAL;
+
+  // If non-zero, we perform bigger reads when doing compaction. If you're
+  // running RocksDB on spinning disks, you should set this to at least 2MB.
+  // That way RocksDB's compaction is doing sequential instead of random
+  // reads.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  size_t compaction_readahead_size = 0;
+
+  // This is a maximum buffer size that is used by WinMmapReadableFile in
+  // unbuffered disk I/O mode. We need to maintain an aligned buffer for
+  // reads. We allow the buffer to grow until the specified value and then
+  // for bigger requests allocate one-shot buffers. In unbuffered mode we
+  // always bypass the read-ahead buffer at ReadaheadRandomAccessFile.
+  // When read-ahead is required we then make use of the
+  // compaction_readahead_size value and always try to read ahead. With
+  // read-ahead we always pre-allocate the buffer to the size instead of
+  // growing it up to a limit.
+  //
+  // This option is currently honored only on Windows
+  //
+  // Default: 1 MB
+  //
+  // Special value: 0 - means do not maintain per instance buffer. Allocate
+  //                per request buffer and avoid locking.
+  size_t random_access_max_buffer_size = 1024 * 1024;
+
+  // This is the maximum buffer size that is used by WritableFileWriter.
+  // With direct IO, we need to maintain an aligned buffer for writes.
+  // We allow the buffer to grow until its size hits the limit in buffered
+  // IO and fix the buffer size when using direct IO to ensure alignment of
+  // write requests if the logical sector size is unusual
+  //
+  // Default: 1024 * 1024 (1 MB)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  size_t writable_file_max_buffer_size = 1024 * 1024;
+
+  // Use adaptive mutex, which spins in user space before resorting
+  // to the kernel. This could reduce context switches when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex = false;
+
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background. This operation can be used
+  // to smooth out write I/Os over time. Users shouldn't rely on it for
+  // persistence guarantees.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  //
+  // You may consider using rate_limiter to regulate write rate to device.
+  // When rate limiter is enabled, it automatically sets bytes_per_sync
+  // to 1MB.
+  //
+  // This option applies to table files
+  //
+  // Default: 0, turned off
+  //
+  // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t bytes_per_sync = 0;
+
+  // Same as bytes_per_sync, but applies to WAL files
+  //
+  // Default: 0, turned off
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t wal_bytes_per_sync = 0;
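+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of smoothing out write I/O by syncing incrementally every 1MB
+  // for both SST files and the WAL.
+  //
+  //   rocksdb::Options options;
+  //   options.bytes_per_sync = 1 << 20;      // SST files
+  //   options.wal_bytes_per_sync = 1 << 20;  // WAL files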
+
+  // When true, guarantees WAL files have at most `wal_bytes_per_sync`
+  // bytes submitted for writeback at any given time, and SST files have at
+  // most `bytes_per_sync` bytes pending writeback at any given time. This
+  // can be used to handle cases where processing speed exceeds I/O speed
+  // during file generation, which can lead to a huge sync when the file is
+  // finished, even with `bytes_per_sync` / `wal_bytes_per_sync` properly
+  // configured.
+  //
+  //  - If `sync_file_range` is supported it achieves this by waiting for any
+  //    prior `sync_file_range`s to finish before proceeding. In this way,
+  //    processing (compression, etc.) can proceed uninhibited in the gap
+  //    between `sync_file_range`s, and we block only when I/O falls behind.
+  //  - Otherwise the `WritableFile::Sync` method is used. Note this
+  //    mechanism always blocks, thus preventing the interleaving of I/O and
+  //    processing.
+  //
+  // Note: Enabling this option does not provide any additional persistence
+  // guarantees, as it may use `sync_file_range`, which does not write out
+  // metadata.
+  //
+  // Default: false
+  bool strict_bytes_per_sync = false;
+
+  // A vector of EventListeners whose callback functions will be called
+  // when specific RocksDB events happen.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+
+  // If true, then the status of the threads involved in this DB will
+  // be tracked and available via the GetThreadList() API.
+  //
+  // Default: false
+  bool enable_thread_tracking = false;
+
+  // The limited write rate to the DB if soft_pending_compaction_bytes_limit
+  // or level0_slowdown_writes_trigger is triggered, or we are writing to the
+  // last mem table allowed and we allow more than 3 mem tables. It is
+  // calculated using the size of user write requests before compression.
+  // RocksDB may decide to slow down more if the compaction still
+  // gets behind further.
+  // If the value is 0, we will infer a value from the `rate_limiter` value
+  // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
+  // if users change the rate in `rate_limiter` after the DB is opened,
+  // `delayed_write_rate` won't be adjusted.
+  //
+  // Unit: bytes per second.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t delayed_write_rate = 0;
+
+  // By default, a single write thread queue is maintained. The thread that
+  // reaches the head of the queue becomes the write batch group leader and
+  // is responsible for writing to the WAL and memtable for the batch group.
+  //
+  // If enable_pipelined_write is true, separate write thread queues are
+  // maintained for WAL writes and memtable writes. A write thread first
+  // enters the WAL writer queue and then the memtable writer queue. A
+  // pending thread in the WAL writer queue thus only has to wait for
+  // previous writers to finish their WAL writing but not the memtable
+  // writing. Enabling the feature may improve write throughput and reduce
+  // the latency of the prepare phase of two-phase commit.
+  //
+  // Default: false
+  bool enable_pipelined_write = false;
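+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of enabling pipelined writes together with an explicit delayed
+  // write rate; the 16MB/s figure is an arbitrary example.
+  //
+  //   rocksdb::Options options;
+  //   options.enable_pipelined_write = true;
+  //   options.delayed_write_rate = 16 << 20;  // 16MB/s when writes stall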
+
+  // Setting unordered_write to true trades the immutability guarantee of
+  // snapshots for higher write throughput. This violates the repeatability
+  // one expects from ::Get from a snapshot, as well as ::MultiGet and
+  // Iterator's consistent-point-in-time view property. If the application
+  // cannot tolerate the relaxed guarantees, it can implement its own
+  // mechanisms to work around that and yet benefit from the higher
+  // throughput. Using TransactionDB with the WRITE_PREPARED write policy and
+  // two_write_queues=true is one way to achieve immutable snapshots despite
+  // unordered_write.
+  //
+  // By default, i.e., when it is false, rocksdb does not advance the
+  // sequence number for new snapshots unless all the writes with lower
+  // sequence numbers are already finished. This provides the immutability
+  // that we expect from snapshots. Moreover, since Iterator and MultiGet
+  // internally depend on snapshots, the snapshot immutability results in
+  // Iterator and MultiGet offering a consistent-point-in-time view. If set
+  // to true, although the Read-Your-Own-Write property is still provided,
+  // the snapshot immutability property is relaxed: the writes issued after
+  // the snapshot is obtained (with larger sequence numbers) will still not
+  // be visible to reads from that snapshot; however, there still might be
+  // pending writes (with lower sequence numbers) that will change the state
+  // visible to the snapshot after they land in the memtable.
+  //
+  // Default: false
+  bool unordered_write = false;
+
+  // If true, allow multi-writers to update mem tables in parallel.
+  // Only some memtable factories support concurrent writes; currently it
+  // is implemented only for SkipListFactory. Concurrent memtable writes
+  // are not compatible with inplace_update_support or filter_deletes.
+  // It is strongly recommended to set enable_write_thread_adaptive_yield
+  // if you are going to use this feature.
+  //
+  // Default: true
+  bool allow_concurrent_memtable_write = true;
+
+  // If true, threads synchronizing with the write batch group leader will
+  // wait for up to write_thread_max_yield_usec before blocking on a mutex.
+  // This can substantially improve throughput for concurrent workloads,
+  // regardless of whether allow_concurrent_memtable_write is enabled.
+  //
+  // Default: true
+  bool enable_write_thread_adaptive_yield = true;
+
+  // The maximum limit of the number of bytes that are written in a single
+  // batch of WAL or memtable writes. It is followed when the leader write
+  // size is larger than 1/8 of this limit.
+  //
+  // Default: 1 MB
+  uint64_t max_write_batch_group_size_bytes = 1 << 20;
+
+  // The maximum number of microseconds that a write operation will use
+  // a yielding spin loop to coordinate with other write threads before
+  // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
+  // set properly) increasing this value is likely to increase RocksDB
+  // throughput at the expense of increased CPU usage.
+  //
+  // Default: 100
+  uint64_t write_thread_max_yield_usec = 100;
+
+  // The latency in microseconds after which a std::this_thread::yield
+  // call (sched_yield on Linux) is considered to be a signal that
+  // other processes or threads would like to use the current core.
+  // Increasing this makes writer threads more likely to take CPU
+  // by spinning, which will show up as an increase in the number of
+  // involuntary context switches.
+  //
+  // Default: 3
+  uint64_t write_thread_slow_yield_usec = 3;
+
+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decisions by loading table properties from many files.
+  // Skipping this update will improve DB::Open time, especially in a disk
+  // environment.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open = false;
+
+  // If true, then DB::Open() will not fetch and check sizes of all sst
+  // files. This may significantly speed up startup if there are many sst
+  // files, especially when using a non-default Env with an expensive
+  // GetFileSize(). We'll still check that all required sst files exist.
+  // If paranoid_checks is false, this option is ignored, and sst files are
+  // not checked at all.
+  //
+  // Default: false
+  bool skip_checking_sst_file_sizes_on_db_open = false;
+
+  // Recovery mode to control the consistency while replaying WAL
+  // Default: kPointInTimeRecovery
+  WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+  // If set to false then recovery will fail when a prepared
+  // transaction is encountered in the WAL
+  bool allow_2pc = false;
+
+  // A global cache for table-level rows.
+  // Default: nullptr (disabled)
+  // Not supported in ROCKSDB_LITE mode!
+  std::shared_ptr<Cache> row_cache = nullptr;
+
+#ifndef ROCKSDB_LITE
+  // A filter object supplied to be invoked while processing write-ahead-logs
+  // (WALs) during recovery. The filter provides a way to inspect log
+  // records, ignoring a particular record or skipping replay.
+  // The filter is invoked at startup and is currently invoked from a single
+  // thread.
+  WalFilter* wal_filter = nullptr;
+#endif  // ROCKSDB_LITE
+
+  // If true, then DB::Open / CreateColumnFamily / DropColumnFamily /
+  // SetOptions will fail if the options file is not properly persisted.
+  //
+  // DEFAULT: false
+  bool fail_if_options_file_error = false;
+
+  // If true, then print malloc stats together with rocksdb.stats
+  // when printing to LOG.
+  // DEFAULT: false
+  bool dump_malloc_stats = false;
+
+  // By default RocksDB replays WAL logs and flushes them on DB open, which
+  // may create very small SST files. If this option is enabled, RocksDB will
+  // try to avoid (but not guarantee not to) flush during recovery. Also,
+  // existing WAL logs will be kept, so that if a crash happens before flush,
+  // we still have logs to recover from.
+  //
+  // DEFAULT: false
+  bool avoid_flush_during_recovery = false;
+
+  // By default RocksDB will flush all memtables on DB close if there is
+  // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
+  // speed up DB close. Unpersisted data WILL BE LOST.
+  //
+  // DEFAULT: false
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  bool avoid_flush_during_shutdown = false;
+
+  // Set this option to true during creation of the database if you want
+  // to be able to ingest behind (call IngestExternalFile() skipping keys
+  // that already exist, rather than overwriting matching keys).
+  // Setting this option to true will affect 2 things:
+  // 1) Disable some internal optimizations around SST file compression.
+  // 2) Reserve the bottom-most level for ingested files only.
+  // Also note that num_levels should be >= 3 if this option is turned on.
+  //
+  // DEFAULT: false
+  // Immutable.
+  bool allow_ingest_behind = false;
+
+  // If enabled it uses two queues for writes, one for the ones with
+  // disable_memtable and one for the ones that also write to memtable. This
+  // allows the memtable writes not to lag behind other writes. It can be
+  // used to optimize MySQL 2PC in which only the commits, which are serial,
+  // write to memtable.
+  bool two_write_queues = false;
+
+  // If true, the WAL is not flushed automatically after each write. Instead
+  // it relies on manual invocation of FlushWAL to write the WAL buffer to
+  // its file.
+  bool manual_wal_flush = false;
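+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of batching WAL writes with manual_wal_flush; error handling is
+  // omitted.
+  //
+  //   rocksdb::Options options;
+  //   options.manual_wal_flush = true;
+  //   // ... open the DB, apply a batch of writes, then:
+  //   db->FlushWAL(true /* sync */);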
+
+  // This feature is WORK IN PROGRESS.
+  // If enabled, WAL records will be compressed before they are written.
+  // Only zstd is supported. Compressed WAL records will be read in supported
+  // versions regardless of the wal_compression settings.
+  CompressionType wal_compression = kNoCompression;
+
+  // If true, RocksDB supports flushing multiple column families and
+  // committing their results atomically to MANIFEST. Note that it is not
+  // necessary to set atomic_flush to true if WAL is always enabled since WAL
+  // allows the database to be restored to the last persistent state in WAL.
+  // This option is useful when there are column families with writes NOT
+  // protected by WAL.
+  // For manual flush, the application has to specify which column families
+  // to flush atomically in DB::Flush.
+  // For auto-triggered flush, RocksDB atomically flushes ALL column
+  // families.
+  //
+  // Currently, any WAL-enabled writes after atomic flush may be replayed
+  // independently if the process crashes later and tries to recover.
+  bool atomic_flush = false;
+
+  // If true, working threads may avoid doing unnecessary and long-latency
+  // operations (such as deleting obsolete files directly or deleting
+  // memtables) and will instead schedule a background job to do it.
+  // Use it if you're latency-sensitive.
+  // If set to true, takes precedence over
+  // ReadOptions::background_purge_on_iterator_cleanup.
+  bool avoid_unnecessary_blocking_io = false;
+
+  // Historically the DB ID has always been stored in the Identity file in
+  // the DB folder. If this flag is true, the DB ID is written to the
+  // Manifest file in addition to the Identity file. By doing this 2 problems
+  // are solved:
+  // 1. The Identity file is not checksummed, whereas the Manifest file is.
+  // 2. Since the source of truth for the DB is the Manifest file, the DB ID
+  //    will sit with the source of truth. Previously the Identity file could
+  //    be copied independent of the Manifest and that can result in a wrong
+  //    DB ID.
+  // We recommend setting this flag to true.
+  // Default: false
+  bool write_dbid_to_manifest = false;
+
+  // The number of bytes to prefetch when reading the log. This is mostly
+  // useful for reading a remotely located log, as it can save the number of
+  // round-trips. If 0, then prefetching is disabled.
+  //
+  // Default: 0
+  size_t log_readahead_size = 0;
+
+  // If the user does NOT provide the checksum generator factory, the file
+  // checksum will NOT be used. A new file checksum generator object will be
+  // created when an SST file is created. Therefore, each created
+  // FileChecksumGenerator will only be used from a single thread and so does
+  // not need to be thread-safe.
+  //
+  // Default: nullptr
+  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+  // By default, RocksDB recovery fails if any table/blob file referenced in
+  // the final version reconstructed from the
+  // MANIFEST is missing after scanning the MANIFEST pointed to by the
+  // CURRENT file. It can also fail if verification of the unique SST id
+  // fails. Best-efforts recovery is another recovery mode that does not
+  // necessarily fail when certain table/blob files are missing/corrupted or
+  // have a mismatched unique id table property. Instead, best-efforts
+  // recovery recovers each column family to a point in the MANIFEST that
+  // corresponds to a version. In such a version, all valid table/blob files
+  // referenced have the expected file size. For table files, their unique id
+  // table property matches the MANIFEST.
+  //
+  // Best-efforts recovery does not need a valid CURRENT file, and tries to
+  // recover the database using one of the available MANIFEST files in the db
+  // directory.
+  // Best-efforts recovery tries the available MANIFEST files from high file
+  // numbers (newer) to low file numbers (older), and stops after finding the
+  // first MANIFEST file from which the db can be recovered to a state
+  // without invalid (missing/filesize-mismatch/unique-id-mismatch) table and
+  // blob files. It is possible that the database can be restored to an empty
+  // state with no table or blob files.
+  //
+  // Regardless of this option, the IDENTITY file
+  // is updated if needed during recovery to match the DB ID in the MANIFEST
+  // (if previously using write_dbid_to_manifest) or to be in some valid
+  // state (non-empty DB ID). Currently, not compatible with atomic flush.
+  // Furthermore, WAL files will not be used for recovery if
+  // best_efforts_recovery is true. Also requires either 1) LOCK file exists
+  // or 2) underlying env's LockFile() call returns ok even for a
+  // non-existing LOCK file.
+  //
+  // Default: false
+  bool best_efforts_recovery = false;
+
+  // It defines how many times db resume is called by a separate thread when
+  // background retryable IO Error happens. When background retryable IO
+  // Error happens, SetBGError is called to deal with the error. If the error
+  // can be auto-recovered (e.g., retryable IO Error during Flush or WAL
+  // write), then db resume is called in background to recover from the
+  // error. If this value is 0 or negative, db resume will not be called.
+  //
+  // Default: INT_MAX
+  int max_bgerror_resume_count = INT_MAX;
+
+  // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
+  // This option decides how long to wait to retry the next resume if the
+  // previous resume fails and the conditions for redoing the resume are
+  // satisfied.
+  //
+  // Default: 1000000 (microseconds).
+  uint64_t bgerror_resume_retry_interval = 1000000;
+
+  // It allows users to opt in to error messages containing corrupted
+  // keys/values. Corrupt keys and values will be logged in the
+  // messages/logs/status, which provides users with useful information
+  // regarding affected data. By default the value is set to false to prevent
+  // user data from being exposed in logs/messages etc.
+  //
+  // Default: false
+  bool allow_data_in_errors = false;
+
+  // A string identifying the machine hosting the DB. This
+  // will be written as a property in every SST file written by the DB (or
+  // by offline writers such as SstFileWriter and RepairDB). It can be useful
+  // for troubleshooting memory corruption caused by a failing host when
+  // writing a file, by tracing back to the writing host. These corruptions
+  // may not be caught by the checksum since they happen before checksumming.
+  // If left as default, the table writer will substitute it with the actual
+  // hostname when writing the SST file. If set to an empty string, the
+  // property will not be written to the SST file.
+  //
+  // Default: hostname
+  std::string db_host_id = kHostnameForDbHostId;
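+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of reopening a damaged DB with best-efforts recovery (described
+  // above); error handling is omitted, and recent writes may be lost since
+  // WALs are not replayed in this mode.
+  //
+  //   rocksdb::Options options;
+  //   options.best_efforts_recovery = true;
+  //   rocksdb::DB* db = nullptr;
+  //   rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);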
+
+  // Use this if you want to enable checksum handoff for specific file types
+  // on writes. Make sure that the FileSystem you use supports crc32c
+  // checksum verification.
+  // Currently supported file types: kWALFile, kTableFile, kDescriptorFile.
+  // NOTE: currently RocksDB only generates a crc32c based checksum for the
+  // handoff. If the storage layer has different checksum support, users
+  // should leave this set empty. Otherwise, it may cause unexpected write
+  // failures.
+  FileTypeSet checksum_handoff_file_types;
+
+  // EXPERIMENTAL
+  // CompactionService is a feature that allows the user to run compactions
+  // on a different host or process, which offloads the background load from
+  // the primary host.
+  // It's an experimental feature; the interface will be changed without
+  // backward/forward compatibility support for now. Some known issues are
+  // still under development.
+  std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+  // It indicates the lowest cache tier we want to
+  // use for a certain DB. Currently we support volatile_tier and
+  // non_volatile_tier. They are layered. By setting it to kVolatileTier,
+  // only the block cache (the currently implemented volatile_tier) is used.
+  // So cache entries will not spill to the secondary cache (the currently
+  // implemented non_volatile_tier), and block cache lookup misses will not
+  // look up in the secondary cache. When kNonVolatileBlockTier is used, we
+  // use both the block cache and the secondary cache.
+  //
+  // Default: kNonVolatileBlockTier
+  CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+  // If set to false, when compaction or flush sees a SingleDelete followed
+  // by a Delete for the same user key, the compaction job will not fail.
+  // Otherwise, the compaction job will fail.
+  // This is a temporary option to help existing use cases migrate, and
+  // will be removed in a future release.
+  // Warning: do not set to false unless you are trying to migrate existing
+  // data in which the contract of single delete
+  // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not
+  // enforced, and thus has Delete mixed with SingleDelete for the same user
+  // key. Violation of the contract leads to undefined behaviors with a high
+  // possibility of data inconsistency, e.g. deleted old data becoming
+  // visible again, etc.
+  bool enforce_single_del_contracts = true;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+  // Create an Options object with default values for all fields.
+  Options() : DBOptions(), ColumnFamilyOptions() {}
+
+  Options(const DBOptions& db_options,
+          const ColumnFamilyOptions& column_family_options)
+      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+  // Change to some default settings from an older version.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  Options* OldDefaults(int rocksdb_major_version = 4,
+                       int rocksdb_minor_version = 6);
+
+  void Dump(Logger* log) const;
+
+  void DumpCFOptions(Logger* log) const;
+
+  // Some functions that make it easier to optimize RocksDB
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the
+  // future.
+  //
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before
+  // reading from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
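+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of the bulk-load flow described above; error handling is
+  // omitted.
+  //
+  //   rocksdb::Options options;
+  //   options.PrepareForBulkLoad();
+  //   rocksdb::DB* db = nullptr;
+  //   rocksdb::DB::Open(options, "/path/to/db", &db);
+  //   // ... write the data ...
+  //   db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);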
+
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  Options* OptimizeForSmallDb();
+
+  // Disable some checks that should not be necessary in the absence of
+  // software logic errors or CPU+memory hardware errors. This can improve
+  // write speeds but is only recommended for temporary use. Does not
+  // change protection against corrupt storage (e.g. verify_checksums).
+  Options* DisableExtraChecks();
+};
+
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already present in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or
+                          // storage
+  kBlockCacheTier = 0x1,  // data in memtable or block cache
+  kPersistedTier = 0x2,   // persisted data. When WAL is disabled, this
+                          // option will skip data in memtable.
+                          // Note that this ReadTier currently only supports
+                          // Get and MultiGet and does not support iterators.
+  kMemtableTier = 0x3     // data in memtable. used for memtable-only
+                          // iterators.
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released). If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // `iterate_lower_bound` defines the smallest key at which the backward
+  // iterator can return an entry. Once the bound is passed, Valid() will be
+  // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a
+  // valid entry.
+  //
+  // If prefix_extractor is not null, the Seek target and
+  // `iterate_lower_bound` need to have the same prefix. This is because
+  // ordering is not guaranteed outside of the prefix domain.
+  //
+  // If user-defined timestamps are enabled, iterate_lower_bound should
+  // point to a key without the timestamp part.
+  // Default: nullptr
+  const Slice* iterate_lower_bound;
+
+  // "iterate_upper_bound" defines the extent up to which the forward
+  // iterator can return entries. Once the bound is reached, Valid() will be
+  // false. "iterate_upper_bound" is exclusive, i.e. the bound value is
+  // not a valid entry. If prefix_extractor is not null:
+  // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+  //    to infer whether prefix iterating (e.g. applying prefix bloom filter)
+  //    can be used within RocksDB. This is done by comparing
+  //    iterate_upper_bound with the seek key.
+  // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+  //    effect if it shares the same prefix as the seek key. If
+  //    iterate_upper_bound is outside the prefix of the seek key, then keys
+  //    returned outside the prefix range will be undefined, just as if
+  //    iterate_upper_bound = null.
+  // If iterate_upper_bound is not null, SeekToLast() will position the
+  // iterator at the first key smaller than iterate_upper_bound.
+  //
+  // If user-defined timestamps are enabled, iterate_upper_bound should
+  // point to a key without the timestamp part.
+  // Default: nullptr
+  const Slice* iterate_upper_bound;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two
+  // reads for a table file.
+  // The readahead starts at 8KB and doubles on every additional read up to
+  // 256KB.
+  // This option can help if most of the range scans are large, and if it is
+  // determined that a larger readahead than that enabled by auto-readahead
+  // is needed.
+  // Using a large readahead size (> 2MB) can typically improve the
+  // performance of forward iteration on spinning disks.
+  // Default: 0
+  size_t readahead_size;
+
+  // A threshold for the number of keys that can be skipped before failing an
+  // iterator seek as incomplete. The default value of 0 should be used to
+  // never fail a request as incomplete, even on skipping too many keys.
+  // Default: 0
+  uint64_t max_skippable_internal_keys;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: true
+  bool verify_checksums;
+
+  // Should the "data block"/"index block" read for this iteration be placed
+  // in block cache?
+  // Callers may wish to set this field to false for bulk scans.
+  // This would help not to change the eviction order of existing items in
+  // the block cache.
+  // Default: true
+  bool fill_cache;
+
+  // Specify to create a tailing iterator -- a special iterator that has a
+  // view of the complete database (i.e. it can also be used to read newly
+  // added data) and is optimized for sequential reads. It will return
+  // records that were inserted into the database after the creation of the
+  // iterator.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool tailing;
+
+  // This option is not used anymore. It was used to turn on a functionality
+  // that has been removed.
+  bool managed;
+
+  // Enable a total order seek regardless of the index format (e.g. hash
+  // index) used in the table. Some table formats (e.g. plain table) may not
+  // support this option.
+  // If true when calling Get(), we also skip prefix bloom when reading from
+  // block based tables, which only affects Get() performance.
+  // Default: false
+  bool total_order_seek;
+
+  // When true, by default use total_order_seek = true, and RocksDB can
+  // selectively enable prefix seek mode if it won't generate a different
+  // result from total_order_seek, based on the seek key and the iterator
+  // upper bound.
+  // Not supported in ROCKSDB_LITE mode, in the way that even with value true
+  // prefix mode is not used.
+  // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+  // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+  // the prefix of the upper bound differs from the prefix of the seek key
+  // has a flaw.
+  // If present in the DB, "short keys" (shorter than "full length" prefix)
+  // can be omitted from auto_prefix_mode iteration when they would be
+  // present in total_order_seek iteration, regardless of whether the short
+  // keys are "in domain" of the prefix extractor. This is not an issue if no
+  // short keys are added to the DB or are not expected to be returned by
+  // such iterators. (We are also assuming the new condition on
+  // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+  // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
+  // Default: false
+  bool auto_prefix_mode;
+
+  // Enforce that the iterator only iterates over the same prefix as the
+  // seek.
+  // This option is effective only for prefix seeks, i.e. prefix_extractor
+  // is non-null for the column family and total_order_seek is false. Unlike
+  // iterate_upper_bound, prefix_same_as_start only works within a prefix
+  // but in both directions.
+  // Default: false
+  bool prefix_same_as_start;
+
+  // Keep the blocks loaded by the iterator pinned in memory as long as the
+  // iterator is not deleted. If used when reading from tables created with
+  // BlockBasedTableOptions::use_delta_encoding = false,
+  // the Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed
+  // to return 1.
+  // Default: false
+  bool pin_data;
+
+  // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+  // schedule a background job in the flush job queue and delete obsolete
+  // files in background.
+  // Default: false
+  bool background_purge_on_iterator_cleanup;
+
+  // If true, range tombstone handling will be skipped in key lookup paths.
+  // For DB instances that don't use DeleteRange() calls, this setting can
+  // be used to optimize the read performance.
+  // Note that, if this assumption (of no previous DeleteRange() calls) is
+  // broken, stale keys could be served in read paths.
+  // Default: false
+  bool ignore_range_deletions;
+
+  // A callback to determine whether relevant keys for this scan exist in a
+  // given table based on the table's properties. The callback is passed the
+  // properties of each table during iteration. If the callback returns
+  // false, the table will not be scanned. This option only affects Iterators
+  // and has no impact on point lookups.
+  // Default: empty (every table will be scanned)
+  std::function<bool(const TableProperties&)> table_filter;
+
+  // Timestamp of operation. Read should return the latest data visible to
+  // the specified timestamp. All timestamps of the same database must be of
+  // the same length and format. The user is responsible for providing a
+  // customized compare function via Comparator to order <key, timestamp>
+  // tuples. For iterators, iter_start_ts is the lower bound (older) and
+  // timestamp serves as the upper bound. Versions of the same record that
+  // fall in the timestamp range will be returned. If iter_start_ts is
+  // nullptr, only the most recent version visible to timestamp is returned.
+  // The user-specified timestamp feature is still under active development,
+  // and the API is subject to change.
+  // Default: nullptr
+  const Slice* timestamp;
+  const Slice* iter_start_ts;
+
+  // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+  // in microseconds.
+  // It should be set to microseconds since epoch, i.e., gettimeofday or
+  // equivalent plus allowed duration in microseconds. The best way is to use
+  // env->NowMicros() + some timeout.
+  // This is best-effort. The call may exceed the deadline if there is IO
+  // involved and the file system doesn't support deadlines, or due to
+  // checking for the deadline periodically rather than for every key when
+  // processing a batch.
+  std::chrono::microseconds deadline;
+
+  // A timeout in microseconds to be passed to the underlying FileSystem for
+  // reads. As opposed to deadline, this determines the timeout for each
+  // individual file read request. If a MultiGet/Get/Seek/Next etc. call
+  // results in multiple reads, each read can last up to io_timeout us.
+  std::chrono::microseconds io_timeout;
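+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of a bounded, snapshot-consistent forward scan using the fields
+  // above; the key range is a placeholder.
+  //
+  //   rocksdb::ReadOptions ro;
+  //   ro.snapshot = db->GetSnapshot();
+  //   rocksdb::Slice upper("user_9999");
+  //   ro.iterate_upper_bound = &upper;
+  //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
+  //   for (it->Seek("user_0000"); it->Valid(); it->Next()) { /* use it */ }
+  //   db->ReleaseSnapshot(ro.snapshot);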
+
+  // It limits the maximum cumulative value size of the keys in a batch while
+  // reading through MultiGet. Once the cumulative value size exceeds this
+  // soft limit then all the remaining keys are returned with status Aborted.
+  //
+  // Default: std::numeric_limits<uint64_t>::max()
+  uint64_t value_size_soft_limit;
+
+  // For iterators, RocksDB does auto-readahead on noticing more than two
+  // sequential reads for a table file if the user doesn't provide
+  // readahead_size. The readahead starts at 8KB and doubles on every
+  // additional read up to max_auto_readahead_size only when reads are
+  // sequential. However, at each level, if the iterator moves to the next
+  // file, readahead_size starts again from 8KB.
+  //
+  // By enabling this option, RocksDB will do some enhancements for
+  // prefetching the data.
+  //
+  // Default: false
+  bool adaptive_readahead;
+
+  // For file reads associated with this option, charge the internal rate
+  // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+  // special value `Env::IO_TOTAL` disables charging the rate limiter.
+  //
+  // The rate limiting is bypassed, regardless of this option's value, for
+  // file reads on plain tables (these can exist when
+  // `ColumnFamilyOptions::table_factory` is a `PlainTableFactory`) and
+  // cuckoo tables (these can exist when
+  // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
+  //
+  // The bytes charged to the rate limiter may not exactly match the file
+  // read bytes since there are some seemingly insignificant reads, like for
+  // file headers/footers, that we currently do not charge to the rate
+  // limiter.
+  //
+  // Default: `Env::IO_TOTAL`.
+  Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
+  // Experimental
+  //
+  // If async_io is enabled, RocksDB will prefetch some data asynchronously.
+  // RocksDB applies it to its internal automatic prefetching when reads are
+  // sequential.
+  //
+  // Default: false
+  bool async_io;
+
+  // Experimental
+  //
+  // If async_io is set, then this flag controls whether we read SST files
+  // in multiple levels asynchronously. Enabling this flag can help reduce
+  // MultiGet latency by maximizing the number of SST files read in
+  // parallel if the keys in the MultiGet batch are in different levels. It
+  // comes at the expense of slightly higher CPU overhead.
+  //
+  // Default: true
+  bool optimize_multiget_for_io;
+
+  ReadOptions();
+  ReadOptions(bool cksum, bool cache);
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete. If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost. Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call. A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
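+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch contrasting a fully synced write with the faster default.
+  //
+  //   rocksdb::WriteOptions wo;
+  //   wo.sync = true;  // survives a machine crash, at a latency cost
+  //   db->Put(wo, "key", "value");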
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash. The backup engine
+  // relies on write-ahead logs to back up the memtable, so if
+  // you disable write-ahead logs, you must create backups with
+  // flush_before_backup=true to avoid losing unflushed memtable data.
+  // Default: false
+  bool disableWAL;
+
+  // If true and if the user is trying to write to column families that
+  // don't exist (they were dropped), ignore the write (don't return an
+  // error). If there are multiple writes in a WriteBatch, other writes will
+  // succeed.
+  // Default: false
+  bool ignore_missing_column_families;
+
+  // If true and we need to wait or sleep for the write request, fail
+  // immediately with Status::Incomplete().
+  // Default: false
+  bool no_slowdown;
+
+  // If true, this write request is of lower priority if compaction is
+  // behind. In this case, if no_slowdown = true, the request will be
+  // canceled immediately with Status::Incomplete() returned. Otherwise, it
+  // will be slowed down. The slowdown value is determined by RocksDB to
+  // guarantee it introduces minimal impact on high priority writes.
+  //
+  // Default: false
+  bool low_pri;
+
+  // If true, this writebatch will maintain the last insert positions of each
+  // memtable as hints in concurrent write. It can improve write performance
+  // in concurrent writes if keys in one writebatch are sequential. In
+  // non-concurrent writes (when concurrent_memtable_writes is false) this
+  // option will be ignored.
+  //
+  // Default: false
+  bool memtable_insert_hint_per_batch;
+
+  // For writes associated with this option, charge the internal rate
+  // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+  // special value `Env::IO_TOTAL` disables charging the rate limiter.
+  //
+  // Currently the support covers automatic WAL flushes, which happen during
+  // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+  // when `WriteOptions::disableWAL == false`
+  // and `DBOptions::manual_wal_flush == false`.
+  //
+  // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+  // due to implementation constraints.
+  //
+  // Default: `Env::IO_TOTAL`
+  Env::IOPriority rate_limiter_priority;
+
+  // `protection_bytes_per_key` is the number of bytes used to store
+  // protection information for each key entry. Currently supported values
+  // are zero (disabled) and eight.
+  //
+  // Default: zero (disabled).
+  size_t protection_bytes_per_key;
+
+  WriteOptions()
+      : sync(false),
+        disableWAL(false),
+        ignore_missing_column_families(false),
+        no_slowdown(false),
+        low_pri(false),
+        memtable_insert_hint_per_batch(false),
+        rate_limiter_priority(Env::IO_TOTAL),
+        protection_bytes_per_key(0) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush will wait until the flush is done.
+  // Default: true
+  bool wait;
+  // If true, the flush will proceed immediately even if it means writes will
+  // stall for the duration of the flush; if false the operation will wait
+  // until it's possible to do the flush without causing a stall, or until
+  // the required flush is performed by someone else (foreground call or
+  // background thread).
+  // Default: false
+  bool allow_write_stall;
+  FlushOptions() : wait(true), allow_write_stall(false) {}
+};
+
+// Create a Logger from provided DBOptions
+extern Status CreateLoggerFromOptions(const std::string& dbname,
+                                      const DBOptions& options,
+                                      std::shared_ptr<Logger>* logger);
+
+// CompactionOptions are used in CompactFiles() call.
+struct CompactionOptions {
+  // Compaction output compression type
+  // Default: snappy
+  // If set to `kDisableCompressionOption`, RocksDB will choose the
+  // compression type according to the `ColumnFamilyOptions`, taking into
+  // account the output level if `compression_per_level` is specified.
+  CompressionType compression;
+  // Compaction will create files of size `output_file_size_limit`.
+  // Default: MAX, which means that compaction will create a single file
+  uint64_t output_file_size_limit;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions;
+
+  CompactionOptions()
+      : compression(kSnappyCompression),
+        output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+        max_subcompactions(0) {}
+};
+
+// For level based compaction, we can configure if we want to skip/force
+// bottommost level compaction.
+enum class BottommostLevelCompaction {
+  // Skip bottommost level compaction
+  kSkip,
+  // Only compact bottommost level if there is a compaction filter
+  // This is the default option
+  kIfHaveCompactionFilter,
+  // Always compact bottommost level
+  kForce,
+  // Always compact bottommost level but in bottommost level avoid
+  // double-compacting files created in the same compaction
+  kForceOptimized,
+};
+
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+  // Force blob file garbage collection.
+  kForce,
+  // Skip blob file garbage collection.
+  kDisable,
+  // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+  kUseDefault,
+};
+
+// CompactRangeOptions is used by CompactRange() call.
+struct CompactRangeOptions {
+  // If true, no other compaction will run at the same time as this
+  // manual compaction.
+  //
+  // Default: false
+  bool exclusive_manual_compaction = false;
+
+  // If true, compacted files will be moved to the minimum level capable
+  // of holding the data or the given level (specified non-negative
+  // target_level).
+  bool change_level = false;
+  // If change_level is true and target_level has a non-negative value,
+  // compacted files will be moved to target_level.
+  int target_level = -1;
+  // Compaction outputs will be placed in options.db_paths[target_path_id].
+  // Behavior is undefined if target_path_id is out of range.
+  uint32_t target_path_id = 0;
+  // By default level based compaction will only compact the bottommost level
+  // if there is a compaction filter
+  BottommostLevelCompaction bottommost_level_compaction =
+      BottommostLevelCompaction::kIfHaveCompactionFilter;
+  // If true, will execute immediately even if doing so would cause the DB to
+  // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+  bool allow_write_stall = false;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions = 0;
+  // Set the user-defined timestamp low bound; data with timestamps older
+  // than the low bound may be GCed by compaction. Default: nullptr
+  const Slice* full_history_ts_low = nullptr;
+
+  // Allows cancellation of an in-progress manual compaction.
+  //
+  // Cancellation can be delayed waiting on automatic compactions when used
+  // together with `exclusive_manual_compaction == true`.
+  std::atomic<bool>* canceled = nullptr;
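+
+  // For illustration only (not part of the upstream header): a minimal
+  // sketch of a cancelable full-range manual compaction using the field
+  // above.
+  //
+  //   rocksdb::CompactRangeOptions cro;
+  //   std::atomic<bool> canceled{false};
+  //   cro.canceled = &canceled;
+  //   // From another thread: canceled.store(true); aborts the compaction.
+  //   db->CompactRange(cro, nullptr, nullptr);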
+ // Typically, when CompactRange is being called in one thread (t1) with
+ // canceled = false, and DisableManualCompaction is being called in the
+ // other thread (t2), manual compaction is disabled normally, even if the
+ // compaction iterator may still scan a few items before *canceled is
+ // set to true
+
+ // If set to kForce, RocksDB will override
+ // enable_blob_file_garbage_collection to true; if set to kDisable, RocksDB
+ // will override it to false, and kUseDefault leaves the setting in effect.
+ // This enables customers to both force-enable and force-disable GC when
+ // calling CompactRange.
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+     BlobGarbageCollectionPolicy::kUseDefault;
+
+ // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+ // from ColumnFamilyOptions in effect. Otherwise, it will override the
+ // user-provided setting. This enables customers to selectively override the
+ // age cutoff.
+ double blob_garbage_collection_age_cutoff = -1;
+};
+
+// IngestExternalFileOptions is used by IngestExternalFile()
+struct IngestExternalFileOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+ // If set to true, ingestion falls back to copy when move fails.
+ bool failed_move_fall_back_to_copy = true;
+ // If set to false, keys of an ingested file could appear in existing
+ // snapshots that were created before the file was ingested.
+ bool snapshot_consistency = true;
+ // If set to false, IngestExternalFile() will fail if the file key range
+ // overlaps with existing keys or tombstones in the DB.
+ bool allow_global_seqno = true;
+ // If set to false and the file key range overlaps with the memtable key
+ // range (memtable flush required), IngestExternalFile will fail.
+ bool allow_blocking_flush = true;
+ // Set to true if you would like duplicate keys in the file being ingested
+ // to be skipped rather than overwriting existing data under that key.
+ // Use case: back-fill of some historical data in the database without
+ // over-writing existing newer version of data.
+ // This option can only be used if the DB has been running
+ // with allow_ingest_behind=true since its creation.
+ // All files will be ingested at the bottommost level with seqno=0.
+ bool ingest_behind = false;
+ // Set to true if you would like to write global_seqno to a given offset in
+ // the external SST file for backward compatibility. Older versions of
+ // RocksDB wrote a global_seqno to a given offset within ingested SST files,
+ // while new versions of RocksDB do not. If you ingest an external SST using
+ // a new version of RocksDB and would like to be able to downgrade to an
+ // older version of RocksDB, you should set 'write_global_seqno' to true. If
+ // your service is just starting to use the new RocksDB, we recommend that
+ // you set this option to false, which brings two benefits:
+ // 1. No extra random write for global_seqno during ingestion.
+ // 2. Because the external SST file is not modified, whole-file checksum
+ //    verification remains possible.
+ // We plan to make false the default in the future.
+ bool write_global_seqno = true;
+ // Set to true if you would like to verify the checksums of each block of
+ // the external SST file before ingestion.
+ // Warning: setting this to true causes slowdown in file ingestion because
+ // the external SST file has to be read.
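+ //
+ // A hypothetical ingestion call exercising these options (an editor's
+ // sketch, not part of the original header; `db` and the file path are
+ // assumed):
+ //
+ //   IngestExternalFileOptions ifo;
+ //   ifo.move_files = true;
+ //   ifo.verify_checksums_before_ingest = true;
+ //   Status s = db->IngestExternalFile({"/data/backfill.sst"}, ifo);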
+ bool verify_checksums_before_ingest = false;
+ // When verify_checksums_before_ingest = true, RocksDB uses the default
+ // readahead setting to scan the file while verifying checksums before
+ // ingestion.
+ // Users can override the default value using this option.
+ // Using a large readahead size (> 2MB) can typically improve the
+ // performance of forward iteration on spinning disks.
+ size_t verify_checksums_readahead_size = 0;
+ // Set to true if the user wants to verify the sst file checksum of ingested
+ // files. The DB checksum function will generate the checksum of each
+ // ingested file (if file_checksum_gen_factory is set) and compare the
+ // checksum function name and checksum with the ingested checksum
+ // information.
+ //
+ // If this option is set to true: 1) if the DB does not enable checksums
+ // (file_checksum_gen_factory == nullptr), the ingested checksum information
+ // will be ignored; 2) if the DB enables the checksum function, we calculate
+ // the sst file checksum after the file is moved or copied, and compare the
+ // checksum and checksum name. If the checksum or the checksum function name
+ // does not match, ingestion fails. If the verification is successful, the
+ // checksum and checksum function name will be stored in the Manifest.
+ // If this option is set to false: 1) if the DB does not enable checksums,
+ // the ingested checksum information will be ignored; 2) if the DB enables
+ // checksums, we only verify the ingested checksum function name and we
+ // trust the ingested checksum. If the checksum function name matches, we
+ // store the checksum in the Manifest. The DB does not calculate the
+ // checksum during ingestion. However, if no checksum information is
+ // provided with the ingested files, the DB will generate the checksum and
+ // store it in the Manifest.
+ bool verify_file_checksum = true;
+ // Set to true if the user wants the file to be ingested into the bottommost
+ // level. An error of Status::TryAgain() will be returned if a file cannot
+ // fit in the bottommost level when calling
+ // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+ // the bottommost level in the overlapping range before re-attempting.
+ //
+ // ingest_behind takes precedence over fail_if_not_bottommost_level.
+ bool fail_if_not_bottommost_level = false;
+};
+
+enum TraceFilterType : uint64_t {
+ // Trace all the operations
+ kTraceFilterNone = 0x0,
+ // Do not trace the get operations
+ kTraceFilterGet = 0x1 << 0,
+ // Do not trace the write operations
+ kTraceFilterWrite = 0x1 << 1,
+ // Do not trace the `Iterator::Seek()` operations
+ kTraceFilterIteratorSeek = 0x1 << 2,
+ // Do not trace the `Iterator::SeekForPrev()` operations
+ kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+ // Do not trace the `MultiGet()` operations
+ kTraceFilterMultiGet = 0x1 << 4,
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+ // To prevent the trace file from growing larger than the available storage
+ // space, the user can set the maximum trace file size in bytes. Default is
+ // 64GB.
+ uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+ // Specify the trace sampling rate, i.e. capture one out of every N
+ // requests. Defaults to 1 (capture every request).
+ uint64_t sampling_frequency = 1;
+ // Note: The filtering happens before sampling.
+ uint64_t filter = kTraceFilterNone;
+ // When true, the order of write records in the trace will match the order
+ // in which the corresponding writes were recorded in the WAL and applied to
+ // the DB. There may be a performance penalty associated with preserving
+ // this ordering.
+ //
+ // Default: false. This means write records in the trace may be in an order
+ // different from the WAL's order.
+ bool preserve_write_order = false;
+};
+
+// ImportColumnFamilyOptions is used by ImportColumnFamily()
+struct ImportColumnFamilyOptions {
+ // Can be set to true to move the files instead of copying them.
+ bool move_files = false;
+};
+
+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+ // Defines whether the returned size should include the recently written
+ // data in the memtables. If set to false, include_files must be true.
+ bool include_memtables = false;
+ // Defines whether the returned size should include data serialized to disk.
+ // If set to false, include_memtables must be true.
+ bool include_files = true;
+ // When approximating the total size of the files used to store a key range
+ // via DB::GetApproximateSizes, allow an approximation error margin of up to
+ // total_files_size * files_size_error_margin. This allows taking some
+ // shortcuts in the file size approximation, resulting in better
+ // performance, while guaranteeing that the resulting error stays within a
+ // reasonable margin.
+ // E.g., if the value is 0.1, then the error margin of the returned file
+ // size approximation will be within 10%.
+ // If the value is non-positive, a more precise but more CPU-intensive
+ // estimation is performed.
+ double files_size_error_margin = -1.0;
+};
+
+struct CompactionServiceOptionsOverride {
+ // Currently, pointer configurations are not passed to the compaction
+ // service, so the user needs to set them here. This will be removed once
+ // pointer configuration passing is supported.
+ Env* env = Env::Default();
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ const Comparator* comparator = BytewiseComparator();
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+ const CompactionFilter* compaction_filter = nullptr;
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+ std::shared_ptr<TableFactory> table_factory;
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Only a subset of events is triggered in the remote compaction worker,
+ // such as `OnTableFileCreated`, `OnTableFileCreationStarted`,
+ // `ShouldBeNotifiedOnFileIO`, `OnSubcompactionBegin`, and
+ // `OnSubcompactionCompleted`. Note that `OnCompactionBegin` and
+ // `OnCompactionCompleted` won't be triggered there; they are triggered on
+ // the primary DB side.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // `statistics` is used to collect DB operation metrics. The metrics are not
+ // returned to the CompactionService primary host; to collect them, the user
+ // needs to set this field here.
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // Only compaction-generated SST files use these user-defined table
+ // properties collectors.
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+     table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+ // Allows cancellation of an in-progress compaction.
+ std::atomic<bool>* canceled = nullptr;
+};
+
+#ifndef ROCKSDB_LITE
+struct LiveFilesStorageInfoOptions {
+ // Whether to populate FileStorageInfo::file_checksum* or leave blank
+ bool include_checksum_info = false;
+ // Flushes memtables if the total size in bytes of live WAL files is >= this
+ // number (and the DB is not read-only).
+ // Default: always force a flush without checking sizes.
+ uint64_t wal_size_for_flush = 0;
+};
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
new file mode 100644
index 000000000..cd1dd99f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A thread-local context for gathering performance counters efficiently
+// and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+// Break down performance counters by level and store per-level perf context
+// in PerfContextByLevel
+struct PerfContextByLevel {
+ // # of times bloom filter has avoided file reads, i.e., negatives.
+ uint64_t bloom_filter_useful = 0;
+ // # of times bloom FullFilter has not avoided the reads.
+ uint64_t bloom_filter_full_positive = 0;
+ // # of times bloom FullFilter has not avoided the reads and data actually
+ // exist.
+ uint64_t bloom_filter_full_true_positive = 0;
+
+ // total number of user keys returned (only includes keys that are found;
+ // does not include keys that are deleted or merged without a final put)
+ uint64_t user_key_return_count = 0;
+
+ // total nanos spent on reading data from SST files
+ uint64_t get_from_table_nanos = 0;
+
+ uint64_t block_cache_hit_count = 0;   // total number of block cache hits
+ uint64_t block_cache_miss_count = 0;  // total number of block cache misses
+
+ void Reset();  // reset all performance counters to zero
+};
+
+struct PerfContext {
+ ~PerfContext();
+
+ PerfContext() {}
+
+ PerfContext(const PerfContext&);
+ PerfContext& operator=(const PerfContext&);
+ PerfContext(PerfContext&&) noexcept;
+
+ void Reset();  // reset all performance counters to zero
+
+ std::string ToString(bool exclude_zero_counters = false) const;
+
+ // enable per level perf context and allocate storage for PerfContextByLevel
+ void EnablePerLevelPerfContext();
+
+ // temporarily disable per level perf context by setting the flag to false
+ void DisablePerLevelPerfContext();
+
+ // free the space for PerfContextByLevel, also disable per level perf
+ // context
+ void ClearPerLevelPerfContext();
+
+ uint64_t user_key_comparison_count;  // total number of user key comparisons
+ uint64_t block_cache_hit_count;      // total number of block cache hits
+ uint64_t block_read_count;           // total number of block reads (with IO)
+ uint64_t block_read_byte;            // total number of bytes from block reads
+ uint64_t block_read_time;            // total nanos spent on block reads
+ uint64_t block_cache_index_hit_count;  // total number of index block hits
+ // total number of standalone handles looked up from the secondary cache
+ uint64_t block_cache_standalone_handle_count;
+ // total number of real handles looked up from the secondary cache that are
+ // inserted into the primary cache
+ uint64_t block_cache_real_handle_count;
+ uint64_t index_block_read_count;        // total number of index block reads
+ uint64_t block_cache_filter_hit_count;  // total number of filter block hits
+ uint64_t filter_block_read_count;       // total number of filter block reads
+ // total number of compression dictionary block reads
+ uint64_t compression_dict_block_read_count;
+
+ uint64_t secondary_cache_hit_count;  // total number of secondary cache hits
+ // total number of real handles inserted into the secondary cache
+ uint64_t compressed_sec_cache_insert_real_count;
+ // total number of dummy handles inserted into the secondary cache
+ uint64_t compressed_sec_cache_insert_dummy_count;
+ // bytes for vals before compression in the secondary cache
+ uint64_t compressed_sec_cache_uncompressed_bytes;
+ // bytes for vals after compression in the secondary cache
+ uint64_t compressed_sec_cache_compressed_bytes;
+
+ uint64_t block_checksum_time;    // total nanos spent on block checksum
+ uint64_t block_decompress_time;  // total nanos spent on block decompression
+
+ uint64_t get_read_bytes;       // bytes for vals returned by Get
+ uint64_t multiget_read_bytes;  // bytes for vals returned by MultiGet
+ uint64_t iter_read_bytes;      // bytes for keys/vals decoded by iterator
+
+ uint64_t blob_cache_hit_count;  // total number of blob cache hits
+ uint64_t blob_read_count;       // total number of blob reads (with IO)
+ uint64_t blob_read_byte;        // total number of bytes from blob reads
+ uint64_t blob_read_time;        // total nanos spent on blob reads
+ uint64_t blob_checksum_time;    // total nanos spent on blob checksum
+ uint64_t blob_decompress_time;  // total nanos spent on blob decompression
+
+ // total number of internal keys skipped over during iteration.
+ // There are several reasons for it:
+ // 1. when calling Next(), the iterator is at the position of the previous
+ //    key, so we'll need to skip it. This means the counter will always be
+ //    incremented in Next().
+ // 2. when calling Next(), we need to skip internal entries for the previous
+ //    keys that are overwritten.
+ // 3. when calling Next(), Seek() or SeekToFirst(), there may be one or more
+ //    deleted keys between the starting position (the previous key for
+ //    Next(), the seek key for Seek(), or the beginning for SeekToFirst())
+ //    and the next valid key that the operation should place the iterator
+ //    at. We need to skip both the tombstones and the updates hidden by the
+ //    tombstones. The tombstones are not included in this counter, while
+ //    previous updates hidden by the tombstones will be included here.
+ // 4. symmetric cases for Prev() and SeekToLast()
+ // internal_recent_skipped_count is not included in this counter.
+ //
+ uint64_t internal_key_skipped_count;
+ // Total number of deletes and single deletes skipped over during iteration.
+ // When calling Next(), Seek() or SeekToFirst(), there may be one or more
+ // deleted keys between the starting position (the previous position for
+ // Next(), the seek key for Seek(), or the beginning for SeekToFirst()) and
+ // the next valid key. Every deleted key is counted once. We don't recount
+ // here if there are still older updates invalidated by the tombstones.
+ //
+ uint64_t internal_delete_skipped_count;
+ // How many times iterators skipped over internal keys that are more recent
+ // than the snapshot that the iterator is using.
+ //
+ uint64_t internal_recent_skipped_count;
+ // How many values were fed into the merge operator by iterators.
+ //
+ uint64_t internal_merge_count;
+ // Number of times we reseeked inside a merging iterator, specifically to
+ // skip after or before a range of keys covered by a range deletion in a
+ // newer LSM component.
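+ //
+ // A hypothetical way to sample this counter around an iterator workload
+ // (an editor's sketch, not part of the original header):
+ //
+ //   SetPerfLevel(PerfLevel::kEnableCount);
+ //   get_perf_context()->Reset();
+ //   ... run the iterator workload ...
+ //   uint64_t reseeks =
+ //       get_perf_context()->internal_range_del_reseek_count;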
+ uint64_t internal_range_del_reseek_count;
+
+ uint64_t get_snapshot_time;        // total nanos spent on getting snapshot
+ uint64_t get_from_memtable_time;   // total nanos spent on querying memtables
+ uint64_t get_from_memtable_count;  // number of mem tables queried
+ // total nanos spent after Get() finds a key
+ uint64_t get_post_process_time;
+ uint64_t get_from_output_files_time;  // total nanos reading from output files
+ // total nanos spent on seeking memtable
+ uint64_t seek_on_memtable_time;
+ // number of seeks issued on memtable
+ // (including SeekForPrev but not SeekToFirst and SeekToLast)
+ uint64_t seek_on_memtable_count;
+ // number of Next()s issued on memtable
+ uint64_t next_on_memtable_count;
+ // number of Prev()s issued on memtable
+ uint64_t prev_on_memtable_count;
+ // total nanos spent on seeking child iters
+ uint64_t seek_child_seek_time;
+ // number of seeks issued in child iterators
+ uint64_t seek_child_seek_count;
+ uint64_t seek_min_heap_time;  // total nanos spent on the merge min heap
+ uint64_t seek_max_heap_time;  // total nanos spent on the merge max heap
+ // total nanos spent on seeking the internal entries
+ uint64_t seek_internal_seek_time;
+ // total nanos spent on iterating internal entries to find the next user
+ // entry
+ uint64_t find_next_user_entry_time;
+
+ // This group of stats provides a breakdown of time spent by Write().
+ // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
+ // are enabled.
+ //
+ // total nanos spent on writing to WAL
+ uint64_t write_wal_time;
+ // total nanos spent on writing to mem tables
+ uint64_t write_memtable_time;
+ // total nanos spent on delaying or throttling write
+ uint64_t write_delay_time;
+ // total nanos spent on switching memtable/wal and scheduling
+ // flushes/compactions.
+ uint64_t write_scheduling_flushes_compactions_time;
+ // total nanos spent on writing a record, excluding the above four things
+ uint64_t write_pre_and_post_process_time;
+
+ // time spent waiting for other threads of the batch group
+ uint64_t write_thread_wait_nanos;
+
+ // time spent on acquiring DB mutex.
+ uint64_t db_mutex_lock_nanos;
+ // Time spent on waiting with a condition variable created with DB mutex.
+ uint64_t db_condition_wait_nanos;
+ // Time spent on merge operator.
+ uint64_t merge_operator_time_nanos;
+
+ // Time spent on reading index block from block cache or SST file
+ uint64_t read_index_block_nanos;
+ // Time spent on reading filter block from block cache or SST file
+ uint64_t read_filter_block_nanos;
+ // Time spent on creating data block iterator
+ uint64_t new_table_block_iter_nanos;
+ // Time spent on creating an iterator of an SST file.
+ uint64_t new_table_iterator_nanos;
+ // Time spent on seeking a key in data/index blocks
+ uint64_t block_seek_nanos;
+ // Time spent on finding or creating a table reader
+ uint64_t find_table_nanos;
+ // total number of mem table bloom hits
+ uint64_t bloom_memtable_hit_count;
+ // total number of mem table bloom misses
+ uint64_t bloom_memtable_miss_count;
+ // total number of SST table bloom hits
+ uint64_t bloom_sst_hit_count;
+ // total number of SST table bloom misses
+ uint64_t bloom_sst_miss_count;
+
+ // Time spent waiting on key locks in transaction lock manager.
+ uint64_t key_lock_wait_time;
+ // number of times acquiring a lock was blocked by another transaction.
+ uint64_t key_lock_wait_count;
+
+ // Total time spent in Env filesystem operations. These are only populated
+ // when TimedEnv is used.
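+ //
+ // A hypothetical sketch of reading these counters (an editor's example,
+ // not part of the original header; assumes the DB was opened with an Env
+ // wrapped by NewTimedEnv()):
+ //
+ //   SetPerfLevel(PerfLevel::kEnableTime);
+ //   get_perf_context()->Reset();
+ //   ... perform DB operations that touch the filesystem ...
+ //   uint64_t nanos = get_perf_context()->env_new_writable_file_nanos;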
+ uint64_t env_new_sequential_file_nanos;
+ uint64_t env_new_random_access_file_nanos;
+ uint64_t env_new_writable_file_nanos;
+ uint64_t env_reuse_writable_file_nanos;
+ uint64_t env_new_random_rw_file_nanos;
+ uint64_t env_new_directory_nanos;
+ uint64_t env_file_exists_nanos;
+ uint64_t env_get_children_nanos;
+ uint64_t env_get_children_file_attributes_nanos;
+ uint64_t env_delete_file_nanos;
+ uint64_t env_create_dir_nanos;
+ uint64_t env_create_dir_if_missing_nanos;
+ uint64_t env_delete_dir_nanos;
+ uint64_t env_get_file_size_nanos;
+ uint64_t env_get_file_modification_time_nanos;
+ uint64_t env_rename_file_nanos;
+ uint64_t env_link_file_nanos;
+ uint64_t env_lock_file_nanos;
+ uint64_t env_unlock_file_nanos;
+ uint64_t env_new_logger_nanos;
+
+ uint64_t get_cpu_nanos;
+ uint64_t iter_next_cpu_nanos;
+ uint64_t iter_prev_cpu_nanos;
+ uint64_t iter_seek_cpu_nanos;
+
+ // Time spent in encrypting data. Populated when EncryptedEnv is used.
+ uint64_t encrypt_data_nanos;
+ // Time spent in decrypting data. Populated when EncryptedEnv is used.
+ uint64_t decrypt_data_nanos;
+
+ uint64_t number_async_seek;
+
+ std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
+ bool per_level_perf_context_enabled = false;
+};
+
+// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global,
+// non-thread-local PerfContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be a no-op.
+// Otherwise,
+// a) if thread-local is supported on the platform, then a pointer to
+//    a thread-local PerfContext object will be returned.
+// b) if thread-local is NOT supported, then compilation will fail.
+//
+// This function never returns nullptr.
+PerfContext* get_perf_context();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h
new file mode 100644
index 000000000..e7dded0e3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_level.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// How much perf stats to collect. Affects perf_context and iostats_context.
+enum PerfLevel : unsigned char {
+ kUninitialized = 0,             // unknown setting
+ kDisable = 1,                   // disable perf stats
+ kEnableCount = 2,               // enable only count stats
+ kEnableTimeExceptForMutex = 3,  // Other than count stats, also enable time
+                                 // stats except for mutexes
+ // Other than time, also measure CPU time counters. Still don't measure
+ // time (neither wall time nor CPU time) for mutexes.
+ kEnableTimeAndCPUTimeExceptForMutex = 4,
+ kEnableTime = 5,  // enable count and time stats
+ kOutOfBounds = 6  // N.B. Must always be the last value!
+};
+
+// set the perf stats level for the current thread
+void SetPerfLevel(PerfLevel level);
+
+// get the current perf stats level for the current thread
+PerfLevel GetPerfLevel();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h
new file mode 100644
index 000000000..f14f01999
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/persistent_cache.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCache
+//
+// Persistent cache interface for caching IO pages on a persistent medium.
+// The cache interface is specifically designed for a persistent read cache.
+class PersistentCache {
+ public:
+  using StatsType = std::vector<std::map<std::string, double>>;
+
+  virtual ~PersistentCache() {}
+
+  // Insert into the page cache
+  //
+  // key   Identifier to identify a page uniquely across restarts
+  // data  Page data to copy (caller retains ownership)
+  // size  Size of the page
+  virtual Status Insert(const Slice& key, const char* data,
+                        const size_t size) = 0;
+
+  // Lookup a page in the cache by its identifier
+  //
+  // key   Page identifier
+  // data  Buffer where the data should be copied
+  // size  Size of the page
+  virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                        size_t* size) = 0;
+
+  // True if the cache is configured to store serialized blocks, which are
+  // potentially compressed and include a trailer (when SST format calls for
+  // one). False if the cache stores uncompressed blocks (no trailer).
+  virtual bool IsCompressed() = 0;
+
+  // Return stats as a map of {string, double} per tier
+  //
+  // A persistent cache can be initialized as a tier of caches. The stats are
+  // reported per tier, top-down.
+  virtual StatsType Stats() = 0;
+
+  virtual std::string GetPrintableOptions() const = 0;
+
+  // Return a new numeric id. May be used by multiple clients who are
+  // sharding the same persistent cache to partition the key space. Typically
+  // the client will allocate a new id at startup and prepend the id to its
+  // cache keys.
+  virtual uint64_t NewId() = 0;
+};
+
+// Factory method to create a new persistent cache
+Status NewPersistentCache(Env* const env, const std::string& path,
+                          const uint64_t size,
+                          const std::shared_ptr<Logger>& log,
+                          const bool optimized_for_nvm,
+                          std::shared_ptr<PersistentCache>* cache);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
new file mode 100644
index 000000000..9cad6edf4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined
+// behavior including data loss, unreported corruption, deadlocks, and more.
+class RateLimiter {
+ public:
+  enum class OpType {
+    kRead,
+    kWrite,
+  };
+
+  enum class Mode {
+    kReadsOnly,
+    kWritesOnly,
+    kAllIo,
+  };
+
+  // For API compatibility, default to rate-limiting writes only.
+  explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {}
+
+  virtual ~RateLimiter() {}
+
+  // This API allows the user to dynamically change the rate limiter's bytes
+  // per second.
+  // REQUIRED: bytes_per_second > 0
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+  // Deprecated. New RateLimiter derived classes should override
+  // Request(const int64_t, const Env::IOPriority, Statistics*) or
+  // Request(const int64_t, const Env::IOPriority, Statistics*, OpType)
+  // instead.
+  //
+  // Request a token for bytes. If this request can not be satisfied, the
+  // call is blocked. The caller is responsible for making sure
+  // bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
+    assert(false);
+  }
+
+  // Request a token for bytes and potentially update statistics. If this
+  // request can not be satisfied, the call is blocked. The caller is
+  // responsible for making sure bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* /* stats */) {
+    // For API compatibility, the default implementation calls the older API
+    // in which statistics are unsupported.
+    Request(bytes, pri);
+  }
+
+  // Requests a token to read or write bytes and potentially updates
+  // statistics.
+  //
+  // If this request can not be satisfied, the call is blocked. The caller is
+  // responsible for making sure bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats, OpType op_type) {
+    if (IsRateLimited(op_type)) {
+      Request(bytes, pri, stats);
+    }
+  }
+
+  // Requests a token to read or write bytes and potentially updates
+  // statistics. Takes into account GetSingleBurstBytes() and alignment
+  // (e.g., in case of direct I/O) to allocate an appropriate number of
+  // bytes, which may be less than the number of bytes requested.
+  virtual size_t RequestToken(size_t bytes, size_t alignment,
+                              Env::IOPriority io_priority, Statistics* stats,
+                              RateLimiter::OpType op_type);
+
+  // Max bytes that can be granted in a single burst
+  virtual int64_t GetSingleBurstBytes() const = 0;
+
+  // Total bytes that went through the rate limiter
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+  // Total # of requests that went through the rate limiter
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+  // Total # of requests that are pending for bytes in the rate limiter.
+  // For convenience, this function is supported by the RateLimiter returned
+  // by NewGenericRateLimiter but is not required by RocksDB.
+  //
+  // REQUIRED: total_pending_requests != nullptr
+  virtual Status GetTotalPendingRequests(
+      int64_t* total_pending_requests,
+      const Env::IOPriority pri = Env::IO_TOTAL) const {
+    assert(total_pending_requests != nullptr);
+    (void)total_pending_requests;
+    (void)pri;
+    return Status::NotSupported();
+  }
+
+  virtual int64_t GetBytesPerSecond() const = 0;
+
+  virtual bool IsRateLimited(OpType op_type) {
+    if ((mode_ == RateLimiter::Mode::kWritesOnly &&
+         op_type == RateLimiter::OpType::kRead) ||
+        (mode_ == RateLimiter::Mode::kReadsOnly &&
+         op_type == RateLimiter::OpType::kWrite)) {
+      return false;
+    }
+    return true;
+  }
+
+ protected:
+  Mode GetMode() { return mode_; }
+
+ private:
+  const Mode mode_;
+};
+
+// Create a RateLimiter object, which can be shared among RocksDB instances
+// to control the write rate of flush and compaction.
+// @rate_bytes_per_sec: this is the only parameter you want to set most of
+// the time. It controls the total write rate of compaction and flush in
+// bytes per second. Currently, RocksDB does not enforce a rate limit for
+// anything other than flush and compaction, e.g. writes to the WAL.
+// @refill_period_us: this controls how often tokens are refilled. For
+// example, when rate_bytes_per_sec is set to 10MB/s and refill_period_us is
+// set to 100ms, then 1MB is refilled every 100ms internally. A larger value
+// can lead to burstier writes while a smaller value introduces more CPU
+// overhead. The default should work for most cases.
+// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
+// A low-pri request is usually blocked in favor of a high-pri request.
+// Currently, RocksDB assigns low-pri to requests from compaction and
+// high-pri to requests from flush. Low-pri requests can get blocked if flush
+// requests come in continuously. This fairness parameter grants low-pri
+// requests permission by a 1/fairness chance even though high-pri requests
+// exist, to avoid starvation. Leaving it at the default of 10 is usually
+// fine.
+// @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of the rate limit within the range
+//              `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+//              the recent demand for background I/O.
+extern RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
+    int32_t fairness = 10,
+    RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+    bool auto_tuned = false);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
new file mode 100644
index 000000000..a339ec2aa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// For testing purposes
+#if ROCKSDB_NAMESPACE == 42
+#undef ROCKSDB_NAMESPACE
+#endif
+
+// Normal logic
+#ifndef ROCKSDB_NAMESPACE
+#define ROCKSDB_NAMESPACE rocksdb
+#endif
diff --git a/src/rocksdb/include/rocksdb/secondary_cache.h b/src/rocksdb/include/rocksdb/secondary_cache.h
new file mode 100644
index 000000000..a6a8c8b1d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/secondary_cache.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2021, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A handle for a lookup result. The handle may not be immediately ready or
+// have a valid value. The caller must call IsReady() to determine if it is
+// ready, and call Wait() in order to block until it becomes ready.
+// The caller must call Value() after it becomes ready to determine if the
+// handle successfully read the item.
+class SecondaryCacheResultHandle {
+ public:
+  virtual ~SecondaryCacheResultHandle() = default;
+
+  // Returns whether the handle is ready or not
+  virtual bool IsReady() = 0;
+
+  // Block until the handle becomes ready
+  virtual void Wait() = 0;
+
+  // Return the value. If nullptr, it means the lookup was unsuccessful
+  virtual void* Value() = 0;
+
+  // Return the size of the value
+  virtual size_t Size() = 0;
+};
+
+// SecondaryCache
+//
+// Cache interface for caching blocks on a secondary tier (which can include
+// non-volatile media, or alternate forms of caching such as compressed data)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SecondaryCache : public Customizable {
+ public:
+  ~SecondaryCache() override = default;
+
+  static const char* Type() { return "SecondaryCache"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<SecondaryCache>* result);
+
+  // Insert the given value into this cache. Ownership of `value` is
+  // transferred to the callee, who is responsible for deleting the value
+  // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(),
+  // the callee is responsible for such cleanup even in case of non-OK
+  // Status.
+  // Typically, the value is not saved directly but the implementation
+  // uses the SaveToCallback provided by helper to extract the value's
+  // persistable data (typically an uncompressed block), which will be
+  // written to this tier. The implementation may or may not write it to the
+  // cache depending on the admission control policy, even if the return
+  // status is success (OK).
+  //
+  // If the implementation is asynchronous or otherwise uses `value` after
+  // the call returns, then InsertSaved() must be overridden not to rely on
+  // Insert(). For example, there could be a "holding area" in memory where
+  // Lookup() might return the same parsed value back. But more typically, if
+  // the implementation only uses `value` for getting persistable data during
+  // the call, then the default implementation of `InsertSaved()` suffices.
+  virtual Status Insert(const Slice& key, void* value,
+                        const Cache::CacheItemHelper* helper) = 0;
+
+  // Insert a value from its saved/persistable data (typically an
+  // uncompressed block), as if generated by SaveToCallback/SizeCallback.
+  // This can be used in "warming up" the cache from some auxiliary source,
+  // and like Insert() may or may not write it to the cache depending on the
+  // admission control policy, even if the return status is success.
+ // + // The default implementation assumes synchronous, non-escaping Insert(), + // wherein `value` is not used after return of Insert(). See Insert(). + virtual Status InsertSaved(const Slice& key, const Slice& saved); + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready. + // + // advise_erase is a hint from the primary cache indicating that the handle + // will be cached there, so the secondary cache is advised to drop it from + // the cache as an optimization. To use this feature, SupportForceErase() + // needs to return true. + // This hint can also be safely ignored. + // + // is_in_sec_cache is to indicate whether the handle is possibly erased + // from the secondary cache after the Lookup. + virtual std::unique_ptr<SecondaryCacheResultHandle> Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait, + bool advise_erase, bool& is_in_sec_cache) = 0; + + // Indicate whether a handle can be erased in this secondary cache. + [[nodiscard]] virtual bool SupportForceErase() const = 0; + + // At the discretion of the implementation, erase the data associated + // with key. + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. + virtual void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) = 0; + + // Set the maximum configured capacity of the cache. + // When the new capacity is less than the old capacity and the existing usage + // is greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage. + // + // The derived class can make this function no-op and return NotSupported(). + virtual Status SetCapacity(size_t /* capacity */) { + return Status::NotSupported(); + } + + // The derived class can make this function no-op and return NotSupported(). + virtual Status GetCapacity(size_t& /* capacity */) { + return Status::NotSupported(); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h new file mode 100644 index 000000000..0d7eb5949 --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice.h @@ -0,0 +1,264 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. 
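+//
+// A hypothetical illustration of the lifetime rule above (an editor's
+// sketch, not part of the original header):
+//
+//   rocksdb::Slice slice;
+//   {
+//     std::string s = "key1";
+//     slice = rocksdb::Slice(s);  // slice points into s's buffer
+//   }
+//   // s has been deallocated; using slice here is undefined behavior.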
+ +#pragma once + +#include <cassert> +#include <cstddef> +#include <cstdio> +#include <cstring> +#include <string> +#include <string_view> // RocksDB now requires C++17 support + +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) {} + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) {} + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) {} + + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(const std::string_view& sv) : data_(sv.data()), size_(sv.size()) {} + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); } + + // Create a single slice from SliceParts using buf as storage. + // buf must exist as long as the returned Slice exists. + Slice(const struct SliceParts& parts, std::string* buf); + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { + data_ = ""; + size_ = 0; + } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + // when hex is true, returns a string of twice the length hex encoded (0-9A-F) + std::string ToString(bool hex = false) const; + + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } + + // Decodes the current slice interpreted as an hexadecimal string into result, + // if successful returns true, if this isn't a valid hex string + // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. + // This slice is expected to have an even number of 0-9A-F characters + // also accepts lowercase (a-f) + bool DecodeHex(std::string* result) const; + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0)); + } + + bool ends_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); + } + + // Compare two slices and returns the first byte where they differ + size_t difference_offset(const Slice& b) const; + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +/** + * A Slice that can be pinned with some cleanup tasks, which will be run upon + * ::Reset() or object destruction, whichever is invoked first. 
This can be used + * to avoid memcpy by having the PinnableSlice object referring to the data + * that is locked in the memory and release them after the data is consumed. + */ +class PinnableSlice : public Slice, public Cleanable { + public: + PinnableSlice() { buf_ = &self_space_; } + explicit PinnableSlice(std::string* buf) { buf_ = buf; } + + PinnableSlice(PinnableSlice&& other); + PinnableSlice& operator=(PinnableSlice&& other); + + // No copy constructor and copy assignment allowed. + PinnableSlice(PinnableSlice&) = delete; + PinnableSlice& operator=(PinnableSlice&) = delete; + + inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, + void* arg2) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + RegisterCleanup(f, arg1, arg2); + assert(pinned_); + } + + inline void PinSlice(const Slice& s, Cleanable* cleanable) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + if (cleanable != nullptr) { + cleanable->DelegateCleanupsTo(this); + } + assert(pinned_); + } + + inline void PinSelf(const Slice& slice) { + assert(!pinned_); + buf_->assign(slice.data(), slice.size()); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + inline void PinSelf() { + assert(!pinned_); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + void remove_suffix(size_t n) { + assert(n <= size()); + if (pinned_) { + size_ -= n; + } else { + buf_->erase(size() - n, n); + PinSelf(); + } + } + + void remove_prefix(size_t n) { + assert(n <= size()); + if (pinned_) { + data_ += n; + size_ -= n; + } else { + buf_->erase(0, n); + PinSelf(); + } + } + + void Reset() { + Cleanable::Reset(); + pinned_ = false; + size_ = 0; + } + + inline std::string* GetSelf() { return buf_; } + + inline bool IsPinned() const { return pinned_; } + + private: + friend class PinnableSlice4Test; + std::string self_space_; + std::string* buf_; + bool pinned_ = false; +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* _parts, int _num_parts) + : parts(_parts), num_parts(_num_parts) {} + SliceParts() : parts(nullptr), num_parts(0) {} + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); } + +inline int Slice::compare(const Slice& b) const { + assert(data_ != nullptr && b.data_ != nullptr); + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) + r = -1; + else if (size_ > b.size_) + r = +1; + } + return r; +} + +inline size_t Slice::difference_offset(const Slice& b) const { + size_t off = 0; + const size_t len = (size_ < b.size_) ? size_ : b.size_; + for (; off < len; off++) { + if (data_[off] != b.data_[off]) break; + } + return off; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h new file mode 100644 index 000000000..8909b9c53 --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice_transform.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#pragma once + +#include <memory> +#include <string> + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +struct ConfigOptions; + +// A SliceTransform is a generic pluggable way of transforming one string +// to another. Its primary use-case is in configuring RocksDB prefix Bloom +// filters, by setting prefix_extractor in ColumnFamilyOptions. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SliceTransform : public Customizable { + public: + virtual ~SliceTransform(){}; + + // Return the name of this transformation. + virtual const char* Name() const override = 0; + static const char* Type() { return "SliceTransform"; } + + // Creates and configures a new SliceTransform from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr<const SliceTransform>* result); + + // Returns a string representation of this SliceTransform, representing the ID + // and any additional properties. + std::string AsString() const; + + // Extract a prefix from a specified key, partial key, iterator upper bound, + // etc. This is normally used for building and checking prefix Bloom filters + // but should accept any string for which InDomain() returns true. + // See ColumnFamilyOptions::prefix_extractor for specific properties that + // must be satisfied by prefix extractors. + virtual Slice Transform(const Slice& key) const = 0; + + // Determine whether the specified key is compatible with the logic + // specified in the Transform method. Keys for which InDomain returns + // false will not be added to or queried against prefix Bloom filters. + // + // For example, if the Transform method returns a fixed length + // prefix of size 4, then an invocation to InDomain("abc") returns + // false because the specified key length(3) is shorter than the + // prefix size of 4. + // + // Wiki documentation here: + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek + // + virtual bool InDomain(const Slice& key) const = 0; + + // DEPRECATED: This is currently not used and remains here for backward + // compatibility. + virtual bool InRange(const Slice& /*dst*/) const { return false; } + + // Returns information on maximum prefix length, if there is one. + // If Transform(x).size() == n for some keys and otherwise < n, + // should return true and set *len = n. Returning false is safe but + // currently disables some auto_prefix_mode filtering. 
+  // Specifically, if the iterate_upper_bound is the immediate successor (see
+  // Comparator::IsSameLengthImmediateSuccessor) of the seek key's prefix,
+  // we require this function to return true and iterate_upper_bound.size()
+  // == n to recognize and optimize the prefix seek.
+  // Otherwise (including when FullLengthEnabled returns false, or the prefix
+  // length is less than the maximum), Seek with auto_prefix_mode is only
+  // optimized if the iterate_upper_bound and the seek key have the same
+  // prefix.
+  // BUG: Despite all these conditions and even with the extra condition on
+  // IsSameLengthImmediateSuccessor (see its "BUG" section), it is not
+  // sufficient to ensure auto_prefix_mode returns all entries that
+  // total_order_seek would return. See the auto_prefix_mode "BUG" section.
+  virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
+
+  // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+  //
+  // This function is not used by RocksDB, but for users. If users pass
+  // Options by string to RocksDB, they might not know what prefix extractor
+  // they are using. This function helps users determine: if they want to
+  // iterate all keys prefixed by `prefix`, whether it is safe to use the
+  // prefix bloom filter and seek to key `prefix`.
+  // If this function returns true, this means a user can Seek() to a prefix
+  // using the bloom filter. Otherwise, the user needs to skip the bloom
+  // filter by setting ReadOptions.total_order_seek = true.
+  //
+  // Here is an example: Suppose we implement a slice transform that returns
+  // the first part of the string up to and including the first ",":
+  // 1. SameResultWhenAppended("abc,") should return true. If applying a
+  //    prefix bloom filter using it, all slices matching "abc,.*" will be
+  //    extracted to "abc,", so any SST file or memtable containing any of
+  //    those keys will not be filtered out.
+  // 2. SameResultWhenAppended("abc") should return false. A user will not be
+  //    guaranteed to see all the keys matching "abc.*" if a user prefix
+  //    seeks to "abc" against a DB with the same setting. If one SST file
+  //    only contains "abcd,e", the file can be filtered out and the key will
+  //    be invisible, because the prefix according to the configured
+  //    extractor is "abcd,".
+  //
+  // i.e., an implementation always returning false is safe.
+  virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
+    return false;
+  }
+};
+
+// The prefix is the first `prefix_len` bytes of the key, and keys shorter
+// than `prefix_len` are not InDomain.
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+// The prefix is the first min(length(key), `cap_len`) bytes of the key, and
+// all keys are InDomain.
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
+// Prefix is equal to key. All keys are InDomain.
+extern const SliceTransform* NewNoopTransform();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h
new file mode 100644
index 000000000..1ea56e71e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/snapshot.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once + +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +// +// To Create a Snapshot, call DB::GetSnapshot(). +// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot). +class Snapshot { + public: + virtual SequenceNumber GetSequenceNumber() const = 0; + + // Returns unix time i.e. the number of seconds since the Epoch, 1970-01-01 + // 00:00:00 (UTC). + virtual int64_t GetUnixTime() const = 0; + + virtual uint64_t GetTimestamp() const = 0; + + protected: + virtual ~Snapshot(); +}; + +// Simple RAII wrapper class for Snapshot. +// Constructing this object will create a snapshot. Destructing will +// release the snapshot. +class ManagedSnapshot { + public: + explicit ManagedSnapshot(DB* db); + + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + + ~ManagedSnapshot(); + + const Snapshot* snapshot(); + + private: + DB* db_; + const Snapshot* snapshot_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h new file mode 100644 index 000000000..9261ba47d --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE +#pragma once + +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +class SSTDumpTool { + public: + int Run(int argc, char const* const* argv, Options options = Options()); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h new file mode 100644 index 000000000..613292151 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_manager.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Logger; + +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +// SstFileManager is NOT an extensible interface but a public interface for +// result of NewSstFileManager. Any derived classes must be RocksDB internal. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature; maximum allowed + // space will be infinite (Default value). + // + // thread-safe. 
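+  //
+  // A hypothetical setup (an editor's sketch, not part of the original
+  // header; `options` is an assumed DBOptions instance and error handling
+  // is omitted):
+  //
+  //   std::shared_ptr<SstFileManager> sfm(
+  //       NewSstFileManager(Env::Default()));
+  //   sfm->SetMaxAllowedSpaceUsage(100ULL << 30);  // 100 GB budget
+  //   options.sst_file_manager = sfm;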
+  virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
+
+  // Set the amount of buffer room each compaction should be able to leave.
+  // In other words, at its maximum disk space consumption, the compaction
+  // should still leave compaction_buffer_size available on the disk so that
+  // other background functions may continue, such as logging and flushing.
+  virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
+  // Return true if the total size of SST and blob files exceeded the maximum
+  // allowed space usage.
+  //
+  // thread-safe.
+  virtual bool IsMaxAllowedSpaceReached() = 0;
+
+  // Returns true if the total size of SST and blob files as well as the
+  // estimated size of ongoing compactions exceeds the maximum allowed space
+  // usage.
+  virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
+  // Return the total size of all tracked files.
+  // thread-safe
+  virtual uint64_t GetTotalSize() = 0;
+
+  // Return a map containing all tracked files and their corresponding sizes.
+  // thread-safe
+  virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
+
+  // Return delete rate limit in bytes per second.
+  // thread-safe
+  virtual int64_t GetDeleteRateBytesPerSecond() = 0;
+
+  // Update the delete rate limit in bytes per second.
+  // Zero means disable delete rate limiting and delete files immediately.
+  // thread-safe
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+  // Return trash/DB size ratio where new files will be deleted immediately.
+  // thread-safe
+  virtual double GetMaxTrashDBRatio() = 0;
+
+  // Update trash/DB size ratio where new files will be deleted immediately.
+  // thread-safe
+  virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+  // Return the total size of trash files.
+  // thread-safe
+  virtual uint64_t GetTotalTrashSize() = 0;
+
+  // Set the statistics ptr to dump the stat information
+  virtual void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) = 0;
+};
+
+// Create a new SstFileManager that can be shared among multiple RocksDB
+// instances to track SST and blob files and control their deletion rate.
+// Although SstFileManager doesn't track WAL files, it still controls their
+// deletion rate.
+//
+// @param env: Pointer to Env object, please see "rocksdb/env.h".
+// @param fs: Pointer to FileSystem object (see "rocksdb/file_system.h").
+// @param info_log: If not nullptr, info_log will be used to log errors.
+//
+// == Deletion rate limiting specific arguments ==
+// @param trash_dir: Deprecated, this argument has no effect.
+// @param rate_bytes_per_sec: How many bytes should be deleted per second. If
+//    this value is set to 1024 (1 KB/sec) and we delete a file of size 4 KB
+//    in 1 second, we will wait for another 3 seconds before we delete other
+//    files. Set to 0 to disable deletion rate limiting.
+//    This option also affects the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument has no effect, but
+//    if the user provides trash_dir we will schedule deletes for files in
+//    the dir.
+// @param status: If not nullptr, status will contain any errors that happened
+//    while creating the missing trash_dir or deleting existing files in the
+//    trash.
+// @param max_trash_db_ratio: If the trash size constitutes more than this
+//    fraction of the total DB size we will start deleting new files passed
+//    to DeleteScheduler immediately.
+// @param bytes_max_delete_chunk: If a file to delete is larger than the
+//    delete chunk, ftruncate the file by this size each time, rather than
+//    dropping the whole file. 0 means to always delete the whole file. If
+//    the file has more than one linked name, the file will be deleted as a
+//    whole. Either way, `rate_bytes_per_sec` will still be respected. NOTE
+//    that with this option, files already renamed as trash may be partial,
+//    so users should not directly recover them without checking.
extern SstFileManager* NewSstFileManager(
+    Env* env, std::shared_ptr<FileSystem> fs,
+    std::shared_ptr<Logger> info_log = nullptr,
+    const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0,
+    bool delete_existing_trash = true, Status* status = nullptr,
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+// Same as above, but takes a pointer to a legacy Env object, instead of
+// Env and FileSystem objects.
+extern SstFileManager* NewSstFileManager(
+    Env* env, std::shared_ptr<Logger> info_log = nullptr,
+    std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
+    bool delete_existing_trash = true, Status* status = nullptr,
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h
new file mode 100644
index 000000000..4b8642480
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_reader.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SstFileReader is used to read sst files that are generated by a DB or
+// SstFileWriter.
+class SstFileReader {
+ public:
+  SstFileReader(const Options& options);
+
+  ~SstFileReader();
+
+  // Prepares to read from the file located at "file_path".
+  Status Open(const std::string& file_path);
+
+  // Returns a new iterator over the table contents.
+  // Most read options provide the same control as reads from a DB.
+  // If "snapshot" is nullptr, the iterator returns only the latest keys.
+  Iterator* NewIterator(const ReadOptions& options);
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const;
+
+  // Verifies whether there is corruption in this table.
+  Status VerifyChecksum(const ReadOptions& /*read_options*/);
+
+  Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+ private:
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h
new file mode 100644
index 000000000..c85f097a5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_writer.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+
+// ExternalSstFileInfo includes information about sst files created
+// using SstFileWriter.
+struct ExternalSstFileInfo {
+  ExternalSstFileInfo()
+      : file_path(""),
+        smallest_key(""),
+        largest_key(""),
+        smallest_range_del_key(""),
+        largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
+        sequence_number(0),
+        file_size(0),
+        num_entries(0),
+        num_range_del_entries(0),
+        version(0) {}
+
+  ExternalSstFileInfo(const std::string& _file_path,
+                      const std::string& _smallest_key,
+                      const std::string& _largest_key,
+                      SequenceNumber _sequence_number, uint64_t _file_size,
+                      int32_t _num_entries, int32_t _version)
+      : file_path(_file_path),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key),
+        smallest_range_del_key(""),
+        largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
+        sequence_number(_sequence_number),
+        file_size(_file_size),
+        num_entries(_num_entries),
+        num_range_del_entries(0),
+        version(_version) {}
+
+  std::string file_path;     // external sst file path
+  std::string smallest_key;  // smallest user key in file
+  std::string largest_key;   // largest user key in file
+  std::string
+      smallest_range_del_key;  // smallest range deletion user key in file
+  std::string largest_range_del_key;  // largest range deletion user key in file
+  std::string file_checksum;          // sst file checksum
+  std::string file_checksum_func_name;  // the name of the file checksum function
+  SequenceNumber sequence_number;  // sequence number of all keys in file
+  uint64_t file_size;              // file size in bytes
+  uint64_t num_entries;            // number of entries in file
+  uint64_t num_range_del_entries;  // number of range deletion entries in file
+  int32_t version;                 // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to a database
+// later. All keys in files generated by SstFileWriter will have sequence
+// number = 0.
+class SstFileWriter {
+ public:
+  // User can pass `column_family` to specify that the generated file will
+  // be ingested into this column_family; note that passing nullptr means
+  // that the column_family is unknown.
+  // If invalidate_page_cache is set to true, SstFileWriter will give the OS a
+  // hint that this file's pages are not needed every time we write 1MB to the
+  // file. To use the rate limiter, an io_priority smaller than IO_TOTAL can
+  // be passed.
+  // The `skip_filters` option is DEPRECATED and could be removed in the
+  // future. Use `BlockBasedTableOptions::filter_policy` to control filter
+  // generation.
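+  //
+  // A minimal usage sketch (illustrative only; error handling elided and the
+  // file path is a placeholder):
+  //
+  //   SstFileWriter writer(EnvOptions(), options);
+  //   Status s = writer.Open("/tmp/file1.sst");
+  //   s = writer.Put("k1", "v1");  // keys must be added in sorted order
+  //   s = writer.Finish();
+  //   // The file can then be ingested, e.g. with
+  //   // db->IngestExternalFile({"/tmp/file1.sst"},
+  //   //                        IngestExternalFileOptions());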
+ SstFileWriter(const EnvOptions& env_options, const Options& options, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false) + : SstFileWriter(env_options, options, options.comparator, column_family, + invalidate_page_cache, io_priority, skip_filters) {} + + // Deprecated API + SstFileWriter(const EnvOptions& env_options, const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false); + + ~SstFileWriter(); + + // Prepare SstFileWriter to write into file located at "file_path". + Status Open(const std::string& file_path); + + // Add a Put key with value to currently opened file (deprecated) + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); + + // Add a Put key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Put(const Slice& user_key, const Slice& value); + + // Add a Put (key with timestamp, value) to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + + // Add a Merge key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Merge(const Slice& user_key, const Slice& value); + + // Add a deletion key to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. + Status Delete(const Slice& user_key); + + // Add a deletion key with timestamp to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Delete(const Slice& user_key, const Slice& timestamp); + + // Add a range deletion tombstone to currently opened file + // REQUIRES: comparator is *not* timestamp-aware. + Status DeleteRange(const Slice& begin_key, const Slice& end_key); + + // Add a range deletion tombstone to currently opened file. + // REQUIRES: begin_key and end_key are user keys without timestamp. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status DeleteRange(const Slice& begin_key, const Slice& end_key, + const Slice& timestamp); + + // Finalize writing to sst file and close file. + // + // An optional ExternalSstFileInfo pointer can be passed to the function + // which will be populated with information about the created sst file. + Status Finish(ExternalSstFileInfo* file_info = nullptr); + + // Return the current file size. 
+  uint64_t FileSize();
+
+ private:
+  void InvalidatePageCache(bool closing);
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_partitioner.h b/src/rocksdb/include/rocksdb/sst_partitioner.h
new file mode 100644
index 000000000..3af8e9492
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_partitioner.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+enum PartitionerResult : char {
+  // Partitioner does not require a new file to be created
+  kNotRequired = 0x0,
+  // Partitioner requests that a new file be created
+  kRequired = 0x1
+  // Additional constants can be added
+};
+
+struct PartitionerRequest {
+  PartitionerRequest(const Slice& prev_user_key_,
+                     const Slice& current_user_key_,
+                     uint64_t current_output_file_size_)
+      : prev_user_key(&prev_user_key_),
+        current_user_key(&current_user_key_),
+        current_output_file_size(current_output_file_size_) {}
+  const Slice* prev_user_key;
+  const Slice* current_user_key;
+  uint64_t current_output_file_size;
+};
+
+/*
+ * An SstPartitioner is a generic, pluggable way of defining the partitioning
+ * of SST files. The compaction job will split SST files on partition
+ * boundaries to lower the write amplification when SST files are promoted to
+ * a higher level.
+ */
+class SstPartitioner {
+ public:
+  virtual ~SstPartitioner() {}
+
+  // Return the name of this partitioner.
+  virtual const char* Name() const = 0;
+
+  // Called for all keys in a compaction. When the partitioner wants to create
+  // a new SST file it returns kRequired: the compaction job will then finish
+  // the current SST file, whose last key is the "prev_user_key" parameter,
+  // and start a new SST file whose first key is "current_user_key". Returns
+  // the decision of whether a partition boundary was detected and compaction
+  // should create a new file.
+  virtual PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) = 0;
+
+  // Called with the smallest and largest keys in an SST file when compaction
+  // tries to do a trivial move. Returns true if the partitioner allows the
+  // trivial move.
+  virtual bool CanDoTrivialMove(const Slice& smallest_user_key,
+                                const Slice& largest_user_key) = 0;
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+    // Output level for this compaction
+    int output_level;
+    // Smallest key for compaction
+    Slice smallest_user_key;
+    // Largest key for compaction
+    Slice largest_user_key;
+  };
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
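+//
+// As an illustrative sketch (not a requirement of this API), a factory is
+// typically wired up through ColumnFamilyOptions, e.g. with the fixed-prefix
+// factory declared below; the prefix length 4 is an arbitrary example:
+//
+//   options.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);
+//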
+class SstPartitionerFactory : public Customizable { + public: + ~SstPartitionerFactory() override {} + static const char* Type() { return "SstPartitionerFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr<SstPartitionerFactory>* result); + + virtual std::unique_ptr<SstPartitioner> CreatePartitioner( + const SstPartitioner::Context& context) const = 0; + + // Returns a name that identifies this partitioner factory. + const char* Name() const override = 0; +}; + +/* + * Fixed key prefix partitioner. It splits the output SST files when prefix + * defined by size changes. + */ +class SstPartitionerFixedPrefix : public SstPartitioner { + public: + explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {} + + virtual ~SstPartitionerFixedPrefix() override {} + + const char* Name() const override { return "SstPartitionerFixedPrefix"; } + + PartitionerResult ShouldPartition(const PartitionerRequest& request) override; + + bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) override; + + private: + size_t len_; +}; + +/* + * Factory for fixed prefix partitioner. + */ +class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory { + public: + explicit SstPartitionerFixedPrefixFactory(size_t len); + + ~SstPartitionerFixedPrefixFactory() override {} + + static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr<SstPartitioner> CreatePartitioner( + const SstPartitioner::Context& /* context */) const override; + + private: + size_t len_; +}; + +extern std::shared_ptr<SstPartitionerFactory> +NewSstPartitionerFixedPrefixFactory(size_t prefix_len); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h new file mode 100644 index 000000000..42a938f30 --- /dev/null +++ b/src/rocksdb/include/rocksdb/statistics.h @@ -0,0 +1,707 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <atomic> +#include <cstddef> +#include <cstdint> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +/** + * Keep adding tickers here. + * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking + * over its old value. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. + * 3. Add a corresponding enum value to TickerType.java in the java API + * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType + * and toCppTickers + */ +enum Tickers : uint32_t { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS = 0, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, + // # of times cache miss when accessing index block from block cache. 
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of index blocks added to block cache.
+  BLOCK_CACHE_INDEX_ADD,
+  // # of bytes of index blocks inserted into cache
+  BLOCK_CACHE_INDEX_BYTES_INSERT,
+  // # of bytes of index blocks erased from cache
+  BLOCK_CACHE_INDEX_BYTES_EVICT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of filter blocks added to block cache.
+  BLOCK_CACHE_FILTER_ADD,
+  // # of bytes of bloom filter blocks inserted into cache
+  BLOCK_CACHE_FILTER_BYTES_INSERT,
+  // # of bytes of bloom filter blocks erased from cache
+  BLOCK_CACHE_FILTER_BYTES_EVICT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of data blocks added to block cache.
+  BLOCK_CACHE_DATA_ADD,
+  // # of bytes of data blocks inserted into cache
+  BLOCK_CACHE_DATA_BYTES_INSERT,
+  // # of bytes read from cache.
+  BLOCK_CACHE_BYTES_READ,
+  // # of bytes written into cache.
+  BLOCK_CACHE_BYTES_WRITE,
+
+  // # of times bloom filter has avoided file reads, i.e., negatives.
+  BLOOM_FILTER_USEFUL,
+  // # of times bloom FullFilter has not avoided the reads.
+  BLOOM_FILTER_FULL_POSITIVE,
+  // # of times bloom FullFilter has not avoided the reads and data actually
+  // exist.
+  BLOOM_FILTER_FULL_TRUE_POSITIVE,
+
+  BLOOM_FILTER_MICROS,
+
+  // # persistent cache hit
+  PERSISTENT_CACHE_HIT,
+  // # persistent cache miss
+  PERSISTENT_CACHE_MISS,
+
+  // # total simulation block cache hits
+  SIM_BLOCK_CACHE_HIT,
+  // # total simulation block cache misses
+  SIM_BLOCK_CACHE_MISS,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  // # of Get() queries served by L0
+  GET_HIT_L0,
+  // # of Get() queries served by L1
+  GET_HIT_L1,
+  // # of Get() queries served by L2 and up
+  GET_HIT_L2_AND_UP,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction.
+   * There are 4 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
+                                    // Also includes keys dropped for range del.
+  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
+  COMPACTION_KEY_DROP_RANGE_DEL,    // key was covered by a range tombstone.
+  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
+  COMPACTION_RANGE_DEL_DROP_OBSOLETE,  // all keys in range were deleted.
+  // Deletions obsoleted before bottom level due to file gap optimization.
+  COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+  // If a compaction was canceled in sfm to prevent ENOSPC
+  COMPACTION_CANCELLED,
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN,
+  // Number of keys read.
+  NUMBER_KEYS_READ,
+  // Number of keys updated, if in-place update is enabled
+  NUMBER_KEYS_UPDATED,
+  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+  // DB::Merge(), and DB::Write().
+  BYTES_WRITTEN,
+  // The number of uncompressed bytes read from DB::Get(). It could be
+  // either from memtables, cache, or table files.
+  // For the number of logical bytes read from DB::MultiGet(),
+  // please use NUMBER_MULTIGET_BYTES_READ.
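+  // (Illustrative read of this ticker, assuming statistics were enabled via
+  // options.statistics = CreateDBStatistics():
+  //   uint64_t bytes = options.statistics->getTickerCount(BYTES_READ);)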
+  BYTES_READ,
+  // The number of calls to seek/next/prev
+  NUMBER_DB_SEEK,
+  NUMBER_DB_NEXT,
+  NUMBER_DB_PREV,
+  // The number of calls to seek/next/prev that returned data
+  NUMBER_DB_SEEK_FOUND,
+  NUMBER_DB_NEXT_FOUND,
+  NUMBER_DB_PREV_FOUND,
+  // The number of uncompressed bytes read from an iterator.
+  // Includes size of key and value.
+  ITER_BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // DEPRECATED Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // DEPRECATED Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // DEPRECATED write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  // Writer has to wait for compaction or flush to finish.
+  STALL_MICROS,
+  // The wait time for db mutex.
+  // Disabled by default. To enable it set stats level to kAll
+  DB_MUTEX_WAIT_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+  // DEPRECATED number of iterators currently open
+  NO_ITERATORS,
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of delete records that were not required to be
+  // written to storage because the key did not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+
+  // Number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over a large number of keys with the same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
+  // Number of blocks added to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD,
+  // Number of failures when adding blocks to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+  WAL_FILE_SYNCED,  // Number of times WAL sync is done
+  WAL_FILE_BYTES,   // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,  // Equivalent to writes done for others
+  WRITE_TIMEDOUT,       // Number of writes that timed out
+  WRITE_WITH_WAL,       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES,   // Bytes read during compaction
+  COMPACT_WRITE_BYTES,  // Bytes written during compaction
+  FLUSH_WRITE_BYTES,    // Bytes written during flush
+
+  // Compaction read and write statistics broken down by CompactionReason
+  COMPACT_READ_BYTES_MARKED,
+  COMPACT_READ_BYTES_PERIODIC,
+  COMPACT_READ_BYTES_TTL,
+  COMPACT_WRITE_BYTES_MARKED,
+  COMPACT_WRITE_BYTES_PERIODIC,
+  COMPACT_WRITE_BYTES_TTL,
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+  NUMBER_SUPERVERSION_ACQUIRES,
+  NUMBER_SUPERVERSION_RELEASES,
+  NUMBER_SUPERVERSION_CLEANUPS,
+
+  // # of compressions/decompressions executed
+  NUMBER_BLOCK_COMPRESSED,
+  NUMBER_BLOCK_DECOMPRESSED,
+
+  NUMBER_BLOCK_NOT_COMPRESSED,
+  MERGE_OPERATION_TOTAL_TIME,
+  FILTER_OPERATION_TOTAL_TIME,
+
+  // Row cache.
+  ROW_CACHE_HIT,
+  ROW_CACHE_MISS,
+
+  // Read amplification statistics.
+  // Read amplification can be calculated using this formula
+  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+  //
+  // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
+  READ_AMP_ESTIMATE_USEFUL_BYTES,  // Estimate of total bytes actually used.
+  READ_AMP_TOTAL_READ_BYTES,       // Total size of loaded data blocks.
+
+  // Number of refill intervals where rate limiter's bytes are fully consumed.
+  NUMBER_RATE_LIMITER_DRAINS,
+
+  // Number of internal keys skipped by Iterator
+  NUMBER_ITER_SKIP,
+
+  // BlobDB specific stats
+  // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_PUT,
+  // # of Write to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_WRITE,
+  // # of Get to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_GET,
+  // # of MultiGet to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_MULTIGET,
+  // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_NUM_SEEK,
+  // # of Next to BlobDB iterator. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_NEXT,
+  // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_PREV,
+  // # of keys written to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_KEYS_WRITTEN,
+  // # of keys read from BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_KEYS_READ,
+  // # of bytes (key + value) written to BlobDB. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BYTES_WRITTEN,
+  // # of bytes (keys + value) read from BlobDB. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BYTES_READ,
+  // # of keys written by BlobDB as non-TTL inlined value. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_WRITE_INLINED,
+  // # of keys written by BlobDB as TTL inlined value. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_WRITE_INLINED_TTL,
+  // # of keys written by BlobDB as non-TTL blob value. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_WRITE_BLOB,
+  // # of keys written by BlobDB as TTL blob value. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_WRITE_BLOB_TTL,
+  // # of bytes written to blob file.
+  BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+  // # of bytes read from blob file.
+  BLOB_DB_BLOB_FILE_BYTES_READ,
+  // # of times a blob file is synced.
+  BLOB_DB_BLOB_FILE_SYNCED,
+  // # of blob indexes evicted from base DB by BlobDB compaction filter
+  // because of expiration. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
+  // Size of blob indexes evicted from base DB by BlobDB compaction filter
+  // because of expiration. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
+  // # of blob indexes evicted from base DB by BlobDB compaction filter
+  // because the corresponding file was deleted. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
+  // Size of blob indexes evicted from base DB by BlobDB compaction filter
+  // because the corresponding file was deleted. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
+  // # of blob files that were obsoleted by garbage collection. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_GC_NUM_FILES,
+  // # of blob files generated by garbage collection. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_GC_NUM_NEW_FILES,
+  // # of BlobDB garbage collection failures. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_GC_FAILURES,
+  // # of keys dropped by BlobDB garbage collection because they had been
+  // overwritten. DEPRECATED.
+  BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
+  // # of keys dropped by BlobDB garbage collection because of expiration.
+  // DEPRECATED.
+  BLOB_DB_GC_NUM_KEYS_EXPIRED,
+  // # of keys relocated to new blob file by garbage collection.
+  BLOB_DB_GC_NUM_KEYS_RELOCATED,
+  // # of bytes dropped by BlobDB garbage collection because they had been
+  // overwritten. DEPRECATED.
+  BLOB_DB_GC_BYTES_OVERWRITTEN,
+  // # of bytes dropped by BlobDB garbage collection because of expiration.
+  // DEPRECATED.
+  BLOB_DB_GC_BYTES_EXPIRED,
+  // # of bytes relocated to new blob file by garbage collection.
+  BLOB_DB_GC_BYTES_RELOCATED,
+  // # of blob files evicted because BlobDB is full. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_FIFO_NUM_FILES_EVICTED,
+  // # of keys in the blob files evicted because BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+  // # of bytes in the blob files evicted because BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_BYTES_EVICTED,
+
+  // These counters indicate a performance issue in WritePrepared
+  // transactions. We should not see them ticking much.
+  // # of times prepare_mutex_ is acquired in the fast path.
+  TXN_PREPARE_MUTEX_OVERHEAD,
+  // # of times old_commit_map_mutex_ is acquired in the fast path.
+  TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+  // # of times we checked a batch for duplicate keys.
+  TXN_DUPLICATE_KEY_OVERHEAD,
+  // # of times snapshot_mutex_ is acquired in the fast path.
+  TXN_SNAPSHOT_MUTEX_OVERHEAD,
+  // # of times ::Get returned TryAgain due to expired snapshot seq
+  TXN_GET_TRY_AGAIN,
+
+  // Number of keys actually found in MultiGet calls (vs number requested by
+  // caller)
+  // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
+  NUMBER_MULTIGET_KEYS_FOUND,
+
+  NO_ITERATOR_CREATED,  // number of iterators created
+  NO_ITERATOR_DELETED,  // number of iterators deleted
+
+  BLOCK_CACHE_COMPRESSION_DICT_MISS,
+  BLOCK_CACHE_COMPRESSION_DICT_HIT,
+  BLOCK_CACHE_COMPRESSION_DICT_ADD,
+  BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+  BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
+
+  // # of blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD
+  BLOCK_CACHE_ADD_REDUNDANT,
+  // # of index blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD
+  BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+  // # of filter blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD
+  BLOCK_CACHE_FILTER_ADD_REDUNDANT,
+  // # of data blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD
+  BLOCK_CACHE_DATA_ADD_REDUNDANT,
+  // # of dict blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT
+  //           <= BLOCK_CACHE_COMPRESSION_DICT_ADD
+  BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
+
+  // # of files marked as trash by sst file manager and will be deleted
+  // later by background thread.
+  FILES_MARKED_TRASH,
+  // # of files deleted immediately by sst file manager through delete
+  // scheduler.
+  FILES_DELETED_IMMEDIATELY,
+
+  // Counters for the error handler. Note that bg_io_error is a subset of
+  // bg_error, and bg_retryable_io_error is a subset of bg_io_error.
+  ERROR_HANDLER_BG_ERROR_COUNT,
+  ERROR_HANDLER_BG_IO_ERROR_COUNT,
+  ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+  ERROR_HANDLER_AUTORESUME_COUNT,
+  ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+  ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+
+  // Statistics for memtable garbage collection:
+  // Raw bytes of data (payload) present on memtable at flush time.
+  MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+  // Outdated bytes of data present on memtable at flush time.
+  MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+
+  // Secondary cache statistics
+  SECONDARY_CACHE_HITS,
+
+  // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs.
+  VERIFY_CHECKSUM_READ_BYTES,
+
+  // Bytes read/written while creating backups
+  BACKUP_READ_BYTES,
+  BACKUP_WRITE_BYTES,
+
+  // Remote compaction read/write statistics
+  REMOTE_COMPACT_READ_BYTES,
+  REMOTE_COMPACT_WRITE_BYTES,
+
+  // Tiered storage related statistics
+  HOT_FILE_READ_BYTES,
+  WARM_FILE_READ_BYTES,
+  COLD_FILE_READ_BYTES,
+  HOT_FILE_READ_COUNT,
+  WARM_FILE_READ_COUNT,
+  COLD_FILE_READ_COUNT,
+
+  // Last level and non-last level read statistics
+  LAST_LEVEL_READ_BYTES,
+  LAST_LEVEL_READ_COUNT,
+  NON_LAST_LEVEL_READ_BYTES,
+  NON_LAST_LEVEL_READ_COUNT,
+
+  BLOCK_CHECKSUM_COMPUTE_COUNT,
+  MULTIGET_COROUTINE_COUNT,
+
+  // Integrated BlobDB specific stats
+  // # of times cache miss when accessing blob from blob cache.
+  BLOB_DB_CACHE_MISS,
+  // # of times cache hit when accessing blob from blob cache.
+  BLOB_DB_CACHE_HIT,
+  // # of data blocks added to blob cache.
+  BLOB_DB_CACHE_ADD,
+  // # of failures when adding blobs to blob cache.
+  BLOB_DB_CACHE_ADD_FAILURES,
+  // # of bytes read from blob cache.
+  BLOB_DB_CACHE_BYTES_READ,
+  // # of bytes written into blob cache.
+  BLOB_DB_CACHE_BYTES_WRITE,
+
+  // Time spent in the ReadAsync file system call
+  READ_ASYNC_MICROS,
+  // Number of errors returned to the async read callback
+  ASYNC_READ_ERROR_COUNT,
+
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
+extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + * Add a corresponding enum value to HistogramType.java in the java API + */ +enum Histograms : uint32_t { + DB_GET = 0, + DB_WRITE, + COMPACTION_TIME, + COMPACTION_CPU_TIME, + SUBCOMPACTION_SETUP_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + DB_SEEK, + WRITE_STALL, + SST_READ_MICROS, + // The number of subcompactions actually scheduled during a compaction + NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + + // number of bytes compressed/decompressed + // number of bytes is when uncompressed; i.e. before/after respectively + BYTES_COMPRESSED, + BYTES_DECOMPRESSED, + COMPRESSION_TIMES_NANOS, + DECOMPRESSION_TIMES_NANOS, + // Number of merge operands passed to the merge operator in user read + // requests. + READ_NUM_MERGE_OPERANDS, + + // BlobDB specific stats + // Size of keys written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_KEY_SIZE, + // Size of values written to BlobDB. Only applicable to legacy BlobDB. + BLOB_DB_VALUE_SIZE, + // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy + // BlobDB. + BLOB_DB_WRITE_MICROS, + // BlobDB Get latency. Only applicable to legacy BlobDB. + BLOB_DB_GET_MICROS, + // BlobDB MultiGet latency. Only applicable to legacy BlobDB. + BLOB_DB_MULTIGET_MICROS, + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to + // legacy BlobDB. + BLOB_DB_SEEK_MICROS, + // BlobDB Next latency. Only applicable to legacy BlobDB. + BLOB_DB_NEXT_MICROS, + // BlobDB Prev latency. Only applicable to legacy BlobDB. + BLOB_DB_PREV_MICROS, + // Blob file write latency. + BLOB_DB_BLOB_FILE_WRITE_MICROS, + // Blob file read latency. + BLOB_DB_BLOB_FILE_READ_MICROS, + // Blob file sync latency. + BLOB_DB_BLOB_FILE_SYNC_MICROS, + // BlobDB garbage collection time. DEPRECATED. + BLOB_DB_GC_MICROS, + // BlobDB compression time. + BLOB_DB_COMPRESSION_MICROS, + // BlobDB decompression time. + BLOB_DB_DECOMPRESSION_MICROS, + // Time spent flushing memtable to disk + FLUSH_TIME, + SST_BATCH_SIZE, + + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of data blocks read from file system per level. + // Obsolete + NUM_DATA_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + + // Stats related to asynchronous read requests. + ASYNC_READ_BYTES, + POLL_WAIT_MICROS, + + // Number of prefetched bytes discarded by RocksDB. 
+  PREFETCHED_BYTES_DISCARDED,
+
+  // Number of IOs issued in parallel in a MultiGet batch
+  MULTIGET_IO_BATCH_SIZE,
+
+  // Number of levels requiring IO for MultiGet
+  NUM_LEVEL_READ_PER_MULTIGET,
+
+  // Wait time for aborting async read in FilePrefetchBuffer destructor
+  ASYNC_PREFETCH_ABORT_MICROS,
+
+  HISTOGRAM_ENUM_MAX,
+};
+
+extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+  // Zero-initialize new members since old Statistics::histogramData()
+  // implementations won't write them.
+  double max = 0.0;
+  uint64_t count = 0;
+  uint64_t sum = 0;
+  double min = 0.0;
+};
+
+// StatsLevel can be used to reduce statistics overhead by skipping certain
+// types of stats in the stats collection process.
+// Usage:
+//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+enum StatsLevel : uint8_t {
+  // Disable all metrics
+  kDisableAll,
+  // Disable tickers
+  kExceptTickers = kDisableAll,
+  // Disable timer stats, and skip histogram stats
+  kExceptHistogramOrTimers,
+  // Skip timer stats
+  kExceptTimers,
+  // Collect all stats except time inside mutex lock AND time spent on
+  // compression.
+  kExceptDetailedTimers,
+  // Collect all stats except the counters requiring to get time inside the
+  // mutex lock.
+  kExceptTimeForMutex,
+  // Collect all stats, including measuring duration of mutex operations.
+  // If getting the time is expensive on the platform, it can reduce
+  // scalability with more threads, especially for writes.
+  kAll,
+};
+
+// Analyze the performance of a db by providing cumulative stats over time.
+// Usage:
+//   Options options;
+//   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+//   Status s = DB::Open(options, kDBPath, &db);
+//   ...
+//   options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+//   HistogramData hist;
+//   options.statistics->histogramData(FLUSH_TIME, &hist);
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Statistics : public Customizable {
+ public:
+  ~Statistics() override {}
+  static const char* Type() { return "Statistics"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 std::shared_ptr<Statistics>* result);
+  // Default name is empty, for backward compatibility. Derived classes
+  // should override this method.
+  // This default implementation will likely be removed in a future release.
+  const char* Name() const override { return ""; }
+  virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+  virtual void histogramData(uint32_t type,
+                             HistogramData* const data) const = 0;
+  virtual std::string getHistogramString(uint32_t /*type*/) const {
+    return "";
+  }
+  virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+  virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
+  virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
+    if (get_stats_level() <= StatsLevel::kExceptTimers) {
+      return;
+    }
+    recordInHistogram(histogramType, time);
+  }
+  // This function is here only for backward compatibility reasons.
+  // Users implementing their own Statistics class should override
+  // recordInHistogram() instead and leave measureTime() as it is.
+  virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
+    // This is not supposed to be called.
+    assert(false);
+  }
+  virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
+    // measureTime() is the old and inaccurate function name. To keep
+    // backward compatibility, if users implement their own statistics that
+    // overrides measureTime() but doesn't override this function, we forward
+    // to measureTime().
+    measureTime(histogramType, time);
+  }
+
+  // Resets all ticker and histogram stats
+  virtual Status Reset() { return Status::NotSupported("Not implemented"); }
+
+#ifndef ROCKSDB_LITE
+  using Customizable::ToString;
+#endif  // ROCKSDB_LITE
+  // String representation of the statistic object. Must be thread-safe.
+  virtual std::string ToString() const {
+    // Do nothing by default
+    return std::string("ToString(): not implemented");
+  }
+
+  virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
+    // Do nothing by default
+    return false;
+  }
+
+  // Override this function to disable particular histogram collection
+  virtual bool HistEnabledForType(uint32_t type) const {
+    return type < HISTOGRAM_ENUM_MAX;
+  }
+  void set_stats_level(StatsLevel sl) {
+    stats_level_.store(sl, std::memory_order_relaxed);
+  }
+  StatsLevel get_stats_level() const {
+    return stats_level_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h
new file mode 100644
index 000000000..57e469295
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/stats_history.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// StatsHistoryIterator is the main interface for users to programmatically
+// access statistics snapshots that were automatically stored by RocksDB.
+// Depending on options, the stats can be in memory or on disk.
+// The stats snapshots are indexed by the time they were recorded, and each
+// stats snapshot contains individual stat names and values at the time of
+// recording.
+// Example:
+//   std::unique_ptr<StatsHistoryIterator> stats_iter;
+//   Status s = db->GetStatsHistory(0 /* start_time */,
+//                                  env->NowMicros() /* end_time */,
+//                                  &stats_iter);
+//   if (s.ok()) {
+//     for (; stats_iter->Valid(); stats_iter->Next()) {
+//       uint64_t stats_time = stats_iter->GetStatsTime();
+//       const std::map<std::string, uint64_t>& stats_map =
+//           stats_iter->GetStatsMap();
+//       process(stats_time, stats_map);
+//     }
+//   }
+class StatsHistoryIterator {
+ public:
+  StatsHistoryIterator() {}
+  virtual ~StatsHistoryIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  // Moves to the next stats history record. After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Return the timestamp (in seconds) when the stats history was recorded.
+  // REQUIRES: Valid()
+  virtual uint64_t GetStatsTime() const = 0;
+
+  // DEPRECATED (was never used)
+  virtual int GetFormatVersion() const { return -1; }
+
+  // Return the current stats history as an std::map which specifies the
+  // mapping from stats name to stats value. The underlying storage
+  // for the returned map is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0;
+
+  // If an error has occurred, return it. Else return an ok status.
+  virtual Status status() const = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
new file mode 100644
index 000000000..1ab3dc4cb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -0,0 +1,570 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#pragma once
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include <memory>
+#include <string>
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include "port/stack_trace.h"
+#endif
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status {
+ public:
+  // Create a success status.
+  Status()
+      : code_(kOk),
+        subcode_(kNone),
+        sev_(kNoError),
+        retryable_(false),
+        data_loss_(false),
+        scope_(0),
+        state_(nullptr) {}
+  ~Status() {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    if (!checked_) {
+      fprintf(stderr, "Failed to check Status %p\n", this);
+      port::PrintStack();
+      std::abort();
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+
+  // Copy the specified status.
+ Status(const Status& s); + Status& operator=(const Status& s); + Status(Status&& s) noexcept; + Status& operator=(Status&& s) noexcept; + bool operator==(const Status& rhs) const; + bool operator!=(const Status& rhs) const; + + // In case of intentionally swallowing an error, user must explicitly call + // this function. That way we are easily able to search the code to find where + // error swallowing occurs. + inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + + enum Code : unsigned char { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7, + kShutdownInProgress = 8, + kTimedOut = 9, + kAborted = 10, + kBusy = 11, + kExpired = 12, + kTryAgain = 13, + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode + }; + + Code code() const { + MarkChecked(); + return code_; + } + + enum SubCode : unsigned char { + kNone = 0, + kMutexTimeout = 1, + kLockTimeout = 2, + kLockLimit = 3, + kNoSpace = 4, + kDeadlock = 5, + kStaleFile = 6, + kMemoryLimit = 7, + kSpaceLimit = 8, + kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, + kOverwritten = 12, + kTxnNotPrepared = 13, + kIOFenced = 14, + kMaxSubCode + }; + + SubCode subcode() const { + MarkChecked(); + return subcode_; + } + + enum Severity : unsigned char { + kNoError = 0, + kSoftError = 1, + kHardError = 2, + kFatalError = 3, + kUnrecoverableError = 4, + kMaxSeverity + }; + + Status(const Status& s, Severity sev); + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + + Severity severity() const { + MarkChecked(); + return sev_; + } + + // Returns a C style string indicating the message of the Status + const char* getState() const { + MarkChecked(); + return state_.get(); + } + + // Return a success status. + static Status OK() { return Status(); } + + // Successful, though an existing something was overwritten + // Note: using variants of OK status for program logic is discouraged, + // but it can be useful for communicating statistical information without + // changing public APIs. + static Status OkOverwritten() { return Status(kOk, kOverwritten); } + + // Return error status of an appropriate type. 
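+  //
+  // For example (an illustrative sketch only, not part of the API contract):
+  //
+  //   Status s = Status::NotFound("no such key");
+  //   if (s.IsNotFound()) {
+  //     // treat as a miss rather than a hard failure
+  //   }
+  //   // Under ROCKSDB_ASSERT_STATUS_CHECKED builds, a status that is
+  //   // deliberately ignored should call s.PermitUncheckedError().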
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + + // Fast path for not found without malloc; + static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + + static Status NotFound(SubCode sc, const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kNotFound, sc, msg, msg2); + } + + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status Corruption(SubCode msg = kNone) { + return Status(kCorruption, msg); + } + + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status NotSupported(SubCode msg = kNone) { + return Status(kNotSupported, msg); + } + + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status InvalidArgument(SubCode msg = kNone) { + return Status(kInvalidArgument, msg); + } + + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); } + + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status MergeInProgress(SubCode msg = kNone) { + return Status(kMergeInProgress, msg); + } + + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + static Status Incomplete(SubCode msg = kNone) { + return Status(kIncomplete, msg); + } + + static Status ShutdownInProgress(SubCode msg = kNone) { + return Status(kShutdownInProgress, msg); + } + static Status ShutdownInProgress(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kShutdownInProgress, msg, msg2); + } + static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } + + static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); } + static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kBusy, msg, msg2); + } + + static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } + + static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); } + static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kExpired, msg, msg2); + } + + static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); } + static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTryAgain, msg, msg2); + } + + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + + static Status NoSpace() { return Status(kIOError, kNoSpace); } + static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return 
Status(kIOError, kNoSpace, msg, msg2);
+  }
+
+  static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); }
+  static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kAborted, kMemoryLimit, msg, msg2);
+  }
+
+  static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); }
+  static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, kSpaceLimit, msg, msg2);
+  }
+
+  static Status PathNotFound() { return Status(kIOError, kPathNotFound); }
+  static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, kPathNotFound, msg, msg2);
+  }
+
+  static Status TxnNotPrepared() {
+    return Status(kInvalidArgument, kTxnNotPrepared);
+  }
+  static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const {
+    MarkChecked();
+    return code() == kOk;
+  }
+
+  // Returns true iff the status indicates success *with* something
+  // overwritten
+  bool IsOkOverwritten() const {
+    MarkChecked();
+    return code() == kOk && subcode() == kOverwritten;
+  }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const {
+    MarkChecked();
+    return code() == kNotFound;
+  }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const {
+    MarkChecked();
+    return code() == kCorruption;
+  }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const {
+    MarkChecked();
+    return code() == kNotSupported;
+  }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const {
+    MarkChecked();
+    return code() == kInvalidArgument;
+  }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const {
+    MarkChecked();
+    return code() == kIOError;
+  }
+
+  // Returns true iff the status indicates a MergeInProgress error.
+  bool IsMergeInProgress() const {
+    MarkChecked();
+    return code() == kMergeInProgress;
+  }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const {
+    MarkChecked();
+    return code() == kIncomplete;
+  }
+
+  // Returns true iff the status indicates that a shutdown is in progress
+  bool IsShutdownInProgress() const {
+    MarkChecked();
+    return code() == kShutdownInProgress;
+  }
+
+  bool IsTimedOut() const {
+    MarkChecked();
+    return code() == kTimedOut;
+  }
+
+  bool IsAborted() const {
+    MarkChecked();
+    return code() == kAborted;
+  }
+
+  bool IsLockLimit() const {
+    MarkChecked();
+    return code() == kAborted && subcode() == kLockLimit;
+  }
+
+  // Returns true iff the status indicates that a resource is Busy and
+  // temporarily could not be acquired.
+  bool IsBusy() const {
+    MarkChecked();
+    return code() == kBusy;
+  }
+
+  bool IsDeadlock() const {
+    MarkChecked();
+    return code() == kBusy && subcode() == kDeadlock;
+  }
+
+  // Returns true iff the status indicates that the operation has expired.
+  bool IsExpired() const {
+    MarkChecked();
+    return code() == kExpired;
+  }
+
+  // Returns true iff the status indicates a TryAgain error.
+  // This usually means that the operation failed, but may succeed if
+  // re-attempted.
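+  //
+  // A hedged caller-side sketch (kMaxRetries is a hypothetical bound chosen
+  // by the application):
+  //
+  //   Status s;
+  //   for (int i = 0; i < kMaxRetries; ++i) {
+  //     s = db->Write(WriteOptions(), &batch);
+  //     if (!s.IsTryAgain()) break;
+  //   }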
+  bool IsTryAgain() const {
+    MarkChecked();
+    return code() == kTryAgain;
+  }
+
+  // Returns true iff the status indicates the proposed compaction is too large
+  bool IsCompactionTooLarge() const {
+    MarkChecked();
+    return code() == kCompactionTooLarge;
+  }
+
+  // Returns true iff the status indicates Column Family Dropped
+  bool IsColumnFamilyDropped() const {
+    MarkChecked();
+    return code() == kColumnFamilyDropped;
+  }
+
+  // Returns true iff the status indicates a NoSpace error
+  // This is caused by an I/O error returning the specific "out of space"
+  // error condition. Strictly speaking, a NoSpace error is an I/O error
+  // with a specific subcode, enabling users to take the appropriate action
+  // if needed
+  bool IsNoSpace() const {
+    MarkChecked();
+    return (code() == kIOError) && (subcode() == kNoSpace);
+  }
+
+  // Returns true iff the status indicates a memory limit error. There may be
+  // cases where we limit the memory used in certain operations (e.g. the size
+  // of a write batch) in order to avoid out of memory exceptions.
+  bool IsMemoryLimit() const {
+    MarkChecked();
+    return (code() == kAborted) && (subcode() == kMemoryLimit);
+  }
+
+  // Returns true iff the status indicates a PathNotFound error
+  // This is caused by an I/O error returning the specific "no such file or
+  // directory" error condition. A PathNotFound error is an I/O error with
+  // a specific subcode, enabling users to take appropriate action if necessary
+  bool IsPathNotFound() const {
+    MarkChecked();
+    return (code() == kIOError || code() == kNotFound) &&
+           (subcode() == kPathNotFound);
+  }
+
+  // Returns true iff the status indicates manual compaction paused. This
+  // is caused by a call to PauseManualCompaction
+  bool IsManualCompactionPaused() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
+  }
+
+  // Returns true iff the status indicates a TxnNotPrepared error.
+  bool IsTxnNotPrepared() const {
+    MarkChecked();
+    return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared);
+  }
+
+  // Returns true iff the status indicates an IOFenced error.
+  bool IsIOFenced() const {
+    MarkChecked();
+    return (code() == kIOError) && (subcode() == kIOFenced);
+  }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ protected:
+  Code code_;
+  SubCode subcode_;
+  Severity sev_;
+  bool retryable_;
+  bool data_loss_;
+  unsigned char scope_;
+  // A nullptr state_ (which is at least the case for OK) means the extra
+  // message is empty.
+ std::unique_ptr<const char[]> state_; +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + mutable bool checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + + explicit Status(Code _code, SubCode _subcode = kNone) + : code_(_code), + subcode_(_subcode), + sev_(kNoError), + retryable_(false), + data_loss_(false), + scope_(0) {} + + explicit Status(Code _code, SubCode _subcode, bool retryable, bool data_loss, + unsigned char scope) + : code_(_code), + subcode_(_subcode), + sev_(kNoError), + retryable_(retryable), + data_loss_(data_loss), + scope_(scope) {} + + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); + Status(Code _code, const Slice& msg, const Slice& msg2) + : Status(_code, kNone, msg, msg2) {} + + static std::unique_ptr<const char[]> CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } +}; + +inline Status::Status(const Status& s) + : code_(s.code_), + subcode_(s.subcode_), + sev_(s.sev_), + retryable_(s.retryable_), + data_loss_(s.data_loss_), + scope_(s.scope_) { + s.MarkChecked(); + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline Status::Status(const Status& s, Severity sev) + : code_(s.code_), + subcode_(s.subcode_), + sev_(sev), + retryable_(s.retryable_), + data_loss_(s.data_loss_), + scope_(s.scope_) { + s.MarkChecked(); + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); +} +inline Status& Status::operator=(const Status& s) { + if (this != &s) { + s.MarkChecked(); + MustCheck(); + code_ = s.code_; + subcode_ = s.subcode_; + sev_ = s.sev_; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get()); + } + return *this; +} + +inline Status::Status(Status&& s) noexcept : Status() { + s.MarkChecked(); + *this = std::move(s); +} + +inline Status& Status::operator=(Status&& s) noexcept { + if (this != &s) { + s.MarkChecked(); + MustCheck(); + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + sev_ = std::move(s.sev_); + s.sev_ = kNoError; + retryable_ = std::move(s.retryable_); + s.retryable_ = false; + data_loss_ = std::move(s.data_loss_); + s.data_loss_ = false; + scope_ = std::move(s.scope_); + s.scope_ = 0; + state_ = std::move(s.state_); + } + return *this; +} + +inline bool Status::operator==(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); + return (code_ == rhs.code_); +} + +inline bool Status::operator!=(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); + return !(*this == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/system_clock.h b/src/rocksdb/include/rocksdb/system_clock.h new file mode 100644 index 000000000..486183d60 --- /dev/null +++ b/src/rocksdb/include/rocksdb/system_clock.h @@ -0,0 +1,116 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include <stdint.h> + +#include <memory> + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock : public Customizable { + public: + ~SystemClock() override {} + + static const char* Type() { return "SystemClock"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr<SystemClock>* result); + // The name of this system clock + virtual const char* Name() const override = 0; + + // The name/nickname for the Default SystemClock. This name can be used + // to determine if the clock is the default one. + static const char* kDefaultName() { return "DefaultClock"; } + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr<SystemClock>& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. + virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. + // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. 
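+//
+// For illustration (an editorial sketch, not part of the original header):
+// a test clock that skews time by a fixed offset could wrap the default
+// clock and override only what it needs. `SkewedClock` is a hypothetical
+// name used only for this example.
+//
+//   class SkewedClock : public SystemClockWrapper {
+//    public:
+//     explicit SkewedClock(uint64_t offset_us)
+//         : SystemClockWrapper(SystemClock::Default()),
+//           offset_us_(offset_us) {}
+//     const char* Name() const override { return "SkewedClock"; }
+//     uint64_t NowMicros() override {
+//       return SystemClockWrapper::NowMicros() + offset_us_;
+//     }
+//
+//    private:
+//     uint64_t offset_us_;
+//   };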
+class SystemClockWrapper : public SystemClock {
+ public:
+  explicit SystemClockWrapper(const std::shared_ptr<SystemClock>& t);
+
+  uint64_t NowMicros() override { return target_->NowMicros(); }
+
+  uint64_t NowNanos() override { return target_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return target_->CPUMicros(); }
+
+  uint64_t CPUNanos() override { return target_->CPUNanos(); }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    return target_->SleepForMicroseconds(micros);
+  }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return target_->GetCurrentTime(unix_time);
+  }
+
+  std::string TimeToString(uint64_t time) override {
+    return target_->TimeToString(time);
+  }
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+#endif  // ROCKSDB_LITE
+  const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+  std::shared_ptr<SystemClock> target_;
+};
+
+}  // end namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
new file mode 100644
index 000000000..3a2bf2629
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -0,0 +1,940 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+//    LevelDB, which was designed for storing data on hard disks or flash
+//    devices.
+// 2. Plain table: it is one of RocksDB's SST file formats, optimized
+//    for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available at:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Block-based Table
+class Cache;
+class FilterPolicy;
+class FlushBlockPolicyFactory;
+class PersistentCache;
+class RandomAccessFile;
+struct TableReaderOptions;
+struct TableBuilderOptions;
+class TableBuilder;
+class TableFactory;
+class TableReader;
+class WritableFileWriter;
+struct ConfigOptions;
+struct EnvOptions;
+
+// Types of checksums to use for checking integrity of logical blocks within
+// files. All checksums currently use 32 bits of checking power (1 in 4B
+// chance of failing to detect random corruption).
+enum ChecksumType : char {
+  kNoChecksum = 0x0,
+  kCRC32c = 0x1,
+  kxxHash = 0x2,
+  kxxHash64 = 0x3,
+  kXXH3 = 0x4,  // Supported since RocksDB 6.27
+};
+
+// `PinningTier` is used to specify which tier of block-based tables should
+// be affected by a block cache pinning setting (see
+// `MetadataCacheOptions` below).
+enum class PinningTier {
+  // For compatibility, this value specifies to fallback to the behavior
+  // indicated by the deprecated options,
+  // `pin_l0_filter_and_index_blocks_in_cache` and
+  // `pin_top_level_index_and_filter`.
+  kFallback,
+
+  // This tier contains no block-based tables.
+  kNone,
+
+  // This tier contains block-based tables that may have originated from a
+  // memtable flush. In particular, it includes tables from L0 that are smaller
+  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
+  // it can include intra-L0 compaction outputs and ingested files, as long as
+  // they are not abnormally large compared to flushed files in L0.
+  kFlushedAndSimilar,
+
+  // This tier contains all block-based tables.
+  kAll,
+};
+
+// `MetadataCacheOptions` contains members indicating the desired caching
+// behavior for the different categories of metadata blocks.
+struct MetadataCacheOptions {
+  // The tier of block-based tables whose top-level index into metadata
+  // partitions will be pinned. Currently indexes and filters may be
+  // partitioned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise any top-level index into metadata partitions would
+  // be held in table reader memory, outside the block cache.
+  PinningTier top_level_index_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose metadata partitions will be pinned.
+  // Currently indexes and filters may be partitioned.
+  PinningTier partition_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose unpartitioned metadata blocks will be
+  // pinned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise the unpartitioned meta-blocks would be held in table
+  // reader memory, outside the block cache.
+  PinningTier unpartitioned_pinning = PinningTier::kFallback;
+};
+
+struct CacheEntryRoleOptions {
+  enum class Decision {
+    kEnabled,
+    kDisabled,
+    kFallback,
+  };
+  Decision charged = Decision::kFallback;
+  bool operator==(const CacheEntryRoleOptions& other) const {
+    return charged == other.charged;
+  }
+};
+
+struct CacheUsageOptions {
+  CacheEntryRoleOptions options;
+  std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+// For advanced users only
+struct BlockBasedTableOptions {
+  static const char* kName() { return "BlockTableOptions"; };
+  // @flush_block_policy_factory creates the instances of flush block policy,
+  // which provides a configurable way to determine when to flush a block in
+  // the block based tables. If not set, table builder will use the default
+  // block flush policy, which cuts blocks by block size (please refer to
+  // `FlushBlockBySizePolicy`).
+  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+  // TODO(kailiu) Temporarily disable this feature by making the default value
+  // to be false.
+  //
+  // TODO(ajkr) we need to update names of variables controlling meta-block
+  // caching as they should now apply to range tombstone and compression
+  // dictionary meta-blocks, in addition to index and filter meta-blocks.
+  //
+  // Whether to put index/filter blocks in the block cache. When false,
+  // each "table reader" object will pre-load index/filter blocks during
+  // table initialization. Index and filter partition blocks always use
+  // block cache regardless of this option.
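+  //
+  // For illustration (an editorial sketch, not part of the original header):
+  // caching metadata blocks and pinning them via the metadata_cache_options
+  // member below might look like:
+  //
+  //   BlockBasedTableOptions bbto;
+  //   bbto.cache_index_and_filter_blocks = true;
+  //   bbto.metadata_cache_options.top_level_index_pinning = PinningTier::kAll;
+  //   bbto.metadata_cache_options.partition_pinning = PinningTier::kAll;
+  //   bbto.metadata_cache_options.unpartitioned_pinning =
+  //       PinningTier::kFlushedAndSimilar;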
+ bool cache_index_and_filter_blocks = false; + + // If cache_index_and_filter_blocks is enabled, cache index and filter + // blocks with high priority. If set to true, depending on implementation of + // block cache, index, filter, and other metadata blocks may be less likely + // to be evicted than data blocks. + bool cache_index_and_filter_blocks_with_high_priority = true; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // + // If cache_index_and_filter_blocks is true and the below is true, then + // the top-level index of partitioned filter and index blocks are stored in + // the cache, but a reference is held in the "table reader" object so the + // blocks are pinned and only evicted from cache when the table reader is + // freed. This is not limited to l0 in LSM tree. + bool pin_top_level_index_and_filter = true; + + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch = 0x00, + + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + kHashSearch = 0x01, + + // A two-level index implementation. Both levels are binary search indexes. + // Second level index blocks ("partitions") use block cache even when + // cache_index_and_filter_blocks=false. + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. 
+    // Without it, iterator seek usually reads one block from each level-0
+    // file and from each level, which may be expensive.
+    // Works best in combination with:
+    //  - IndexShorteningMode::kNoShortening,
+    //  - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+    //    e.g. when prefix changes.
+    // Makes the index significantly bigger (2x or more), especially when keys
+    // are long.
+    kBinarySearchWithFirstKey = 0x03,
+  };
+
+  IndexType index_type = kBinarySearch;
+
+  // The index type that will be used for the data block.
+  enum DataBlockIndexType : char {
+    kDataBlockBinarySearch = 0,   // traditional block type
+    kDataBlockBinaryAndHash = 1,  // additional hash index
+  };
+
+  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+  // #entries/#buckets. It is valid only when data_block_index_type is
+  // kDataBlockBinaryAndHash.
+  double data_block_hash_table_util_ratio = 0.75;
+
+  // Option hash_index_allow_collision is now deleted.
+  // It will behave as if hash_index_allow_collision=true.
+
+  // Use the specified checksum type. Newly created table files will be
+  // protected with this checksum type. Old table files will still be readable,
+  // even though they have different checksum types.
+  ChecksumType checksum = kXXH3;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  bool no_block_cache = false;
+
+  // If non-NULL, use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  std::shared_ptr<Cache> block_cache = nullptr;
+
+  // If non-NULL, use the specified cache for pages read from the device.
+  // If NULL, no page cache is used.
+  std::shared_ptr<PersistentCache> persistent_cache = nullptr;
+
+  // DEPRECATED: This feature is planned for removal in a future release.
+  // Use SecondaryCache instead.
+  //
+  // If non-NULL, use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
+  // same type of object there.
+  std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+  // Approximate size of user data packed per block. Note that the
+  // block size specified here corresponds to uncompressed data. The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled. This parameter can be changed dynamically.
+  uint64_t block_size = 4 * 1024;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  int block_size_deviation = 10;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically. Most clients should
+  // leave this parameter alone. The minimum value allowed is 1. Any smaller
+  // value will be silently overwritten with 1.
+  int block_restart_interval = 16;
+
+  // Same as block_restart_interval but used for the index block.
+  int index_block_restart_interval = 1;
+
+  // Block size for partitioned metadata. Currently applied to indexes when
+  // kTwoLevelIndexSearch is used and to filters when partition_filters is
+  // used.
+  // Note: Since in the current implementation the filters and index partitions
+  // are aligned, an index/filter block is created when either index or filter
+  // block size reaches the specified limit.
+  // Note: this limit is currently applied to only index blocks; a filter
+  // partition is cut right after an index block is cut
+  // TODO(myabandeh): remove the note above when filter partitions are cut
+  // separately
+  uint64_t metadata_block_size = 4096;
+
+  // `cache_usage_options` allows users to specify the default
+  // options (`cache_usage_options.options`) and the overriding
+  // options (`cache_usage_options.options_overrides`)
+  // for different `CacheEntryRole` under various features related to cache
+  // usage.
+  //
+  // For a certain `CacheEntryRole role` and a certain feature `f` of
+  // `CacheEntryRoleOptions`:
+  // 1. If `options_overrides` has an entry for `role` and
+  //    `options_overrides[role].f != kFallback`, we use
+  //    `options_overrides[role].f`
+  // 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
+  // 3. Otherwise, we follow the compatible existing behavior for `f` (see
+  //    each feature's comment for more)
+  //
+  // `cache_usage_options` currently supports specifying options for the
+  // following features:
+  //
+  // 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
+  // Memory charging is a feature of accounting memory usage of a specific area
+  // (represented by `CacheEntryRole`) toward usage in block cache (if
+  // available), by updating a dynamic charge to the block cache loosely based
+  // on the actual memory usage of that area.
+  //
+  // (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  // (i) If kEnabled:
+  //     Charge memory usage of the buffered data used as training samples for
+  //     dictionary compression.
+  //     If such memory usage exceeds the available space left in the block
+  //     cache at some point (i.e., causing a cache full under
+  //     `LRUCacheOptions::strict_capacity_limit` = true), the data will then
+  //     be unbuffered.
+  // (ii) If kDisabled:
+  //      Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  //       Same as kEnabled.
+  //
+  // (b) CacheEntryRole::kFilterConstruction
+  // (i) If kEnabled:
+  //     Charge memory usage of Bloom Filter
+  //     (format_version >= 5) and Ribbon Filter construction.
+  //     If the additional temporary memory of Ribbon Filter exceeds the
+  //     available space left in the block cache at some point (i.e., causing
+  //     a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
+  //     construction will fall back to Bloom Filter.
+  // (ii) If kDisabled:
+  //      Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  //       Same as kDisabled.
+  //
+  // (c) CacheEntryRole::kBlockBasedTableReader
+  // (i) If kEnabled:
+  //     Charge memory usage of table properties +
+  //     index block/filter block/uncompression dictionary (when stored in
+  //     the table reader, i.e.,
+  //     BlockBasedTableOptions::cache_index_and_filter_blocks == false) +
+  //     some internal data structures during table reader creation.
+  //     If such a table reader exceeds
+  //     the available space left in the block cache at some point (i.e.,
+  //     causing a cache full under `LRUCacheOptions::strict_capacity_limit` =
+  //     true), creation will fail with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  //      Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  //       Same as kDisabled.
+  //
+  // (d) CacheEntryRole::kFileMetadata
+  // (i) If kEnabled:
+  //     Charge memory usage of file metadata. RocksDB holds one file metadata
+  //     structure in-memory per on-disk table file.
+  //     If such file metadata's
+  //     memory exceeds the available space left in the block cache at some
+  //     point (i.e., causing a cache full under
+  //     `LRUCacheOptions::strict_capacity_limit` = true), creation will fail
+  //     with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  //      Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  //       Same as kDisabled.
+  //
+  // (e) Other CacheEntryRole
+  //     Not supported.
+  //     `Status::kNotSupported` will be returned if
+  //     `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
+  //
+  //
+  // 2. More to come ...
+  //
+  CacheUsageOptions cache_usage_options;
+
+  // Note: currently this option requires kTwoLevelIndexSearch to be set as
+  // well.
+  // TODO(myabandeh): remove the note above once the limitation is lifted
+  // Use partitioned full filters for each SST file. This option is
+  // incompatible with block-based filters. Filter partition blocks use
+  // block cache even when cache_index_and_filter_blocks=false.
+  bool partition_filters = false;
+
+  // Option to generate Bloom/Ribbon filters that minimize memory
+  // internal fragmentation.
+  //
+  // When false, malloc_usable_size is not available, or format_version < 5,
+  // filters are generated without regard to internal fragmentation when
+  // loaded into memory (historical behavior). When true (and
+  // malloc_usable_size is available and format_version >= 5), then
+  // filters are generated to "round up" and "round down" their sizes to
+  // minimize internal fragmentation when loaded into memory, assuming the
+  // reading DB has the same memory allocation characteristics as the
+  // generating DB. This option does not break forward or backward
+  // compatibility.
+  //
+  // While individual filters will vary in bits/key and false positive rate
+  // when the setting is true, the implementation attempts to maintain a
+  // weighted average FP rate for filters consistent with this option set to
+  // false.
+  //
+  // With Jemalloc for example, this setting is expected to save about 10% of
+  // the memory footprint and block cache charge of filters, while increasing
+  // disk usage of filters by about 1-2% due to encoding efficiency losses
+  // with variance in bits/key.
+  //
+  // NOTE: Because some memory counted by block cache might be unmapped pages
+  // within internal fragmentation, this option can increase observed RSS
+  // memory usage. With cache_index_and_filter_blocks=true, this option makes
+  // the block cache better at using the space it is allowed. (These issues
+  // should not arise with partitioned filters.)
+  //
+  // NOTE: Do not set to true if you do not trust malloc_usable_size. With
+  // this option, RocksDB might access an allocated memory object beyond its
+  // original size if malloc_usable_size says it is safe to do so. While this
+  // can be considered bad practice, it should not produce undefined behavior
+  // unless malloc_usable_size is buggy or broken.
+  bool optimize_filters_for_memory = false;
+
+  // Use delta encoding to compress keys in blocks.
+  // ReadOptions::pin_data requires this option to be disabled.
+  //
+  // Default: true
+  bool use_delta_encoding = true;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
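+  //
+  // For illustration (an editorial sketch, not part of the original header):
+  // a common configuration is a Bloom filter at about 10 bits per key,
+  // using NewBloomFilterPolicy() from rocksdb/filter_policy.h:
+  //
+  //   BlockBasedTableOptions bbto;
+  //   bbto.filter_policy.reset(NewBloomFilterPolicy(10));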
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  bool whole_key_filtering = true;
+
+  // If true, detect corruption during Bloom Filter (format_version >= 5)
+  // and Ribbon Filter construction.
+  //
+  // This is an extra check that is only
+  // useful in detecting software bugs or CPU+memory malfunction.
+  // Turning on this feature increases filter construction time by 30%.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{detect_filter_construct_corruption=true;}"}});
+  //
+  // TODO: optimize this performance
+  bool detect_filter_construct_corruption = false;
+
+  // Verify that decompressing the compressed block gives back the input. This
+  // is a verification mode that we use to detect bugs in compression
+  // algorithms.
+  bool verify_compression = false;
+
+  // If used, for every data block we load into memory, we will create a bitmap
+  // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
+  // will be used to figure out the percentage we actually read of the blocks.
+  //
+  // When this feature is used, Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
+  // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
+  // read amplification using this formula
+  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+  //
+  // value  =>  memory usage (percentage of loaded blocks memory)
+  // 1      =>  12.50 %
+  // 2      =>  06.25 %
+  // 4      =>  03.12 %
+  // 8      =>  01.56 %
+  // 16     =>  00.78 %
+  //
+  // Note: This number must be a power of 2, if not it will be sanitized
+  // to be the next lowest power of 2, for example a value of 7 will be
+  // treated as 4, a value of 19 will be treated as 16.
+  //
+  // Default: 0 (disabled)
+  uint32_t read_amp_bytes_per_bit = 0;
+
+  // We currently have these versions:
+  // 0 -- This version can be read by really old RocksDB versions. Doesn't
+  // support changing the checksum type (default is CRC32).
+  // 1 -- Can be read by RocksDB versions since 3.0. Supports non-default
+  // checksum, like xxHash. It is written by RocksDB when
+  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+  // 0 is silently upconverted)
+  // 2 -- Can be read by RocksDB versions since 3.10. Changes the way we
+  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
+  // don't plan to run RocksDB before version 3.10, you should probably use
+  // this.
+  // 3 -- Can be read by RocksDB versions since 5.15. Changes the way we
+  // encode the keys in index blocks. If you don't plan to run RocksDB before
+  // version 5.15, you should probably use this.
+  // This option only affects newly written tables. When reading existing
+  // tables, the information about version is read from the footer.
+  // 4 -- Can be read by RocksDB versions since 5.16. Changes the way we
+  // encode the values in index blocks. If you don't plan to run RocksDB
+  // before version 5.16 and you are using index_block_restart_interval > 1,
+  // you should probably use this as it would reduce the index size.
+  // This option only affects newly written tables. When reading existing
+  // tables, the information about version is read from the footer.
+  // 5 -- Can be read by RocksDB versions since 6.6.0. Full and partitioned
+  // filters use a generally faster and more accurate Bloom filter
+  // implementation, with a different schema.
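+  //
+  // For illustration (an editorial sketch, not part of the original header):
+  // when every reader in a deployment runs RocksDB >= 5.16, opting into the
+  // newer index encoding explicitly could look like:
+  //
+  //   BlockBasedTableOptions bbto;
+  //   bbto.format_version = 4;  // readable by RocksDB 5.16 and newer
+  //   bbto.index_block_restart_interval = 16;  // version 4 shrinks such
+  //                                            // indexes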
+  uint32_t format_version = 5;
+
+  // Store index blocks on disk in compressed format. Changing this option to
+  // false will avoid the overhead of decompression if index blocks are evicted
+  // and read back.
+  bool enable_index_compression = true;
+
+  // Align data blocks on the lesser of page size and block size.
+  bool block_align = false;
+
+  // This enum allows trading off increased index size for improved iterator
+  // seek performance in some situations, particularly when block cache is
+  // disabled (ReadOptions::fill_cache = false) and direct IO is
+  // enabled (DBOptions::use_direct_reads = true).
+  // The default mode is the best tradeoff for most use cases.
+  // This option only affects newly written tables.
+  //
+  // The index contains a key separating each pair of consecutive blocks.
+  // Let A be the highest key in one block, B the lowest key in the next block,
+  // and I the index entry separating these two blocks:
+  // [ ... A] I [B ...]
+  // I is allowed to be anywhere in [A, B).
+  // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
+  // first block, then immediately fall through to the second block.
+  // However, if I=A, this can't happen, and we'll read only the second block.
+  // In kNoShortening mode, we use I=A. In other modes, we use the shortest
+  // key in [A, B), which usually significantly reduces index size.
+  //
+  // There's a similar story for the last index entry, which is an upper bound
+  // of the highest key in the file. If it's shortened and therefore
+  // overestimated, the iterator is likely to unnecessarily read the last data
+  // block from each file on each seek.
+  enum class IndexShorteningMode : char {
+    // Use full keys.
+    kNoShortening,
+    // Shorten index keys between blocks, but use full key for the last index
+    // key, which is the upper bound of the whole file.
+    kShortenSeparators,
+    // Shorten both keys between blocks and key after last block.
+    kShortenSeparatorsAndSuccessor,
+  };
+
+  IndexShorteningMode index_shortening =
+      IndexShorteningMode::kShortenSeparators;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if the user doesn't provide readahead_size. The readahead
+  // starts at BlockBasedTableOptions.initial_auto_readahead_size (default:
+  // 8 KB) and doubles on every additional read, up to max_auto_readahead_size,
+  // which can be configured.
+  //
+  // Special value: 0 - if max_auto_readahead_size is set to 0, it will disable
+  // the implicit auto prefetching.
+  // If the max_auto_readahead_size provided is less than
+  // initial_auto_readahead_size, then RocksDB will sanitize
+  // initial_auto_readahead_size and set it to max_auto_readahead_size.
+  //
+  // The value should be provided in bytes, e.g. 256 * 1024 for 256 KB of
+  // prefetched blocks.
+  //
+  // Experiments found that a 256 KB readahead size provides the best
+  // performance for auto readahead. The experiment data is in PR #3282.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{max_auto_readahead_size=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 256 KB (256 * 1024).
+  size_t max_auto_readahead_size = 256 * 1024;
+
+  // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index
+  // and filter blocks) which are already in memory into block cache at the
+  // time of flush. On a flush, the blocks that are in memory (in memtables)
+  // get flushed to the device. If using Direct IO, additional IO is incurred
+  // to read this data back into memory again, which is avoided by enabling
+  // this option. This further helps if the workload exhibits high temporal
+  // locality, where most of the reads go to recently written data. This also
+  // helps in the case of distributed file systems.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{prepopulate_block_cache=kFlushOnly;}"}}));
+  enum class PrepopulateBlockCache : char {
+    // Disable prepopulate block cache.
+    kDisable,
+    // Prepopulate blocks during flush only.
+    kFlushOnly,
+  };
+
+  PrepopulateBlockCache prepopulate_block_cache =
+      PrepopulateBlockCache::kDisable;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if the user doesn't provide readahead_size. The readahead
+  // size starts at initial_auto_readahead_size and doubles on every additional
+  // read, up to BlockBasedTableOptions.max_auto_readahead_size, which can also
+  // be configured.
+  //
+  // Scenarios:
+  // - If initial_auto_readahead_size is set to 0, it will disable the
+  //   implicit auto prefetching irrespective of max_auto_readahead_size.
+  // - If max_auto_readahead_size is set to 0, it will disable the internal
+  //   prefetching irrespective of initial_auto_readahead_size.
+  // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB
+  //   will sanitize the value of initial_auto_readahead_size to
+  //   max_auto_readahead_size and readahead_size will be
+  //   max_auto_readahead_size.
+  //
+  // The value should be provided in bytes, e.g. 8 * 1024 for 8 KB of
+  // prefetched blocks.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{initial_auto_readahead_size=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 8 KB (8 * 1024).
+  size_t initial_auto_readahead_size = 8 * 1024;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if the user doesn't provide readahead_size and reads are
+  // sequential.
+  // num_file_reads_for_auto_readahead indicates after how many
+  // sequential reads internal auto prefetching should start.
+  //
+  // For example, if the value is 2, then after reading 2 sequential data
+  // blocks, prefetching will start on the third data block.
+  // If set to 0, prefetching starts from the first read.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{num_file_reads_for_auto_readahead=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 2
+  uint64_t num_file_reads_for_auto_readahead = 2;
+};
+
+// Table Properties that are specific to block-based table properties.
+struct BlockBasedTablePropertyNames {
+  // value of this property is a fixed int32 number.
+  static const std::string kIndexType;
+  // value is "1" for true and "0" for false.
+  static const std::string kWholeKeyFiltering;
+  // value is "1" for true and "0" for false.
+  static const std::string kPrefixFiltering;
+};
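+
+// For illustration (an editorial sketch, not part of the original header):
+// a customized BlockBasedTableOptions is typically wired into the DB options
+// through NewBlockBasedTableFactory() below; the sizes here are arbitrary
+// example values.
+//
+//   BlockBasedTableOptions bbto;
+//   bbto.block_size = 16 * 1024;
+//   bbto.block_cache = NewLRUCache(512 << 20);  // 512 MB shared block cache
+//   Options options;
+//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));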
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+
+enum EncodingType : char {
+  // Always write full keys without any special encoding.
+  kPlain,
+  // Find opportunity to write the same prefix once for multiple rows.
+  // In some cases, when a key follows a previous key with the same prefix,
+  // instead of writing out the full key, it just writes out the size of the
+  // shared prefix, as well as other bytes, to save some bytes.
+  //
+  // When using this option, the user is required to use the same prefix
+  // extractor to make sure the same prefix will be extracted from the same
+  // key. The Name() value of the prefix extractor will be stored in the file.
+  // When reopening the file, the name of the options.prefix_extractor given
+  // will be bitwise compared to the prefix extractors stored in the file. An
+  // error will be returned if the two don't match.
+  kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+  static const std::string kEncodingType;
+  static const std::string kBloomVersion;
+  static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+  static const char* kName() { return "PlainTableOptions"; };
+  // @user_key_len: plain table has an optimization for fixed-size keys, which
+  //                can be specified via user_key_len. Alternatively, you can
+  //                pass `kPlainTableVariableLength` if your keys have variable
+  //                lengths.
+  uint32_t user_key_len = kPlainTableVariableLength;
+
+  // @bloom_bits_per_key: the number of bits used for the bloom filter per
+  //                      prefix. You may disable it by passing a zero.
+  int bloom_bits_per_key = 10;
+
+  // @hash_table_ratio: the desired utilization of the hash table used for
+  //                    prefix hashing.
+  //                    hash_table_ratio = number of prefixes / #buckets in the
+  //                    hash table
+  double hash_table_ratio = 0.75;
+
+  // @index_sparseness: inside each prefix, one index record is built for this
+  //                    many keys, for binary search inside each hash bucket.
+  //                    For encoding type kPrefix, the value will be used when
+  //                    writing to determine an interval to rewrite the full
+  //                    key. It will also be used as a suggestion and satisfied
+  //                    when possible.
+  size_t index_sparseness = 16;
+
+  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+  //                      Otherwise from huge page TLB. The user needs to
+  //                      reserve huge pages for it to be allocated, like:
+  //                      sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
+  size_t huge_page_tlb_size = 0;
+
+  // @encoding_type: how to encode the keys. See enum EncodingType above for
+  //                 the choices. The value will determine how to encode keys
+  //                 when writing to a new SST file. This value will be stored
+  //                 inside the SST file which will be used when reading from
+  //                 the file, which makes it possible for users to choose
+  //                 different encoding type when reopening a DB. Files with
+  //                 different encoding types can co-exist in the same DB and
+  //                 can be read.
+  EncodingType encoding_type = kPlain;
+
+  // @full_scan_mode: mode for reading the whole file one record at a time
+  //                  without using the index.
+  bool full_scan_mode = false;
+
+  // @store_index_in_file: compute plain table index and bloom filter during
+  //                       file building and store it in file. When reading the
+  //                       file, the index will be memory-mapped instead of
+  //                       recomputed.
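+  //
+  // For illustration (an editorial sketch, not part of the original header):
+  // a plain-table setup for fixed 16-byte keys with an 8-byte prefix
+  // extractor might look like the following; the concrete sizes are arbitrary
+  // example values.
+  //
+  //   PlainTableOptions pto;
+  //   pto.user_key_len = 16;
+  //   pto.encoding_type = kPrefix;
+  //   pto.store_index_in_file = true;
+  //   Options options;
+  //   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  //   options.table_factory.reset(NewPlainTableFactory(pto));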
+  bool store_index_in_file = false;
+};
+
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extractor properly to make
+// it work. Lookup starts with a prefix hash lookup for the key prefix. Inside
+// the hash bucket found, a binary search is executed for hash conflicts.
+// Finally, a linear search is used.
+
+extern TableFactory* NewPlainTableFactory(
+    const PlainTableOptions& options = PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+  // The key that is used to fill empty buckets.
+  static const std::string kEmptyKey;
+  // Fixed length of value.
+  static const std::string kValueLength;
+  // Number of hash functions used in Cuckoo Hash.
+  static const std::string kNumHashFunc;
+  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+  // where the starting bucket id is given by the hash function on the key. In
+  // case of a collision during inserting the key, the builder tries to insert
+  // the key in other locations of the cuckoo block before using the next hash
+  // function. This reduces cache misses during read operations in case of
+  // collision.
+  static const std::string kCuckooBlockSize;
+  // Size of the hash table. Use this number to compute the modulo of the hash
+  // function. The actual number of buckets will be kMaxHashTableSize +
+  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+  // accommodate the Cuckoo Block from the end of hash table, due to the cache
+  // friendly implementation.
+  static const std::string kHashTableSize;
+  // Denotes whether the keys sorted in the file are internal keys (if false)
+  // or user keys only (if true).
+  static const std::string kIsLastLevel;
+  // Indicates whether the identity function is used for the first hash
+  // function.
+  static const std::string kIdentityAsFirstHash;
+  // Indicates whether modulo or bitwise AND is used to calculate the hash
+  // value.
+  static const std::string kUseModuleHash;
+  // Fixed user key length.
+  static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+  static const char* kName() { return "CuckooTableOptions"; };
+
+  // Determines the utilization of hash tables. Smaller values
+  // result in larger hash tables with fewer collisions.
+  double hash_table_ratio = 0.9;
+  // A property used by the builder to determine how deep to search for a
+  // path to displace elements in case of collision. See the
+  // Builder.MakeSpaceForKey method. Higher values result in more efficient
+  // hash tables with fewer lookups but take more time to build.
+  uint32_t max_search_depth = 100;
+  // In case of collision while inserting, the builder
+  // attempts to insert in the next cuckoo_block_size
+  // locations before skipping over to the next Cuckoo hash
+  // function. This makes lookups more cache friendly in case
+  // of collisions.
+  uint32_t cuckoo_block_size = 5;
+  // If this option is enabled, the user key is treated as uint64_t and its
+  // value is used as the hash value directly. This option changes the
+  // builder's behavior. Readers ignore this option and behave according to
+  // what is specified in the table property.
+  bool identity_as_first_hash = false;
+  // If this option is set to true, modulo is used during hash calculation.
+  // This often yields better space efficiency at the cost of performance.
+  // If this option is set to false, the number of entries in the table is
+  // constrained to be a power of two, and a bitwise AND is used to calculate
+  // the hash, which is faster in general.
+  bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo
+// Hashing
+extern TableFactory* NewCuckooTableFactory(
+    const CuckooTableOptions& table_options = CuckooTableOptions());
+
+#endif  // ROCKSDB_LITE
+
+class RandomAccessFileReader;
+
+// A base class for table factories.
+class TableFactory : public Customizable {
+ public:
+  virtual ~TableFactory() override {}
+
+  static const char* kBlockCacheOpts() { return "BlockCache"; };
+  static const char* kBlockBasedTableName() { return "BlockBasedTable"; };
+  static const char* kPlainTableName() { return "PlainTable"; }
+  static const char* kCuckooTableName() { return "CuckooTable"; };
+
+  // Creates and configures a new TableFactory from the input options and id.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<TableFactory>* factory);
+
+  static const char* Type() { return "TableFactory"; }
+
+  // Returns a table reader object that can fetch data from the file specified
+  // in parameter file. It's the caller's responsibility to make sure the file
+  // is in the correct format.
+  //
+  // NewTableReader() is called in three places:
+  // (1) TableCache::FindTable() calls the function on a table cache miss and
+  //     caches the table object returned.
+  // (2) SstFileDumper (for SST Dump) opens the table and dumps the table
+  //     contents using the iterator of the table.
+  // (3) DBImpl::IngestExternalFile() calls this function to read the contents
+  //     of the sst file it's attempting to add
+  //
+  // table_reader_options is a TableReaderOptions which contains all the
+  //    needed parameters and configuration to open the table.
+  // file is a file handle for the table file.
+  // file_size is the physical file size of the file.
+  // table_reader is the output table reader.
+  virtual Status NewTableReader(
+      const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const {
+    ReadOptions ro;
+    return NewTableReader(ro, table_reader_options, std::move(file), file_size,
+                          table_reader, prefetch_index_and_filter_in_cache);
+  }
+
+  // Overload of the above function that allows the caller to pass in a
+  // ReadOptions
+  virtual Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // Multiple configured objects can be accessed from there, including but not
+  // limited to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
+  virtual TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const = 0;
+
+  // Returns whether delete range is supported.
+  virtual bool IsDeleteRangeSupported() const { return false; }
+};
+
+#ifndef ROCKSDB_LITE
+// Create a special table factory that can open either of the supported
+// table formats, based on setting inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory: block based table factory to use. If NULL, use
+//                             a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default
+//                       one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
+//                        one.
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
new file mode 100644
index 000000000..cbe87fa3a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -0,0 +1,327 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have user-collected
+// properties.
+// The values of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+//   UserCollectedProperties props = ...;
+//   for (auto pos = props.lower_bound(prefix);
+//        pos != props.end() &&
+//        pos->first.compare(0, prefix.size(), prefix) == 0;
+//        ++pos) {
+//     ...
+//   }
+using UserCollectedProperties = std::map<std::string, std::string>;
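+
+// For illustration (an editorial sketch, not part of the original header; it
+// uses the TablePropertiesCollector interface declared below): a collector
+// that counts keys carrying a hypothetical "user_" prefix might look like:
+//
+//   class PrefixCountCollector : public TablePropertiesCollector {
+//    public:
+//     Status AddUserKey(const Slice& key, const Slice& /*value*/,
+//                       EntryType /*type*/, SequenceNumber /*seq*/,
+//                       uint64_t /*file_size*/) override {
+//       if (key.starts_with("user_")) {
+//         ++count_;
+//       }
+//       return Status::OK();
+//     }
+//     Status Finish(UserCollectedProperties* props) override {
+//       props->insert({"example.prefix-count", std::to_string(count_)});
+//       return Status::OK();
+//     }
+//     UserCollectedProperties GetReadableProperties() const override {
+//       return {{"example.prefix-count", std::to_string(count_)}};
+//     }
+//     const char* Name() const override { return "PrefixCountCollector"; }
+//
+//    private:
+//     uint64_t count_ = 0;
+//   };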
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDbId;
+  static const std::string kDbSessionId;
+  static const std::string kDbHostId;
+  static const std::string kOriginalFileNumber;
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kIndexPartitions;
+  static const std::string kTopLevelIndexSize;
+  static const std::string kIndexKeyIsUserKey;
+  static const std::string kIndexValueIsDeltaEncoded;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kNumFilterEntries;
+  static const std::string kDeletedKeys;
+  static const std::string kMergeOperands;
+  static const std::string kNumRangeDeletions;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+  static const std::string kColumnFamilyName;
+  static const std::string kColumnFamilyId;
+  static const std::string kComparator;
+  static const std::string kMergeOperator;
+  static const std::string kPrefixExtractorName;
+  static const std::string kPropertyCollectors;
+  static const std::string kCompression;
+  static const std::string kCompressionOptions;
+  static const std::string kCreationTime;
+  static const std::string kOldestKeyTime;
+  static const std::string kFileCreationTime;
+  static const std::string kSlowCompressionEstimatedDataSize;
+  static const std::string kFastCompressionEstimatedDataSize;
+  static const std::string kSequenceNumberTimeMapping;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The
+// methods don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially.
+//
+// Statuses from these callbacks are currently logged when not OK, but
+// otherwise ignored by RocksDB.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() {}
+
+  // DEPRECATED: User-defined collectors should implement AddUserKey(), though
+  // this old function still works for backward compatibility.
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
+    return Status::InvalidArgument(
+        "TablePropertiesCollector::Add() deprecated.");
+  }
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType /*type*/, SequenceNumber /*seq*/,
+                            uint64_t /*file_size*/) {
+    // For backwards-compatibility.
+    return Add(key, value);
+  }
+
+  // Called after each new block is cut.
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) {
+    // Nothing to do here. Subclasses can override.
+    return;
+  }
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  //                     `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  // The name of the properties collector can be used for debugging purposes.
+  virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL: Return whether the output file should be further compacted.
+  virtual bool NeedCompact() const { return false; }
+};
+
+// Constructs TablePropertiesCollector instances. Internally, a new
+// TablePropertiesCollector is created for each new table.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollectorFactory : public Customizable {
+ public:
+  struct Context {
+    uint32_t column_family_id;
+    // The level at creating the SST file (i.e., table), of which the
+    // properties are being collected.
+    int level_at_creation = kUnknownLevelAtCreation;
+    static const uint32_t kUnknownColumnFamily;
+    static const int kUnknownLevelAtCreation = -1;
+  };
+
+  ~TablePropertiesCollectorFactory() override {}
+  static const char* Type() { return "TablePropertiesCollectorFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<TablePropertiesCollectorFactory>* result);
+
+  // Has to be thread-safe.
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) = 0;
+
+  // The name of the properties collector can be used for debugging purposes.
+  const char* Name() const override = 0;
+
+  // Can be overridden by sub-classes to return the Name, followed by
+  // configuration info that will be logged to the info log when the
+  // DB is opened.
+  virtual std::string ToString() const { return Name(); }
+};
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the file number at creation time, or 0 for unknown. When known,
+  // combining with db_session_id must uniquely identify an SST file.
+  uint64_t orig_file_number = 0;
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // Total number of index partitions if kTwoLevelIndexSearch is used
+  uint64_t index_partitions = 0;
+  // Size of the top-level index if kTwoLevelIndexSearch is used
+  uint64_t top_level_index_size = 0;
+  // Whether the index key is user key. Otherwise it includes 8 bytes of
+  // sequence number added by the internal key format.
+  uint64_t index_key_is_user_key = 0;
+  // Whether delta encoding is used to encode the index values.
+  uint64_t index_value_is_delta_encoded = 0;
+  // the size of filter block.
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the file number at creation time, or 0 for unknown. When known,
+  // combining with db_session_id must uniquely identify an SST file.
+  uint64_t orig_file_number = 0;
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // Total number of index partitions if kTwoLevelIndexSearch is used
+  uint64_t index_partitions = 0;
+  // Size of the top-level index if kTwoLevelIndexSearch is used
+  uint64_t top_level_index_size = 0;
+  // Whether the index key is user key. Otherwise it includes 8 bytes of
+  // sequence number added by internal key format.
+  uint64_t index_key_is_user_key = 0;
+  // Whether delta encoding is used to encode the index values.
+  uint64_t index_value_is_delta_encoded = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw (uncompressed, undelineated) key size
+  uint64_t raw_key_size = 0;
+  // total raw (uncompressed, undelineated) value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+  // the number of unique entries (keys or prefixes) added to filters
+  uint64_t num_filter_entries = 0;
+  // the number of deletions in the table
+  uint64_t num_deletions = 0;
+  // the number of merge operands in the table
+  uint64_t num_merge_operands = 0;
+  // the number of range deletions in this table
+  uint64_t num_range_deletions = 0;
+  // format version, reserved for backward compatibility
+  uint64_t format_version = 0;
+  // If 0, key is variable length. Otherwise number of bytes for each key.
+  uint64_t fixed_key_len = 0;
+  // ID of column family for this SST file, corresponding to the CF identified
+  // by column_family_name.
+  uint64_t column_family_id = ROCKSDB_NAMESPACE::
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  // Timestamp of the latest key. 0 means unknown.
+  // TODO(sagar0): Should be changed to latest_key_time ... but don't know the
+  // full implications of backward compatibility. Hence retaining for now.
+  uint64_t creation_time = 0;
+
+  // Timestamp of the earliest key. 0 means unknown.
+  uint64_t oldest_key_time = 0;
+  // Actual SST file creation time. 0 means unknown.
+  uint64_t file_creation_time = 0;
+  // Estimated size of data blocks if compressed using a relatively slower
+  // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+  // 0 means unknown.
+  uint64_t slow_compression_estimated_data_size = 0;
+  // Estimated size of data blocks if compressed using a relatively faster
+  // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+  // 0 means unknown.
+  uint64_t fast_compression_estimated_data_size = 0;
+  // Offset of the value of the property "external sst file global seqno" in
+  // the file, if the property exists.
+  // 0 means it does not exist.
+  uint64_t external_sst_file_global_seqno_offset = 0;
+
+  // DB identity
+  // db_id is an identifier generated the first time the DB is created.
+  // If DB identity is unset or unassigned, `db_id` will be an empty string.
+  std::string db_id;
+
+  // DB session identity
+  // db_session_id is an identifier that gets reset every time the DB is
+  // opened. If DB session identity is unset or unassigned, `db_session_id`
+  // will be an empty string.
+  std::string db_session_id;
+
+  // Location of the machine hosting the DB instance
+  // db_host_id identifies the location of the host in some form
+  // (hostname by default, but can also be any string of the user's choosing).
+  // It can potentially change whenever the DB is opened.
+  std::string db_host_id;
+
+  // Name of the column family with which this SST file is associated.
+  // If column family is unknown, `column_family_name` will be an empty string.
+  std::string column_family_name;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // The name of the comparator used in this table.
+  std::string comparator_name;
+
+  // The name of the merge operator used in this table.
+  // If no merge operator is used, `merge_operator_name` will be "nullptr".
+  std::string merge_operator_name;
+
+  // The name of the prefix extractor used in this table.
+  // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr".
+  std::string prefix_extractor_name;
+
+  // The names of the property collector factories used in this table,
+  // separated by commas:
+  //   {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
+  std::string property_collectors_names;
+
+  // The compression algo used to compress the SST files.
+  std::string compression_name;
+
+  // Compression options used to compress the SST files.
+  std::string compression_options;
+
+  // Sequence number to time mapping, delta encoded.
+  std::string seqno_to_time_mapping;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+  UserCollectedProperties readable_properties;
+
+  // convert this object to a human-readable form
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(const std::string& prop_delim = "; ",
+                       const std::string& kv_delim = "=") const;
+
+  // Aggregate the numerical member variables of the specified
+  // TableProperties.
+  void Add(const TableProperties& tp);
+
+  // Subset of properties that make sense when added together
+  // between tables. Keys match field names in this class instead
+  // of using full property names.
+  std::map<std::string, uint64_t> GetAggregatablePropertiesAsMap() const;
+
+  // Return the approximated memory usage of this TableProperties object,
+  // including memory used by the string properties and UserCollectedProperties
+  std::size_t ApproximateMemoryUsage() const;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by the database
+// itself, especially some properties regarding the internal keys (which are
+// unknown to `table`).
+//
+// DEPRECATED: these properties now belong as TableProperties members. Please
+// use TableProperties::num_deletions and TableProperties::num_merge_operands,
+// respectively.
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+extern uint64_t GetMergeOperands(const UserCollectedProperties& props,
+                                 bool* property_present);
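+
+// Usage sketch (editorial, not part of the original header): collected
+// properties can be inspected per SST file, for example via
+// DB::GetPropertiesOfAllTables().
+//
+//    TablePropertiesCollection props;
+//    Status s = db->GetPropertiesOfAllTables(&props);
+//    if (s.ok()) {
+//      for (const auto& entry : props) {
+//        // entry.first is the SST file name; entry.second points to its
+//        // TableProperties.
+//        printf("%s: %s\n", entry.first.c_str(),
+//               entry.second->ToString().c_str());
+//      }
+//    }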
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_reader_caller.h b/src/rocksdb/include/rocksdb/table_reader_caller.h
new file mode 100644
index 000000000..10ec08130
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_reader_caller.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or the table reader is called in a test environment, e.g., unit test, table
+// reader benchmark, etc.
+enum TableReaderCaller : char {
+  kUserGet = 1,
+  kUserMultiGet = 2,
+  kUserIterator = 3,
+  kUserApproximateSize = 4,
+  kUserVerifyChecksum = 5,
+  kSSTDumpTool = 6,
+  kExternalSSTIngestion = 7,
+  kRepair = 8,
+  kPrefetch = 9,
+  kCompaction = 10,
+  // A compaction job may refill the block cache with blocks in the new SST
+  // files if paranoid_file_checks is true.
+  kCompactionRefill = 11,
+  // After building a table, it may load all its blocks into the block cache
+  // if paranoid_file_checks is true.
+  kFlush = 12,
+  // sst_file_reader.
+  kSSTFileReader = 13,
+  // A list of callers that are either not interesting for analysis or are
+  // calling from a test environment, e.g., unit test, benchmark, etc.
+  kUncategorized = 14,
+  // All callers should be added before kMaxBlockCacheLookupCaller.
+  kMaxBlockCacheLookupCaller
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
new file mode 100644
index 000000000..1b5f8c046
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for exposing run-time status of any
+// rocksdb-related thread. Such run-time status can be obtained via the
+// GetThreadList() API.
+//
+// Note that all thread-status features are still under development, and
+// thus APIs and class definitions might be subject to change at this point.
+// Will remove this comment once the APIs have been finalized.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS)
+#define ROCKSDB_USING_THREAD_STATUS
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(yhchiang): remove this function once c++14 is available
+// as std::max will be able to cover this.
+// Current MS compiler does not support constexpr
+template <int A, int B>
+struct constexpr_max {
+  static const int result = (A > B) ? A : B;
+};
+
+// A structure that describes the current status of a thread.
+// The status of active threads can be fetched using
+// ROCKSDB_NAMESPACE::GetThreadList().
+struct ThreadStatus {
+  // The type of a thread.
+  enum ThreadType : int {
+    HIGH_PRIORITY = 0,  // RocksDB BG thread in high-pri thread pool
+    LOW_PRIORITY,       // RocksDB BG thread in low-pri thread pool
+    USER,               // User thread (Non-RocksDB BG thread)
+    BOTTOM_PRIORITY,    // RocksDB BG thread in bottom-pri thread pool
+    NUM_THREAD_TYPES
+  };
+
+  // The type used to refer to a thread operation.
+  // A thread operation describes high-level action of a thread.
+  // Examples include compaction and flush.
+  enum OperationType : int {
+    OP_UNKNOWN = 0,
+    OP_COMPACTION,
+    OP_FLUSH,
+    NUM_OP_TYPES
+  };
+
+  enum OperationStage : int {
+    STAGE_UNKNOWN = 0,
+    STAGE_FLUSH_RUN,
+    STAGE_FLUSH_WRITE_L0,
+    STAGE_COMPACTION_PREPARE,
+    STAGE_COMPACTION_RUN,
+    STAGE_COMPACTION_PROCESS_KV,
+    STAGE_COMPACTION_INSTALL,
+    STAGE_COMPACTION_SYNC_FILE,
+    STAGE_PICK_MEMTABLES_TO_FLUSH,
+    STAGE_MEMTABLE_ROLLBACK,
+    STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+    NUM_OP_STAGES
+  };
+
+  enum CompactionPropertyType : int {
+    COMPACTION_JOB_ID = 0,
+    COMPACTION_INPUT_OUTPUT_LEVEL,
+    COMPACTION_PROP_FLAGS,
+    COMPACTION_TOTAL_INPUT_BYTES,
+    COMPACTION_BYTES_READ,
+    COMPACTION_BYTES_WRITTEN,
+    NUM_COMPACTION_PROPERTIES
+  };
+
+  enum FlushPropertyType : int {
+    FLUSH_JOB_ID = 0,
+    FLUSH_BYTES_MEMTABLES,
+    FLUSH_BYTES_WRITTEN,
+    NUM_FLUSH_PROPERTIES
+  };
+
+  // The maximum number of properties of an operation.
+  // This number should be set to the biggest NUM_XXX_PROPERTIES.
+  static const int kNumOperationProperties =
+      constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+
+  // The type used to refer to a thread state.
+  // A state describes a lower-level action of a thread
+  // such as reading / writing a file or waiting for a mutex.
+  enum StateType : int {
+    STATE_UNKNOWN = 0,
+    STATE_MUTEX_WAIT = 1,
+    NUM_STATE_TYPES
+  };
+
+  ThreadStatus(const uint64_t _id, const ThreadType _thread_type,
+               const std::string& _db_name, const std::string& _cf_name,
+               const OperationType _operation_type,
+               const uint64_t _op_elapsed_micros,
+               const OperationStage _operation_stage,
+               const uint64_t _op_props[], const StateType _state_type)
+      : thread_id(_id),
+        thread_type(_thread_type),
+        db_name(_db_name),
+        cf_name(_cf_name),
+        operation_type(_operation_type),
+        op_elapsed_micros(_op_elapsed_micros),
+        operation_stage(_operation_stage),
+        state_type(_state_type) {
+    for (int i = 0; i < kNumOperationProperties; ++i) {
+      op_properties[i] = _op_props[i];
+    }
+  }
+
+  // A unique ID for the thread.
+  const uint64_t thread_id;
+
+  // The type of the thread: HIGH_PRIORITY, LOW_PRIORITY, USER, or
+  // BOTTOM_PRIORITY.
+  const ThreadType thread_type;
+
+  // The name of the DB instance that the thread is currently involved
+  // with. It is set to an empty string if the thread is not involved
+  // in any DB operation.
+  const std::string db_name;
+
+  // The name of the column family that the thread is currently involved
+  // with. It is set to an empty string if the thread is not involved
+  // in any column family.
+  const std::string cf_name;
+
+  // The operation (high-level action) that the current thread is involved in.
+  const OperationType operation_type;
+
+  // The elapsed time of the current thread operation in microseconds.
+  const uint64_t op_elapsed_micros;
+
+  // An integer showing the current stage of the thread within the
+  // current operation.
+  const OperationStage operation_stage;
+
+  // A list of properties that describe some details about the current
+  // operation. The same field in op_properties[] might have different
+  // meanings for different operations.
+  uint64_t op_properties[kNumOperationProperties];
+
+  // The state (lower-level action) that the current thread is involved in.
+  const StateType state_type;
+
+  // The following are utility functions for interpreting the
+  // information in a ThreadStatus.
+
+  static std::string GetThreadTypeName(ThreadType thread_type);
+
+  // Obtain the name of an operation given its type.
+  static const std::string& GetOperationName(OperationType op_type);
+
+  static const std::string MicrosToString(uint64_t op_elapsed_time);
+
+  // Obtain a human-readable string describing the specified operation stage.
+  static const std::string& GetOperationStageName(OperationStage stage);
+
+  // Obtain the name of the "i"th operation property of the
+  // specified operation.
+  static const std::string& GetOperationPropertyName(OperationType op_type,
+                                                     int i);
+
+  // Translate the "i"th property of the specified operation given
+  // a property value.
+  static std::map<std::string, uint64_t> InterpretOperationProperties(
+      OperationType op_type, const uint64_t* op_properties);
+
+  // Obtain the name of a state given its type.
+  static const std::string& GetStateName(StateType state_type);
+};
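+
+// Usage sketch (editorial, not part of the original header): fetching and
+// printing the status of all active RocksDB threads, here via
+// Env::GetThreadList().
+//
+//    std::vector<ThreadStatus> thread_list;
+//    Status s = Env::Default()->GetThreadList(&thread_list);
+//    if (s.ok()) {
+//      for (const auto& ts : thread_list) {
+//        printf("%s %s %s\n",
+//               ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
+//               ThreadStatus::GetOperationName(ts.operation_type).c_str(),
+//               ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str());
+//      }
+//    }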
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h
new file mode 100644
index 000000000..f1cc55752
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/threadpool.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * ThreadPool is a component that will spawn N background threads that will
+ * be used to execute scheduled work. The number of background threads can
+ * be modified by calling SetBackgroundThreads().
+ * */
+class ThreadPool {
+ public:
+  virtual ~ThreadPool() {}
+
+  // Wait for all threads to finish.
+  // Discard any jobs that did not start executing.
+  virtual void JoinAllThreads() = 0;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  virtual void SetBackgroundThreads(int num) = 0;
+  virtual int GetBackgroundThreads() = 0;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  virtual unsigned int GetQueueLen() const = 0;
+
+  // Waits for all jobs to complete: those that have already started running
+  // and those that have not started yet. This ensures that everything that
+  // was submitted to the thread pool runs, even though we may not have
+  // specified enough threads for the number of jobs.
+  virtual void WaitForJobsAndJoinAllThreads() = 0;
+
+  // Submit a fire-and-forget job.
+  // This allows submitting the same job multiple times.
+  virtual void SubmitJob(const std::function<void()>&) = 0;
+  // This overload moves the function in for efficiency.
+  virtual void SubmitJob(std::function<void()>&&) = 0;
+
+  // Reserve available background threads. This function does not guarantee
+  // that the desired number of threads can be reserved; instead it returns
+  // the number of threads actually reserved, which can be less than the
+  // input.
+  virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; }
+
+  // Release a specific number of reserved threads.
+  virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; }
+};
+
+// NewThreadPool() is a function that can be used to create a ThreadPool
+// with `num_threads` background threads.
+extern ThreadPool* NewThreadPool(int num_threads);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h
new file mode 100644
index 000000000..335e091dc
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + virtual ~TraceWriter() = default; + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay operations. +class TraceReader { + public: + virtual ~TraceReader() = default; + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; + + // Seek back to the trace header. Replayer can call this method to restart + // replaying. Note this method may fail if the reader is already closed. + virtual Status Reset() = 0; +}; + +// Factory methods to write/read traces to/from a file. +// The implementations may not be thread-safe. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceWriter>* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceReader>* trace_reader); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/trace_record.h b/src/rocksdb/include/rocksdb/trace_record.h new file mode 100644 index 000000000..c00f5cafb --- /dev/null +++ b/src/rocksdb/include/rocksdb/trace_record.h @@ -0,0 +1,248 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class DB; + +// Supported trace record types. +enum TraceType : char { + kTraceNone = 0, + kTraceBegin = 1, + kTraceEnd = 2, + // Query level tracing related trace types. + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + // Block cache tracing related trace types. + kBlockTraceIndexBlock = 7, + // TODO: split out kinds of filter blocks? + kBlockTraceFilterBlock = 8, + kBlockTraceDataBlock = 9, + kBlockTraceUncompressionDictBlock = 10, + kBlockTraceRangeDeletionBlock = 11, + // IO tracing related trace type. + kIOTracer = 12, + // Query level tracing related trace type. + kTraceMultiGet = 13, + // All trace types should be added before kTraceMax + kTraceMax, +}; + +class GetQueryTraceRecord; +class IteratorSeekQueryTraceRecord; +class MultiGetQueryTraceRecord; +class TraceRecordResult; +class WriteQueryTraceRecord; + +// Base class for all types of trace records. +class TraceRecord { + public: + explicit TraceRecord(uint64_t timestamp); + + virtual ~TraceRecord() = default; + + // Type of the trace record. + virtual TraceType GetTraceType() const = 0; + + // Timestamp (in microseconds) of this trace. 
+  virtual uint64_t GetTimestamp() const;
+
+  class Handler {
+   public:
+    virtual ~Handler() = default;
+
+    virtual Status Handle(const WriteQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const GetQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const IteratorSeekQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const MultiGetQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+  };
+
+  // Accept the handler and report the corresponding result in `result`.
+  virtual Status Accept(Handler* handler,
+                        std::unique_ptr<TraceRecordResult>* result) = 0;
+
+  // Create a handler for the execution of TraceRecord.
+  static Handler* NewExecutionHandler(
+      DB* db, const std::vector<ColumnFamilyHandle*>& handles);
+
+ private:
+  uint64_t timestamp_;
+};
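+
+// Usage sketch (editorial, not part of the original header): replaying one
+// decoded record against a DB with the built-in execution handler. `record`,
+// `db` and `handles` are assumed to come from the surrounding replay code.
+//
+//    std::unique_ptr<TraceRecord::Handler> handler(
+//        TraceRecord::NewExecutionHandler(db, handles));
+//    std::unique_ptr<TraceRecordResult> result;
+//    Status s = record->Accept(handler.get(), &result);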
+
+// Base class for all query types of trace records.
+class QueryTraceRecord : public TraceRecord {
+ public:
+  explicit QueryTraceRecord(uint64_t timestamp);
+};
+
+// Trace record for DB::Write() operation.
+class WriteQueryTraceRecord : public QueryTraceRecord {
+ public:
+  WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp);
+
+  WriteQueryTraceRecord(const std::string& write_batch_rep,
+                        uint64_t timestamp);
+
+  virtual ~WriteQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceWrite; }
+
+  // rep string for the WriteBatch.
+  virtual Slice GetWriteBatchRep() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  PinnableSlice rep_;
+};
+
+// Trace record for DB::Get() operation.
+class GetQueryTraceRecord : public QueryTraceRecord {
+ public:
+  GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key,
+                      uint64_t timestamp);
+
+  GetQueryTraceRecord(uint32_t column_family_id, const std::string& key,
+                      uint64_t timestamp);
+
+  virtual ~GetQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceGet; }
+
+  // Column family ID.
+  virtual uint32_t GetColumnFamilyID() const;
+
+  // Key to get.
+  virtual Slice GetKey() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  uint32_t cf_id_;
+  PinnableSlice key_;
+};
+
+// Base class for all Iterator related operations.
+class IteratorQueryTraceRecord : public QueryTraceRecord {
+ public:
+  explicit IteratorQueryTraceRecord(uint64_t timestamp);
+
+  IteratorQueryTraceRecord(PinnableSlice&& lower_bound,
+                           PinnableSlice&& upper_bound, uint64_t timestamp);
+
+  IteratorQueryTraceRecord(const std::string& lower_bound,
+                           const std::string& upper_bound,
+                           uint64_t timestamp);
+
+  virtual ~IteratorQueryTraceRecord() override;
+
+  // Get the iterator's lower/upper bound. They may be used in ReadOptions to
+  // create an Iterator instance.
+  virtual Slice GetLowerBound() const;
+  virtual Slice GetUpperBound() const;
+
+ private:
+  PinnableSlice lower_;
+  PinnableSlice upper_;
+};
+
+// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operations.
+class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord {
+ public:
+  // Currently we only support Seek() and SeekForPrev().
+  enum SeekType {
+    kSeek = kTraceIteratorSeek,
+    kSeekForPrev = kTraceIteratorSeekForPrev
+  };
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               PinnableSlice&& key, uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               const std::string& key, uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               PinnableSlice&& key, PinnableSlice&& lower_bound,
+                               PinnableSlice&& upper_bound,
+                               uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               const std::string& key,
+                               const std::string& lower_bound,
+                               const std::string& upper_bound,
+                               uint64_t timestamp);
+
+  virtual ~IteratorSeekQueryTraceRecord() override;
+
+  // Trace type matches the seek type.
+  TraceType GetTraceType() const override;
+
+  // Type of seek: Seek or SeekForPrev.
+  virtual SeekType GetSeekType() const;
+
+  // Column family ID.
+  virtual uint32_t GetColumnFamilyID() const;
+
+  // Key to seek to.
+  virtual Slice GetKey() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  SeekType type_;
+  uint32_t cf_id_;
+  PinnableSlice key_;
+};
+
+// Trace record for DB::MultiGet() operation.
+class MultiGetQueryTraceRecord : public QueryTraceRecord {
+ public:
+  MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+                           std::vector<PinnableSlice>&& keys,
+                           uint64_t timestamp);
+
+  MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+                           const std::vector<std::string>& keys,
+                           uint64_t timestamp);
+
+  virtual ~MultiGetQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceMultiGet; }
+
+  // Column family IDs.
+  virtual std::vector<uint32_t> GetColumnFamilyIDs() const;
+
+  // Keys to get.
+  virtual std::vector<Slice> GetKeys() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  std::vector<uint32_t> cf_ids_;
+  std::vector<PinnableSlice> keys_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_record_result.h b/src/rocksdb/include/rocksdb/trace_record_result.h
new file mode 100644
index 000000000..0cd0004a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_record_result.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IteratorTraceExecutionResult;
+class MultiValuesTraceExecutionResult;
+class SingleValueTraceExecutionResult;
+class StatusOnlyTraceExecutionResult;
+
+// Base class for the results of all types of trace records.
+// These classes can be used to report the execution result of
+// TraceRecord::Handler::Handle() or TraceRecord::Accept().
+class TraceRecordResult {
+ public:
+  explicit TraceRecordResult(TraceType trace_type);
+
+  virtual ~TraceRecordResult() = default;
+
+  // Trace type of the corresponding TraceRecord.
+ virtual TraceType GetTraceType() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0; + + virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0; + + virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0; + + virtual Status Handle(const IteratorTraceExecutionResult& result) = 0; + }; + + // Accept the handler. + virtual Status Accept(Handler* handler) = 0; + + private: + TraceType trace_type_; +}; + +// Base class for the results from the trace record execution handler (created +// by TraceRecord::NewExecutionHandler()). +// +// The actual execution status or returned values may be hidden from +// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a +// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may +// return Status::NotFound() but TraceRecord::Handler::Handle() or +// TraceRecord::Accept() will still return Status::OK(). The actual status from +// DB::Get() and the returned value string may be saved in a +// SingleValueTraceExecutionResult. +class TraceExecutionResult : public TraceRecordResult { + public: + TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + // Execution start/end timestamps and request latency in microseconds. + virtual uint64_t GetStartTimestamp() const; + virtual uint64_t GetEndTimestamp() const; + inline uint64_t GetLatency() const { + return GetEndTimestamp() - GetStartTimestamp(); + } + + private: + uint64_t ts_start_; + uint64_t ts_end_; +}; + +// Result for operations that only return a single Status. +// Example operation: DB::Write() +class StatusOnlyTraceExecutionResult : public TraceExecutionResult { + public: + StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~StatusOnlyTraceExecutionResult() override = default; + + // Return value of DB::Write(), etc. + virtual const Status& GetStatus() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; +}; + +// Result for operations that return a Status and a value. +// Example operation: DB::Get() +class SingleValueTraceExecutionResult : public TraceExecutionResult { + public: + SingleValueTraceExecutionResult(Status status, const std::string& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + SingleValueTraceExecutionResult(Status status, std::string&& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~SingleValueTraceExecutionResult() override; + + // Return status of DB::Get(). + virtual const Status& GetStatus() const; + + // Value for the searched key. + virtual const std::string& GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; + std::string value_; +}; + +// Result for operations that return multiple Status(es) and values as vectors. +// Example operation: DB::MultiGet() +class MultiValuesTraceExecutionResult : public TraceExecutionResult { + public: + MultiValuesTraceExecutionResult(std::vector<Status> multi_status, + std::vector<std::string> values, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~MultiValuesTraceExecutionResult() override; + + // Returned Status(es) of DB::MultiGet(). + virtual const std::vector<Status>& GetMultiStatus() const; + + // Returned values for the searched keys. 
+ virtual const std::vector<std::string>& GetValues() const; + + virtual Status Accept(Handler* handler) override; + + private: + std::vector<Status> multi_status_; + std::vector<std::string> values_; +}; + +// Result for Iterator operations. +// Example operations: Iterator::Seek(), Iterator::SeekForPrev() +class IteratorTraceExecutionResult : public TraceExecutionResult { + public: + IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key, + PinnableSlice&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + IteratorTraceExecutionResult(bool valid, Status status, + const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + virtual ~IteratorTraceExecutionResult() override; + + // Return if the Iterator is valid. + virtual bool GetValid() const; + + // Return the status of the Iterator. + virtual const Status& GetStatus() const; + + // Key of the current iterating entry, empty if GetValid() is false. + virtual Slice GetKey() const; + + // Value of the current iterating entry, empty if GetValid() is false. + virtual Slice GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + bool valid_; + Status status_; + PinnableSlice key_; + PinnableSlice value_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h new file mode 100644 index 000000000..e13ad8f80 --- /dev/null +++ b/src/rocksdb/include/rocksdb/transaction_log.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <vector> + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFile; +using VectorLogPtr = std::vector<std::unique_ptr<LogFile>>; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +}; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + // Primary identifier for log file. 
+  // This is directly proportional to the creation time of the log file.
+  virtual uint64_t LogNumber() const = 0;
+
+  // Log file can be either alive or archived.
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of the write batches written in this log file.
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of log file on disk in bytes.
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+
+  // Add empty __ctor and __dtor for the rule of five.
+  // However, preserve the original semantics and prohibit copying,
+  // as the std::unique_ptr member does not copy.
+  BatchResult() {}
+
+  ~BatchResult() {}
+
+  BatchResult(const BatchResult&) = delete;
+
+  BatchResult& operator=(const BatchResult&) = delete;
+
+  BatchResult(BatchResult&& bResult)
+      : sequence(std::move(bResult.sequence)),
+        writeBatchPtr(std::move(bResult.writeBatchPtr)) {}
+
+  BatchResult& operator=(BatchResult&& bResult) {
+    sequence = std::move(bResult.sequence);
+    writeBatchPtr = std::move(bResult.writeBatchPtr);
+    return *this;
+  }
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences.
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Data can be read from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid, returns the current write batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+
+  // The read options for TransactionLogIterator.
+  struct ReadOptions {
+    // If true, all data read from underlying storage will be
+    // verified against corresponding checksums.
+    // Default: true
+    bool verify_checksums_;
+
+    ReadOptions() : verify_checksums_(true) {}
+
+    explicit ReadOptions(bool verify_checksums)
+        : verify_checksums_(verify_checksums) {}
+  };
+};
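+
+// Usage sketch (editorial, not part of the original header): a
+// TransactionLogIterator is normally obtained from DB::GetUpdatesSince().
+//
+//    std::unique_ptr<TransactionLogIterator> iter;
+//    Status s = db->GetUpdatesSince(start_seq, &iter);
+//    while (s.ok() && iter->Valid()) {
+//      BatchResult batch = iter->GetBatch();
+//      // batch.sequence is the sequence number of the earliest update in
+//      // the batch; batch.writeBatchPtr holds the WriteBatch itself.
+//      iter->Next();
+//    }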
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h
new file mode 100644
index 000000000..6fb53d846
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/types.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Define all public custom types here.
+
+using ColumnFamilyId = uint32_t;
+
+// Represents a sequence number in a WAL file.
+using SequenceNumber = uint64_t;
+
+const SequenceNumber kMinUnCommittedSeq = 1;  // 0 is always committed
+
+enum class TableFileCreationReason {
+  kFlush,
+  kCompaction,
+  kRecovery,
+  kMisc,
+};
+
+enum class BlobFileCreationReason {
+  kFlush,
+  kCompaction,
+  kRecovery,
+};
+
+// The types of files RocksDB uses in a DB directory. (Available for
+// advanced options.)
+enum FileType {
+  kWalFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile,
+  kOptionsFile,
+  kBlobFile
+};
+
+// User-oriented representation of internal key types.
+// Ordering of the entries in this enum should not change.
+enum EntryType {
+  kEntryPut,
+  kEntryDelete,
+  kEntrySingleDelete,
+  kEntryMerge,
+  kEntryRangeDeletion,
+  kEntryBlobIndex,
+  kEntryDeleteWithTimestamp,
+  kEntryWideColumnEntity,
+  kEntryOther,
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/unique_id.h b/src/rocksdb/include/rocksdb/unique_id.h
new file mode 100644
index 000000000..eb0c77826
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/unique_id.h
@@ -0,0 +1,55 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Computes a stable, universally unique 128-bit (16 binary char) identifier
+// for an SST file from TableProperties. This is supported for table (SST)
+// files created with RocksDB 6.24 and later. NotSupported will be returned
+// for other cases. The first 16 bytes (128 bits) are of sufficient quality
+// for almost all applications, and shorter prefixes are usable as a
+// hash of the full unique id.
+//
+// Note: .c_str() is not compatible with binary char strings, so using
+// .c_str() on the result will often result in information loss and very
+// poor uniqueness probability.
+//
+// More detail: the value is *guaranteed* unique for SST files
+// generated in the same process (even different DBs, RocksDB >= 6.26),
+// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26)
+// so that the "all zeros" value can be used reliably for a null ID.
+// These IDs are more than sufficient for SST uniqueness within each of
+// many DBs or hosts. For an extreme example assuming random IDs, consider
+// 10^9 hosts each with 10^9 live SST files being replaced at 10^6/second.
+// Such a service would need to run for 10 million years to see an ID
+// collision among live SST files on any host.
+//
+// And assuming one generates many SST files in the lifetime of each process,
+// the probability of ID collisions is much "better than random"; see
+// https://github.com/pdillinger/unique_id
+Status GetUniqueIdFromTableProperties(const TableProperties &props,
+                                      std::string *out_id);
+
+// Computes a 192-bit (24 binary char) stable, universally unique ID
+// with an extra 64 bits of uniqueness compared to the standard ID. It is only
+// appropriate to use this ID instead of the 128-bit ID if ID collisions
+// between files among any hosts in a vast fleet is a problem, such as a shared
+// global namespace for SST file backups. Under this criteria, the extreme
+// example above would expect a global file ID collision every 4 days with
+// 128-bit IDs (using some worst-case assumptions about process lifetime).
+// It's 10^17 years with 192-bit IDs.
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
+                                              std::string *out_id);
+
+// Converts a binary string (unique id) to hexadecimal, with each 64 bits
+// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
+// Also works on unique id prefix.
+std::string UniqueIdToHumanString(const std::string &id);
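+
+// Usage sketch (editorial, not part of the original header), where `props`
+// is a TableProperties object obtained elsewhere (e.g., from
+// DB::GetPropertiesOfAllTables()):
+//
+//    std::string id;
+//    Status s = GetUniqueIdFromTableProperties(props, &id);
+//    if (s.ok()) {
+//      printf("unique id: %s\n", UniqueIdToHumanString(id).c_str());
+//    }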
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
new file mode 100644
index 000000000..0b0a85e1c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run.
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize,  // pick files of similar size
+  kCompactionStopStyleTotalSize     // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+  // Percentage flexibility while comparing file size. If the candidate
+  // file(s) size is 1% smaller than the next file's size, then include the
+  // next file into this candidate set.
+  // Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run.
+  // Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run.
+  // Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow the compression type specified.
+  //
+  // If this option is not negative, we will try to make sure the compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criteria for whether
+  // it needs to be compressed: assuming here is the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as the total size of C1...Ct as total_C; the compaction output file
+  // will be compressed iff
+  //    total_C / total_size < this percentage
+  // Default: -1
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run.
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Option to optimize the universal multi level compaction by enabling
+  // trivial move for non-overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
+  // EXPERIMENTAL
+  // If true, try to limit compaction size under max_compaction_bytes.
+  // This might cause higher write amplification, but can prevent some
+  // problems caused by large compactions.
+  // Default: false
+  bool incremental;
+
+  // Default set of parameters
+  CompactionOptionsUniversal()
+      : size_ratio(1),
+        min_merge_width(2),
+        max_merge_width(UINT_MAX),
+        max_size_amplification_percent(200),
+        compression_size_percent(-1),
+        stop_style(kCompactionStopStyleTotalSize),
+        allow_trivial_move(false),
+        incremental(false) {}
+};
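+
+// Configuration sketch (editorial, not part of the original header):
+//
+//    Options options;
+//    options.compaction_style = kCompactionStyleUniversal;
+//    options.compaction_options_universal.size_ratio = 10;
+//    options.compaction_options_universal.max_size_amplification_percent =
+//        50;
+//    options.compaction_options_universal.stop_style =
+//        kCompactionStopStyleSimilarSize;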
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/agg_merge.h b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
new file mode 100644
index 000000000..4e21082db
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The feature is still in development so the encoding format is subject
+// to change.
+//
+// Aggregation Merge Operator is a merge operator that allows users to
+// aggregate merge operands of different keys with different registered
+// aggregation functions. The aggregation can also change for the same
+// key if the functions store the data in the same format.
+// The target application highly overlaps with merge operators in general,
+// but we try to provide a better interface so that users are more likely
+// to use pre-implemented plug-in functions and connect with existing
+// third-party aggregation functions (such as those from SQL engines).
+// In this case, the need for users to write customized C++ plug-in code
+// is reduced.
+// If the idea proves to be useful, we might consider making it a core
+// functionality of RocksDB, and reduce the support of merge operators.
+//
+// Users can implement aggregation functions by implementing the abstract
+// class Aggregator, and register them using AddAggregator().
+// The merge operator can be retrieved from GetAggMergeOperator() and
+// it is a singleton.
+//
+// Users can push values to be updated with a merge operand encoded with a
+// registered function name and payload using EncodeAggFuncAndPayload(),
+// and the merge operator will invoke the aggregation function.
+// An example:
+//
+//    // Assume class ExampleSumAggregator is implemented to do simple sum.
+//    AddAggregator("sum", std::make_unique<ExampleSumAggregator>());
+//    std::shared_ptr<MergeOperator> mp_guard = GetAggMergeOperator();
+//    options.merge_operator = mp_guard.get();
+//    ......  // Creating DB
+//
+//
+//    std::string encoded_value;
+//    Status s = EncodeAggFuncAndPayload(kUnnamedFuncName, "200",
+//                                       encoded_value);
+//    assert(s.ok());
+//    db->Put(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//
+//    std::string value;
+//    s = db->Get(ReadOptions(), "foo", &value);
+//    assert(s.ok());
+//    Slice func, aggregated_value;
+//    assert(ExtractAggFuncAndValue(value, func, aggregated_value));
+//    assert(func == "sum");
+//    assert(aggregated_value == "600");
+//
+//
+// DB::Put() can also be used to add a payload in the same way as Merge().
+//
+// kUnnamedFuncName can be used as a placeholder function name. It will
+// be aggregated with merge operands inserted later based on the function
+// name given there.
+//
+// If the aggregation function is not registered or there is an error
+// returned by the aggregation function, the result will be encoded with a
+// fake aggregation function kErrorFuncName, with each merge operand encoded
+// into a list that can be extracted using ExtractList().
+//
+// If users add a merge operand using a different aggregation function from
+// the previous one, the merge operands for the previous one are aggregated
+// and the payload part of the result is treated as the first payload of
+// the items for the new aggregation function. For example, users can
+// Merge("plus, 1"), merge("plus 2"), merge("minus 3") and the aggregation
+// result would be "minus 0".
+//
+
+// A class used to aggregate data per key/value. The plug-in function is
+// implemented and registered using AddAggregator(), and then used with the
+// merge operator obtained from GetAggMergeOperator().
+class Aggregator {
+ public:
+  virtual ~Aggregator() {}
+  // The input list is in reverse insertion order, with values[0] being
+  // the one inserted last and values.back() being the one inserted first.
+  // The oldest one might be from Get().
+  // Returns whether aggregation succeeded. False for aggregation error.
+  virtual bool Aggregate(const std::vector<Slice>& values,
+                         std::string& result) const = 0;
+
+  // True if a partial aggregation should be invoked. Some aggregators
+  // might opt to skip partial aggregation if possible.
+  virtual bool DoPartialAggregate() const { return true; }
+};
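+
+// Implementation sketch (editorial, not part of the original header) of the
+// ExampleSumAggregator mentioned in the comment above; it assumes every
+// payload is a valid decimal integer:
+//
+//    class ExampleSumAggregator : public Aggregator {
+//     public:
+//      bool Aggregate(const std::vector<Slice>& values,
+//                     std::string& result) const override {
+//        long long sum = 0;
+//        for (const Slice& v : values) {
+//          // std::stoll throws on malformed input; a production version
+//          // should validate and return false instead.
+//          sum += std::stoll(v.ToString());
+//        }
+//        result = std::to_string(sum);
+//        return true;
+//      }
+//    };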
+
+// The function adds an aggregation plugin by function name. It is used
+// by the aggregation merge operator obtained from GetAggMergeOperator().
+// It's currently not thread safe to run concurrently with the aggregation
+// merge operator. It is recommended that all aggregation functions are
+// added before calling GetAggMergeOperator().
+Status AddAggregator(const std::string& function_name,
+                     std::unique_ptr<Aggregator>&& agg);
+
+// Get the singleton instance of the merge operator for aggregation.
+// The same one is always returned, and a shared_ptr to it is held as a
+// static variable by the function.
+// This is done because options.merge_operator is a shared_ptr.
+std::shared_ptr<MergeOperator> GetAggMergeOperator();
+
+// Encode an aggregation function and payload so that they can be consumed
+// by the aggregation merge operator.
+Status EncodeAggFuncAndPayload(const Slice& function_name,
+                               const Slice& payload, std::string& output);
+
+// Helper function to extract aggregation function name and payload.
+// Returns false if it fails to decode.
+bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value);
+
+// Extract the encoded list. This can be used to extract error merge operands
+// when the returned function name is kErrorFuncName.
+bool ExtractList(const Slice& encoded_list, std::vector<Slice>& decoded_list);
+
+// Special placeholder function name that allows the value to be aggregated
+// with the aggregation function of subsequent operands.
+extern const std::string kUnnamedFuncName;
+
+// Special error function name reserved for merging or aggregation errors.
+extern const std::string kErrorFuncName;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/backup_engine.h b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
new file mode 100644
index 000000000..f28ad9618
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
@@ -0,0 +1,631 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The default DB file checksum function name.
+constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+// The default BackupEngine file checksum function name.
+constexpr char kBackupFileChecksumFuncName[] = "crc32c";
+
+struct BackupEngineOptions {
+  // Where to keep the backup files. Has to be different from dbname_.
+  // Best to set this to dbname_ + "/backups".
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using the DB's Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // Default: nullptr
+  Env* backup_env;
+
+  // share_table_files supports table and blob files.
+  //
+  // If share_table_files == true, the backup directory will share table and
+  // blob files among backups, to save space among backups of the same DB and
+  // to enable incremental backups by only copying new files.
+  // If share_table_files == false, each backup will be on its own and will
+  // not share any data with other backups.
+  //
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup and
+  // restore even on a machine crash/reboot. Backup and restore processes are
+  // slower with sync enabled. If sync == false, we can only guarantee that
+  // other previously synced backups and restores are not modified while
+  // creating a new one.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already.
+  // Default: false
+  bool destroy_old_data;
+
+  // If false, we won't back up log files. This option can be useful for
+  // backing up in-memory databases where log files are persisted, but table
+  // files are in memory.
+  // Default: true
+  bool backup_log_files;
+
+  // Max bytes that can be transferred in a second during backup.
+  // If 0, go as fast as you can.
+  // This limit only applies to writes. To also limit reads, a rate limiter
+  // able to also limit reads (e.g., its mode = kAllIo) has to be passed in
+  // through the option "backup_rate_limiter".
+  // Default: 0
+  uint64_t backup_rate_limit;
+
+  // Backup rate limiter. Used to control transfer speed for backup. If this
+  // is not null, backup_rate_limit is ignored.
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> backup_rate_limiter{nullptr};
+
+  // Max bytes that can be transferred in a second during restore.
+  // If 0, go as fast as you can.
+  // This limit only applies to writes. To also limit reads, a rate limiter
+  // able to also limit reads (e.g., its mode = kAllIo) has to be passed in
+  // through the option "restore_rate_limiter".
+  // Default: 0
+  uint64_t restore_rate_limit;
+
+  // Restore rate limiter. Used to control transfer speed during restore. If
+  // this is not null, restore_rate_limit is ignored.
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr};
+
+  // share_files_with_checksum supports table and blob files.
+  //
+  // Only used if share_table_files is set to true. Setting to false is
+  // DEPRECATED and potentially dangerous because in that case BackupEngine
+  // can lose data if backing up databases with distinct or divergent
+  // history, for example if restoring from a backup other than the latest,
+  // writing to the DB, and creating another backup. Setting to true (default)
+  // prevents these issues by ensuring that different table files (SSTs) and
+  // blob files with the same number are treated as distinct. See
+  // share_files_with_checksum_naming and ShareFilesNaming.
+  //
+  // Default: true
+  bool share_files_with_checksum;
+
+  // Up to this many background threads will copy files for CreateNewBackup()
+  // and RestoreDBFromBackup().
+  // Default: 1
+  int max_background_operations;
+
+  // During backup, the user can get a callback every time
+  // callback_trigger_interval_size bytes have been copied.
+  // Default: 4194304
+  uint64_t callback_trigger_interval_size;
+
+  // For BackupEngineReadOnly, Open() will open at most this many of the
+  // latest non-corrupted backups.
+  //
+  // Note: this setting is ignored (behaves like INT_MAX) for any kind of
+  // writable BackupEngine because it would inhibit accounting for shared
+  // files for proper backup deletion, including purging any incompletely
+  // created backups on creation of a new backup.
+  //
+  // Default: INT_MAX
+  int max_valid_backups_to_open;
+
+  // ShareFilesNaming describes possible naming schemes for backup
+  // table and blob file names when they are stored in the
+  // shared_checksum directory (i.e., both share_table_files and
+  // share_files_with_checksum are true).
+  enum ShareFilesNaming : uint32_t {
+    // Backup blob filenames are <file_number>_<crc32c>_<file_size>.blob and
+    // backup SST filenames are <file_number>_<crc32c>_<file_size>.sst
+    // where <crc32c> is an unsigned decimal integer. This is the
+    // original/legacy naming scheme for share_files_with_checksum,
+    // with two problems:
+    // * At massive scale, collisions on this triple with different file
+    //   contents is plausible.
+    // * Determining the name to use requires computing the checksum,
+    //   so generally requires reading the whole file even if the file
+    //   is already backed up.
+ // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are <file_number>_s<db_session_id>.sst. This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _<file_size> + // before .sst and .blob in the name. In case of user code actually parsing + // the last _<whatever> before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. + // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. + // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + // Major schema version to use when writing backup meta files + // 1 (default) - compatible with very old versions of RocksDB. + // 2 - can be read by RocksDB versions >= 6.19.0. Minimum schema version for + // * (Experimental) saving and restoring file temperature metadata + int schema_version = 1; + + // (Experimental - subject to change or removal) When taking a backup and + // saving file temperature info (minimum schema_version is 2), there are + // two potential sources of truth for the placement of files into temperature + // tiers: (a) the current file temperature reported by the FileSystem or + // (b) the expected file temperature recorded in DB manifest. When this + // option is false (default), (b) overrides (a) if both are not UNKNOWN. + // When true, (a) overrides (b) if both are not UNKNOWN. Regardless of this + // setting, a known temperature overrides UNKNOWN. 
+  bool current_temperatures_override_manifest = false;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupEngineOptions(
+      const std::string& _backup_dir, Env* _backup_env = nullptr,
+      bool _share_table_files = true, Logger* _info_log = nullptr,
+      bool _sync = true, bool _destroy_old_data = false,
+      bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+      uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+      uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024,
+      int _max_valid_backups_to_open = INT_MAX,
+      ShareFilesNaming _share_files_with_checksum_naming =
+          static_cast<ShareFilesNaming>(kUseDbSessionId | kFlagIncludeFileSize))
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(true),
+        max_background_operations(_max_background_operations),
+        callback_trigger_interval_size(_callback_trigger_interval_size),
+        max_valid_backups_to_open(_max_valid_backups_to_open),
+        share_files_with_checksum_naming(_share_files_with_checksum_naming) {
+    assert(share_table_files || !share_files_with_checksum);
+    assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0);
+  }
+};
+
+inline BackupEngineOptions::ShareFilesNaming operator&(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert(r == BackupEngineOptions::kMaskNoNamingFlags ||
+         (r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l & r);
+}
+
+inline BackupEngineOptions::ShareFilesNaming operator|(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l | r);
+}
+
+struct CreateBackupOptions {
+  // Flush will always trigger if 2PC is enabled.
+  // If write-ahead logs are disabled, set flush_before_backup=true to
+  // avoid losing unflushed key/value pairs from the memtable.
+  bool flush_before_backup = false;
+
+  // Callback for reporting progress, based on callback_trigger_interval_size.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  std::function<void()> progress_callback = []() {};
+
+  // If false, background_thread_cpu_priority is ignored.
+  // Otherwise, the cpu priority can be decreased; if you try to increase the
+  // priority, the priority will not change.
+  // The initial priority of the threads is CpuPriority::kNormal,
+  // so you can decrease to priorities lower than kNormal.
+  bool decrease_background_thread_cpu_priority = false;
+  CpuPriority background_thread_cpu_priority = CpuPriority::kNormal;
+};
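+
+// Example (an illustrative sketch, not part of the API): composing
+// BackupEngineOptions with an explicit naming scheme using the flag
+// operators defined above. The backup directory path is hypothetical.
+//
+//   BackupEngineOptions engine_opts("/path/to/backups");
+//   engine_opts.share_files_with_checksum_naming =
+//       BackupEngineOptions::kUseDbSessionId |
+//       BackupEngineOptions::kFlagIncludeFileSize;
+//   CreateBackupOptions create_opts;
+//   create_opts.flush_before_backup = true;  // capture unflushed memtable data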
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It
+  // will also move all log files from archive directory to wal_dir. Use this
+  // option in combination with BackupEngineOptions::backup_log_files = false
+  // for persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+using BackupID = uint32_t;
+
+using BackupFileInfo = FileStorageInfo;
+
+struct BackupInfo {
+  BackupID backup_id = 0U;
+  // Creation time, according to GetCurrentTime
+  int64_t timestamp = 0;
+
+  // Total size in bytes (based on file payloads, not including filesystem
+  // overheads or backup meta file)
+  uint64_t size = 0U;
+
+  // Number of backed up files, some of which might be shared with other
+  // backups. Does not include backup meta file.
+  uint32_t number_files = 0U;
+
+  // Backup API user metadata
+  std::string app_metadata;
+
+  // Backup file details, if requested with include_file_details=true
+  std::vector<BackupFileInfo> file_details;
+
+  // DB "name" (a directory in the backup_env) for opening this backup as a
+  // read-only DB. This should also be used as the DBOptions::wal_dir, such
+  // as by default setting wal_dir="". See also env_for_open.
+  // This field is only set if include_file_details=true
+  std::string name_for_open;
+
+  // An Env(+FileSystem) for opening this backup as a read-only DB, with
+  // DB::OpenForReadOnly or similar. This field is only set if
+  // include_file_details=true. (The FileSystem in this Env takes care
+  // of making shared backup files openable from the `name_for_open` DB
+  // directory.) See also name_for_open.
+  //
+  // This Env might or might not be shared with other backups. To work
+  // around DBOptions::env being a raw pointer, this is a shared_ptr so
+  // that keeping either this BackupInfo, the BackupEngine, or a copy of
+  // this shared_ptr alive is sufficient to keep the Env alive for use by
+  // a read-only DB.
+  std::shared_ptr<Env> env_for_open;
+
+  BackupInfo() {}
+
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+             uint32_t _number_files, const std::string& _app_metadata)
+      : backup_id(_backup_id),
+        timestamp(_timestamp),
+        size(_size),
+        number_files(_number_files),
+        app_metadata(_app_metadata) {}
+};
+
+class BackupStatistics {
+ public:
+  BackupStatistics() {
+    number_success_backup = 0;
+    number_fail_backup = 0;
+  }
+
+  BackupStatistics(uint32_t _number_success_backup,
+                   uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() {}
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
+};
+
+// Read-only functions of a BackupEngine. (Restore writes to another directory,
+// not the backup directory.) See BackupEngine comments for details on
+// safe concurrent operations.
+class BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnlyBase() {}
+
+  // Returns info about the latest good backup in backup_info, or NotFound
+  // if no good backup exists.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetLatestBackupInfo(
+      BackupInfo* backup_info, bool include_file_details = false) const = 0;
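+
+  // Example (an illustrative sketch; assumes `engine` is an open
+  // BackupEngineReadOnlyBase* and error handling is elided): enumerate
+  // backups and verify each one.
+  //
+  //   std::vector<BackupInfo> backups;
+  //   engine->GetBackupInfo(&backups);
+  //   for (const auto& info : backups) {
+  //     // Checks file sizes, and checksums when requested
+  //     engine->VerifyBackup(info.backup_id, /*verify_with_checksum=*/true)
+  //         .PermitUncheckedError();
+  //   }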
+
+  // Returns info about a specific backup in backup_info, or NotFound
+  // or Corruption status if the requested backup id does not exist or is
+  // known corrupt.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+                               bool include_file_details = false) const = 0;
+
+  // Returns info about non-corrupt backups in backup_infos.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_infos,
+                             bool include_file_details = false) const = 0;
+
+  // Returns info about corrupt backups in corrupt_backups.
+  // WARNING: Any write to the BackupEngine could trigger automatic
+  // GarbageCollect(), which could delete files that would be needed to
+  // manually recover a corrupt backup or to preserve an unrecognized (e.g.
+  // incompatible future version) backup.
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) const = 0;
+
+  // Restore to specified db_dir and wal_dir from backup_id.
+  virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+                                       BackupID backup_id,
+                                       const std::string& db_dir,
+                                       const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+  }
+
+  // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const RestoreOptions& options, const std::string& db_dir,
+      const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
+  }
+
+  // If verify_with_checksum is true, this function
+  // inspects the current checksums and file sizes of backup files to see if
+  // they match our expectation.
+  //
+  // If verify_with_checksum is false, this function
+  // checks that each file exists and that the size of the file matches our
+  // expectation. It does not check file checksum.
+  //
+  // If this BackupEngine created the backup, it compares the files' current
+  // sizes (and current checksum) against the number of bytes written to
+  // them (and the checksum calculated) during creation.
+  // Otherwise, it compares the files' current sizes (and checksums) against
+  // their sizes (and checksums) when the BackupEngine was opened.
+  //
+  // Returns Status::OK() if all checks are good
+  virtual IOStatus VerifyBackup(BackupID backup_id,
+                                bool verify_with_checksum = false) const = 0;
+};
+
+// Append-only functions of a BackupEngine. See BackupEngine comment for
+// details on distinction between Append and Write operations and safe
+// concurrent operations.
+class BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngineAppendOnlyBase() {}
+
+  // same as CreateNewBackup, but stores extra application metadata.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      const CreateBackupOptions& options, DB* db,
+      const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0;
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      DB* db, const std::string& app_metadata, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackupWithMetadata(options, db, app_metadata);
+  }
+
+  // Captures the state of the database by creating a new (latest) backup.
+  // On success (OK status), the BackupID of the new backup is saved to
+  // *new_backup_id when not nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating backups,
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db,
+                                   BackupID* new_backup_id = nullptr) {
+    return CreateNewBackupWithMetadata(options, db, "", new_backup_id);
+  }
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackup(
+      DB* db, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackup(options, db);
+  }
+
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately and will not
+  // wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up the
+  // next time you call CreateNewBackup or GarbageCollect.
+  virtual void StopBackup() = 0;
+
+  // Will delete any files left over from incomplete creation or deletion of
+  // a backup. This is not normally needed as those operations also clean up
+  // after prior incomplete calls to the same kind of operation (create or
+  // delete). This does not delete corrupt backups but can delete files that
+  // would be needed to manually recover a corrupt backup or to preserve an
+  // unrecognized (e.g. incompatible future version) backup.
+  // NOTE: This is not designed to delete arbitrary files added to the backup
+  // directory outside of BackupEngine, and clean-up is always subject to
+  // permissions on and availability of the underlying filesystem.
+  // NOTE2: For concurrency and interference purposes (see BackupEngine
+  // comment), GarbageCollect (GC) is like other Append operations, even
+  // though it seems different. Although GC can delete physical data, it does
+  // not delete any logical data read by Read operations. GC can interfere
+  // with Append or Write operations in another BackupEngine on the same
+  // backup_dir, because temporary files will be treated as obsolete and
+  // deleted.
+  virtual IOStatus GarbageCollect() = 0;
+};
+
+// A backup engine for organizing and managing backups.
+// This class is not user-extensible.
+//
+// This class declaration adds "Write" operations in addition to the
+// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase.
+//
+// # Concurrency between threads on the same BackupEngine* object
+//
+// As of version 6.20, BackupEngine* operations are generally thread-safe,
+// using a read-write lock, though single-thread operation is still
+// recommended to avoid TOCTOU bugs. Specifically, particular kinds of
+// concurrent operations behave like this:
+//
+// op1\op2| Read  | Append | Write
+// -------|-------|--------|--------
+// Read   | conc  | block  | block
+// Append | block | block  | block
+// Write  | block | block  | block
+//
+// conc = operations safely proceed concurrently
+// block = one of the operations safely blocks until the other completes.
+//         There is generally no guarantee as to which completes first.
+//
+// StopBackup is the only operation that affects an ongoing operation.
+//
+// # Interleaving operations between BackupEngine* objects open on the
+// same backup_dir
+//
+// It is recommended only to have one BackupEngine* object open for a given
+// backup_dir, but it is possible to mix / interleave some operations
+// (regardless of whether they are concurrent) with these caveats:
+//
+// op1\op2| Open   | Read   | Append | Write
+// -------|--------|--------|--------|--------
+// Open   | conc   | conc   | atomic | unspec
+// Read   | conc   | conc   | old    | unspec
+// Append | atomic | old    | unspec | unspec
+// Write  | unspec | unspec | unspec | unspec
+//
+// Special case: Open with destroy_old_data=true is really a Write
+//
+// conc   = operations safely proceed, concurrently when applicable
+// atomic = operations are effectively atomic; if a concurrent Append
+//          operation has not completed at some key point during Open, the
+//          opened BackupEngine* will never see the result of the Append op.
+// old    = Read operations do not include any state changes from other
+//          BackupEngine* objects; they return the state at their Open time.
+// unspec = Behavior is unspecified, including possibly trashing the
+//          backup_dir, but is "memory safe" (no C++ undefined behavior)
+//
+class BackupEngine : public BackupEngineReadOnlyBase,
+                     public BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngine() {}
+
+  // BackupEngineOptions have to be the same as the ones used in previous
+  // BackupEngines for the same backup directory.
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngine** backup_engine_ptr);
+
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngine** backup_engine_ptr) {
+    return BackupEngine::Open(options, db_env, backup_engine_ptr);
+  }
+
+  // Deletes old backups, keeping latest num_backups_to_keep alive.
+  // See also DeleteBackup.
+  virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+
+  // Deletes a specific backup. If this operation (or PurgeOldBackups)
+  // is not completed due to crash, power failure, etc. the state
+  // will be cleaned up the next time you call DeleteBackup,
+  // PurgeOldBackups, or GarbageCollect.
+  virtual IOStatus DeleteBackup(BackupID backup_id) = 0;
+};
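+
+// Example (an illustrative sketch; assumes an open DB* `db`, hypothetical
+// paths, and elided error handling): open an engine, take a backup, keep the
+// newest five backups, and restore the latest into a directory.
+//
+//   BackupEngine* engine = nullptr;
+//   IOStatus s = BackupEngine::Open(
+//       BackupEngineOptions("/path/to/backups"), Env::Default(), &engine);
+//   s = engine->CreateNewBackup(CreateBackupOptions(), db);
+//   s = engine->PurgeOldBackups(5);
+//   s = engine->RestoreDBFromLatestBackup(RestoreOptions(), "/path/to/db",
+//                                         "/path/to/db");
+//   delete engine;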
+
+// A variant of BackupEngine that only allows "Read" operations. See
+// BackupEngine comment for details. This class is not user-extensible.
+class BackupEngineReadOnly : public BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngineReadOnly** backup_engine_ptr);
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngineReadOnly** backup_engine_ptr) {
+    return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr);
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
new file mode 100644
index 000000000..fde03db7e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
@@ -0,0 +1,142 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The classes and functions in this header file are used for dumping out the
+// blocks in a block cache, storing or transferring the blocks to another
+// destination host, and loading these blocks into the secondary cache at the
+// destination host.
+// NOTE that: The classes, functions, and data structures are EXPERIMENTAL!
+// They may be changed in the future when the development continues.
+
+// The major and minor version number of the data format to be
+// stored/transferred via CacheDumpWriter and read out via CacheDumpReader
+static const int kCacheDumpMajorVersion = 0;
+static const int kCacheDumpMinorVersion = 1;
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to write or transfer the data that is created by
+// CacheDumper. We pack one block with its block type, dump time, block key in
+// the block cache, block len, block crc32c checksum, and the block itself as
+// a unit, and it is stored via WritePacket. Before we call WritePacket, we
+// must call WriteMetadata once, which stores the sequence number, block unit
+// checksum, and block unit size.
+// We provide a file based CacheDumpWriter to store the metadata and its
+// packets sequentially in a file as the default implementation. Users can
+// implement their own CacheDumpWriter to store/transfer the data. For
+// example, a user can create a subclass which transfers the metadata and
+// packets on the fly.
+class CacheDumpWriter {
+ public:
+  virtual ~CacheDumpWriter() = default;
+
+  // Called ONCE before the calls to WritePacket
+  virtual IOStatus WriteMetadata(const Slice& metadata) = 0;
+  virtual IOStatus WritePacket(const Slice& data) = 0;
+  virtual IOStatus Close() = 0;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to read or receive the data that is stored
+// or transferred by CacheDumpWriter. Note that ReadMetadata must be called
+// once before any call to ReadPacket.
+class CacheDumpReader {
+ public:
+  virtual ~CacheDumpReader() = default;
+  // Called ONCE before the calls to ReadPacket
+  virtual IOStatus ReadMetadata(std::string* metadata) = 0;
+  // Sets data to empty string on EOF
+  virtual IOStatus ReadPacket(std::string* data) = 0;
+  // (Close not needed)
+};
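+
+// Example (an illustrative sketch, not part of this API): a minimal custom
+// CacheDumpWriter that accumulates everything in memory instead of a file.
+// The class name is hypothetical.
+//
+//   class InMemoryCacheDumpWriter : public CacheDumpWriter {
+//    public:
+//     IOStatus WriteMetadata(const Slice& metadata) override {
+//       metadata_ = metadata.ToString();
+//       return IOStatus::OK();
+//     }
+//     IOStatus WritePacket(const Slice& data) override {
+//       packets_.push_back(data.ToString());
+//       return IOStatus::OK();
+//     }
+//     IOStatus Close() override { return IOStatus::OK(); }
+//
+//    private:
+//     std::string metadata_;
+//     std::vector<std::string> packets_;
+//   };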
+
+// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any
+// dump or load process related control variables can be added here.
+struct CacheDumpOptions {
+  SystemClock* clock;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to dump out the blocks in the block cache and
+// store/transfer them via CacheDumpWriter. In order to dump out the blocks
+// belonging to a certain DB or a list of DBs (a block cache can be shared by
+// many DBs), the user needs to call SetDumpFilter to specify a list of DBs to
+// filter out the blocks that do not belong to those DBs.
+// A typical use case is: when we migrate a DB instance from host A to host B.
+// We need to reopen the DB at host B after all the files are copied to host
+// B. At this moment, the block cache at host B does not have any block from
+// this migrated DB. Therefore, the read performance can be low while the
+// cache warms up. By using CacheDumper before we shut down the DB at host A
+// and using CacheDumpedLoader at host B before we reopen the DB, we can warm
+// up the cache ahead of time. This function can be used in other use cases
+// also.
+class CacheDumper {
+ public:
+  virtual ~CacheDumper() = default;
+  // Only dump the blocks in the block cache that belong to the DBs in this
+  // list
+  virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+    (void)db_list;
+    return Status::NotSupported("SetDumpFilter is not supported");
+  }
+  // The main function to dump out all the blocks that satisfy the filter
+  // condition from block cache to a certain CacheDumpWriter in one shot. This
+  // process may take some time.
+  virtual IOStatus DumpCacheEntriesToWriter() {
+    return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported");
+  }
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to load the dumped blocks to the destination cache. For
+// now, we only load the blocks to the SecondaryCache. In the future, we may
+// plan to support loading to the block cache.
+class CacheDumpedLoader {
+ public:
+  virtual ~CacheDumpedLoader() = default;
+  virtual IOStatus RestoreCacheEntriesToSecondaryCache() {
+    return IOStatus::NotSupported(
+        "RestoreCacheEntriesToSecondaryCache is not supported");
+  }
+};
+
+// Get the writer which stores all the metadata and data sequentially to a
+// file
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+                                  const FileOptions& file_opts,
+                                  const std::string& file_name,
+                                  std::unique_ptr<CacheDumpWriter>* writer);
+
+// Get the reader which read out the metadata and data sequentially from a
+// file
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+                                    const FileOptions& file_opts,
+                                    const std::string& file_name,
+                                    std::unique_ptr<CacheDumpReader>* reader);
+
+// Get the default cache dumper
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+                             const std::shared_ptr<Cache>& cache,
+                             std::unique_ptr<CacheDumpWriter>&& writer,
+                             std::unique_ptr<CacheDumper>* cache_dumper);
+
+// Get the default cache dump loader
+Status NewDefaultCacheDumpedLoader(
+    const CacheDumpOptions& dump_options,
+    const BlockBasedTableOptions& toptions,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    std::unique_ptr<CacheDumpReader>&& reader,
+    std::unique_ptr<CacheDumpedLoader>* cache_dump_loader);
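+
+// Example (an illustrative sketch; `fs`, `block_cache`, `db`,
+// `table_options`, and `secondary_cache` are assumed to be set up elsewhere,
+// the dump file path is hypothetical, and error handling is elided): dump
+// the blocks of one DB to a file on host A, then load them into a secondary
+// cache on host B.
+//
+//   // Host A: dump
+//   std::unique_ptr<CacheDumpWriter> writer;
+//   NewToFileCacheDumpWriter(fs, FileOptions(), "/path/to/cache.dump",
+//                            &writer);
+//   CacheDumpOptions dump_opts;
+//   dump_opts.clock = SystemClock::Default().get();
+//   std::unique_ptr<CacheDumper> dumper;
+//   NewDefaultCacheDumper(dump_opts, block_cache, std::move(writer), &dumper);
+//   dumper->SetDumpFilter({db});
+//   dumper->DumpCacheEntriesToWriter();
+//
+//   // Host B: load into the secondary cache before reopening the DB
+//   std::unique_ptr<CacheDumpReader> reader;
+//   NewFromFileCacheDumpReader(fs, FileOptions(), "/path/to/cache.dump",
+//                              &reader);
+//   std::unique_ptr<CacheDumpedLoader> loader;
+//   NewDefaultCacheDumpedLoader(dump_opts, table_options, secondary_cache,
+//                               std::move(reader), &loader);
+//   loader->RestoreCacheEntriesToSecondaryCache();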
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 000000000..ecf920616
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;
+
+class Checkpoint {
+ public:
+  // Creates a Checkpoint object to be used for creating openable snapshots
+  static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+  // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an
+  // absolute path. The specified directory should not exist, since it will be
+  // created by the API.
+  // When a checkpoint is created,
+  // (1) SST and blob files are hard linked if the output directory is on the
+  // same filesystem as the database, and copied otherwise.
+  // (2) other required files (like MANIFEST) are always copied.
+  // log_size_for_flush: if the total log file size is equal or larger than
+  // this value, then a flush is triggered for all the column families. The
+  // default value is 0, which means flush is always triggered. If you move
+  // away from the default, the checkpoint may not contain up-to-date data
+  // if WAL writing is not always enabled.
+  // Flush will always trigger if 2PC is enabled.
+  // sequence_number_ptr: if it is not nullptr, the value it points to will be
+  // set to a sequence number guaranteed to be part of the DB, not necessarily
+  // the latest. The default value of this parameter is nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating checkpoints
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
+                                  uint64_t log_size_for_flush = 0,
+                                  uint64_t* sequence_number_ptr = nullptr);
+
+  // Exports all live SST files of a specified Column Family onto export_dir,
+  // returning SST files information in metadata.
+  // - SST files will be created as hard links when the directory specified
+  //   is in the same partition as the db directory, copied otherwise.
+  // - export_dir should not already exist and will be created by this API.
+  // - Always triggers a flush.
+  virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
+                                    const std::string& export_dir,
+                                    ExportImportFilesMetaData** metadata);
+
+  virtual ~Checkpoint() {}
+};
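+
+// Example (an illustrative sketch; the checkpoint path is hypothetical,
+// `db` is assumed to be an open DB*, and error handling is elided):
+//
+//   Checkpoint* checkpoint = nullptr;
+//   Status s = Checkpoint::Create(db, &checkpoint);
+//   if (s.ok()) {
+//     // The target directory must not exist yet; it is created by the call.
+//     s = checkpoint->CreateCheckpoint("/path/to/checkpoint_dir");
+//   }
+//   delete checkpoint;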
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
new file mode 100644
index 000000000..f61afd69e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// This file was moved to rocksdb/convenience.h
+
+#include "rocksdb/convenience.h"
diff --git a/src/rocksdb/include/rocksdb/utilities/customizable_util.h b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
new file mode 100644
index 000000000..62240763b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// The methods in this file are used to instantiate new Customizable
+// instances of objects. These methods are most typically used by
+// the "CreateFromString" method of a customizable class.
+// If not developing a new Type of customizable class, you probably
+// do not need the methods in this file.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use customizable objects
+
+#pragma once
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The FactoryFunc functions are used to create a new customizable object
+// without going through the ObjectRegistry. This methodology is especially
+// useful in LITE mode, where there is no ObjectRegistry. The methods take
+// in an ID of the object to create and a pointer to store the created object.
+// If the factory successfully recognized the input ID, the method should
+// return true; otherwise false should be returned. On success, the object
+// parameter contains the new object.
+template <typename T>
+using SharedFactoryFunc =
+    std::function<bool(const std::string&, std::shared_ptr<T>*)>;
+
+template <typename T>
+using UniqueFactoryFunc =
+    std::function<bool(const std::string&, std::unique_ptr<T>*)>;
+
+template <typename T>
+using StaticFactoryFunc = std::function<bool(const std::string&, T**)>;
+
+// Creates a new shared customizable instance object based on the
+// input parameters using the object registry.
+//
+// The id parameter specifies the instance class of the object to create.
+// The opt_map parameter specifies the configuration of the new instance.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance
+// types are ignored. If invoke_prepare_options=true, the resulting instance
+// will be initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewSharedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewSharedObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
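+
+// Example (an illustrative sketch; `MyCache` and its registration in the
+// ObjectRegistry are hypothetical): creating and configuring a shared
+// instance by id.
+//
+//   ConfigOptions config_options;
+//   std::shared_ptr<MyCache> cache;
+//   Status s = NewSharedObject<MyCache>(
+//       config_options, "MyCache", {{"capacity", "1048576"}}, &cache);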
+
+// Creates a new managed customizable instance object based on the
+// input parameters using the object registry. Unlike "shared" objects,
+// managed objects are limited to a single instance per ID.
+//
+// The id parameter specifies the instance class of the object to create.
+// If an object with this id exists in the registry, the existing object
+// will be returned. If the object does not exist, a new one will be created.
+//
+// The opt_map parameter specifies the configuration of the new instance.
+// If the object already exists, the existing object is returned "as is" and
+// this parameter is ignored.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance
+// types are ignored. If invoke_prepare_options=true, the resulting instance
+// will be initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the object. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create or return.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The managed instance.
+template <typename T>
+static Status NewManagedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  Status status;
+  if (!id.empty()) {
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->GetOrCreateManagedObject<T>(
+        id, result, [config_options, opt_map](T* object) {
+          return object->ConfigureFromMap(config_options, opt_map);
+        });
+#else
+    (void)result;
+    (void)opt_map;
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      return Status::OK();
+    }
+  } else {
+    status = Status::NotSupported("Cannot reset object ");
+  }
+  return status;
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+// This method parses the input value to determine the type of instance to
+// create. If there is an existing instance (in result) and it is the same ID
+// as the object being created, the existing configuration is stored and used
+// as the default for the new object.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings. If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance
+// types are ignored. If invoke_prepare_options=true, the resulting instance
+// will be initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadSharedObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const SharedFactoryFunc<T>& func,
+                               std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewSharedObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, result->get(),
+                                            opt_map);
+  }
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+//
+// The value parameter specifies the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings.
+// If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object. If name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The "id" field from the value (either the whole field or "id=XX") is used
+// to determine the type/id of the object to return. For a given id, the same
+// instance of the object will be returned from this method (as opposed to
+// LoadSharedObject, which would create different objects for the same id).
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance
+// types are ignored. If invoke_prepare_options=true, the resulting instance
+// will be initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadManagedObject(const ConfigOptions& config_options,
+                                const std::string& value,
+                                std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, nullptr, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (value.empty()) {  // No Id and no options. Clear the object
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return NewManagedObject(config_options, id, opt_map, result);
+  }
+}
+
+// Creates a new unique pointer customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewUniqueObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::unique_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewUniqueObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new unique customizable instance object based on the input
+// parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadUniqueObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const UniqueFactoryFunc<T>& func,
+                               std::unique_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewUniqueObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, result->get(),
+                                            opt_map);
+  }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on
+// the input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param id The identifier of the new object being created. This string
+//   will be used by the object registry to locate the appropriate object to
+//   create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+//   created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewStaticObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map, T** result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewStaticObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status =
+          Customizable::ConfigureNewObject(config_options, *result, opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
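+
+// Example (an illustrative sketch; `MyFilter` and its registration are
+// hypothetical): a typical CreateFromString implementation delegates to one
+// of the Load*Object helpers, optionally with a built-in factory that also
+// works in LITE mode.
+//
+//   Status MyFilter::CreateFromString(const ConfigOptions& config_options,
+//                                     const std::string& value,
+//                                     std::shared_ptr<MyFilter>* result) {
+//     SharedFactoryFunc<MyFilter> func =
+//         [](const std::string& id, std::shared_ptr<MyFilter>* f) {
+//           if (id == "Default") {
+//             f->reset(new MyFilter());
+//             return true;
+//           }
+//           return false;  // fall back to the ObjectRegistry
+//         };
+//     return LoadSharedObject<MyFilter>(config_options, value, func, result);
+//   }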
+
+// Creates a new static (raw pointer) customizable instance object based on
+// the input parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+//   handled
+// @param value Either the simple name of the instance to create, or a set of
+//   name-value pairs to create and initialize the object
+// @param func Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadStaticObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const StaticFactoryFunc<T>& func, T** result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, *result, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewStaticObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, *result, opt_map);
+  }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 000000000..d57e7473a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+// meant to be removed from the db in a non-strict 'ttl' amount of time.
+// Therefore, this guarantees that key-values inserted will remain in the
+// db for >= ttl amount of time and the db will make efforts to remove the
+// key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values are deleted in compaction only: (Timestamp+ttl < time_now)
+// Get/Iterator may return expired entries (compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+// triggered (neither manual nor automatic), so no expired entries are removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+// corrupt values (timestamp suffixed) and no ttl effect will be there
+// during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+// whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, const std::vector<int32_t>& ttls,
+                     bool read_only = false);
+
+  virtual void SetTtl(int32_t ttl) = 0;
+
+  virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
+
+ protected:
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
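+
+// Example (an illustrative sketch; the path is hypothetical and error
+// handling is elided): open a TTL DB where entries expire roughly a day
+// after insertion.
+//
+//   DBWithTTL* db = nullptr;
+//   Options options;
+//   options.create_if_missing = true;
+//   Status s = DBWithTTL::Open(options, "/path/to/ttl_db", &db,
+//                              /*ttl=*/86400);
+//   if (s.ok()) {
+//     s = db->Put(WriteOptions(), "key", "value");  // removed after >= ttl
+//   }
+//   delete db;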
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h
new file mode 100644
index 000000000..0e0526557
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/debug.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Data associated with a particular version of a key. A database may
+// internally store multiple versions of the same user key due to snapshots,
+// compaction not happening yet, etc.
+struct KeyVersion {
+  KeyVersion() : user_key(""), value(""), sequence(0), type(0) {}
+
+  KeyVersion(const std::string& _user_key, const std::string& _value,
+             SequenceNumber _sequence, int _type)
+      : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
+
+  std::string user_key;
+  std::string value;
+  SequenceNumber sequence;
+  int type;
+  std::string GetTypeName() const;
+};
+
+// Returns listing of all versions of keys in the provided user key range.
+// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`]; the scan
+// stops early once `max_num_ikeys` keys have been collected. Since all those
+// keys returned will be copied to memory, if the range covers too many keys,
+// the memory usage may be huge. `max_num_ikeys` can be used to cap the memory
+// usage.
+// The result is inserted into the provided vector, `key_versions`.
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+                         size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+                         Slice end_key, size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
new file mode 100644
index 000000000..ffde5effa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
@@ -0,0 +1,181 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// MirrorEnv is an Env implementation that mirrors all file-related
+// operations to two backing Env's (provided at construction time).
+// Writes are mirrored. For read operations, we do the read from both
+// backends and assert that the results match.
+//
+// This is useful when implementing a new Env and ensuring that the
+// semantics and behavior are correct (in that they match that of an
+// existing, stable Env, like the default POSIX one).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SequentialFileMirror;
+class RandomAccessFileMirror;
+class WritableFileMirror;
+
+class EnvMirror : public EnvWrapper {
+  Env *a_, *b_;
+  bool free_a_, free_b_;
+
+ public:
+  EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false)
+      : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {}
+  ~EnvMirror() {
+    if (free_a_) delete a_;
+    if (free_b_) delete b_;
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override;
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override;
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override;
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override;
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) override {
+    std::unique_ptr<Directory> br;
+    Status as = a_->NewDirectory(name, result);
+    Status bs = b_->NewDirectory(name, &br);
+    assert(as == bs);
+    return as;
+  }
+  Status FileExists(const std::string& f) override {
+    Status as = a_->FileExists(f);
+    Status bs = b_->FileExists(f);
+    assert(as == bs);
+    return as;
+  }
+#if defined(_MSC_VER)
+#pragma warning(push)
+// logical operation on address of string constant
+#pragma warning(disable : 4130)
+#endif
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    std::vector<std::string> ar, br;
+    Status as = a_->GetChildren(dir, &ar);
+    Status bs = b_->GetChildren(dir, &br);
+    assert(as == bs);
+    std::sort(ar.begin(), ar.end());
+    std::sort(br.begin(),
br.end()); + if (!as.ok() || ar != br) { + assert(0 == "getchildren results don't match"); + } + *r = ar; + return as; + } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + Status DeleteFile(const std::string& f) override { + Status as = a_->DeleteFile(f); + Status bs = b_->DeleteFile(f); + assert(as == bs); + return as; + } + Status CreateDir(const std::string& d) override { + Status as = a_->CreateDir(d); + Status bs = b_->CreateDir(d); + assert(as == bs); + return as; + } + Status CreateDirIfMissing(const std::string& d) override { + Status as = a_->CreateDirIfMissing(d); + Status bs = b_->CreateDirIfMissing(d); + assert(as == bs); + return as; + } + Status DeleteDir(const std::string& d) override { + Status as = a_->DeleteDir(d); + Status bs = b_->DeleteDir(d); + assert(as == bs); + return as; + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + uint64_t asize, bsize; + Status as = a_->GetFileSize(f, &asize); + Status bs = b_->GetFileSize(f, &bsize); + assert(as == bs); + assert(!as.ok() || asize == bsize); + *s = asize; + return as; + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + uint64_t amtime, bmtime; + Status as = a_->GetFileModificationTime(fname, &amtime); + Status bs = b_->GetFileModificationTime(fname, &bmtime); + assert(as == bs); + assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000); + *file_mtime = amtime; + return as; + } + + Status RenameFile(const std::string& s, const std::string& t) override { + Status as = a_->RenameFile(s, t); + Status bs = b_->RenameFile(s, t); + assert(as == bs); + return as; + } + + Status LinkFile(const std::string& s, const std::string& t) override { + Status as = a_->LinkFile(s, t); + Status bs = b_->LinkFile(s, t); + assert(as == bs); + return as; + } + + class FileLockMirror : public FileLock { + public: + FileLock *a_, *b_; + FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {} + }; + + Status LockFile(const std::string& f, FileLock** l) override { + FileLock *al, *bl; + Status as = a_->LockFile(f, &al); + Status bs = b_->LockFile(f, &bl); + assert(as == bs); + if (as.ok()) *l = new FileLockMirror(al, bl); + return as; + } + + Status UnlockFile(FileLock* l) override { + FileLockMirror* ml = static_cast<FileLockMirror*>(l); + Status as = a_->UnlockFile(ml->a_); + Status bs = b_->UnlockFile(ml->b_); + assert(as == bs); + delete ml; + return as; + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h new file mode 100644 index 000000000..824f8a3df --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// This function can be used to list the Information logs, +// given the db pointer. 
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h new file mode 100644 index 000000000..007638192 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h @@ -0,0 +1,318 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include <stdio.h> +#include <stdlib.h> + +#include <algorithm> +#include <functional> +#include <map> +#include <sstream> +#include <string> +#include <vector> + +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/ldb_tool.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/ldb_cmd_execute_result.h" + +namespace ROCKSDB_NAMESPACE { + +class LDBCommand { + public: + // Command-line arguments + static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; + static const std::string ARG_DB; + static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; + static const std::string ARG_HEX; + static const std::string ARG_KEY_HEX; + static const std::string ARG_VALUE_HEX; + static const std::string ARG_CF_NAME; + static const std::string ARG_TTL; + static const std::string ARG_TTL_START; + static const std::string ARG_TTL_END; + static const std::string ARG_TIMESTAMP; + static const std::string ARG_TRY_LOAD_OPTIONS; + static const std::string ARG_IGNORE_UNKNOWN_OPTIONS; + static const std::string ARG_FROM; + static const std::string ARG_TO; + static const std::string ARG_MAX_KEYS; + static const std::string ARG_BLOOM_BITS; + static const std::string ARG_FIX_PREFIX_LEN; + static const std::string ARG_COMPRESSION_TYPE; + static const std::string ARG_COMPRESSION_MAX_DICT_BYTES; + static const std::string ARG_BLOCK_SIZE; + static const std::string ARG_AUTO_COMPACTION; + static const std::string ARG_DB_WRITE_BUFFER_SIZE; + static const std::string ARG_WRITE_BUFFER_SIZE; + static const std::string ARG_FILE_SIZE; + static const std::string ARG_CREATE_IF_MISSING; + static const std::string ARG_NO_VALUE; + static const std::string ARG_DISABLE_CONSISTENCY_CHECKS; + static const std::string ARG_ENABLE_BLOB_FILES; + static const std::string ARG_MIN_BLOB_SIZE; + static const std::string ARG_BLOB_FILE_SIZE; + static const std::string ARG_BLOB_COMPRESSION_TYPE; + static const std::string ARG_ENABLE_BLOB_GARBAGE_COLLECTION; + static const std::string ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF; + static const std::string ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD; + static const std::string ARG_BLOB_COMPACTION_READAHEAD_SIZE; + static const std::string ARG_BLOB_FILE_STARTING_LEVEL; + static const std::string ARG_PREPOPULATE_BLOB_CACHE; + static const std::string ARG_DECODE_BLOB_INDEX; + static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS; + + struct ParsedParams { + std::string cmd; + std::vector<std::string> cmd_params; + std::map<std::string, std::string> option_map; + std::vector<std::string> flags; + }; + + static LDBCommand* SelectCommand(const ParsedParams& parsed_parms); + + static LDBCommand* InitFromCmdLineArgs( + const std::vector<std::string>& args, const Options& options, + const 
LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families,
+      const std::function<LDBCommand*(const ParsedParams&)>& selector =
+          SelectCommand);
+
+  static LDBCommand* InitFromCmdLineArgs(
+      int argc, char const* const* argv, const Options& options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families);
+
+  bool ValidateCmdLineOptions();
+
+  virtual void PrepareOptions();
+
+  virtual void OverrideBaseOptions();
+
+  virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts);
+
+  virtual void SetDBOptions(Options options) { options_ = options; }
+
+  virtual void SetColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>* column_families) {
+    if (column_families != nullptr) {
+      column_families_ = *column_families;
+    } else {
+      column_families_.clear();
+    }
+  }
+
+  void SetLDBOptions(const LDBOptions& ldb_options) {
+    ldb_options_ = ldb_options;
+  }
+
+  const std::map<std::string, std::string>& TEST_GetOptionMap() {
+    return option_map_;
+  }
+
+  const std::vector<std::string>& TEST_GetFlags() { return flags_; }
+
+  virtual bool NoDBOpen() { return false; }
+
+  virtual ~LDBCommand() { CloseDB(); }
+
+  /* Run the command, and return the execute result. */
+  void Run();
+
+  virtual void DoCommand() = 0;
+
+  LDBCommandExecuteResult GetExecuteState() { return exec_state_; }
+
+  void ClearPreviousRunState() { exec_state_.Reset(); }
+
+  // Consider using Slice::DecodeHex directly instead if you don't need the
+  // 0x prefix
+  static std::string HexToString(const std::string& str);
+
+  // Consider using Slice::ToString(true) directly instead if
+  // you don't need the 0x prefix
+  static std::string StringToHex(const std::string& str);
+
+  static const char* DELIM;
+
+ protected:
+  LDBCommandExecuteResult exec_state_;
+  std::string env_uri_;
+  std::string fs_uri_;
+  std::string db_path_;
+  // If empty, open DB as primary. If non-empty, open the DB as secondary
+  // with this secondary path. When running against a database opened by
+  // another process, ldb will leave the source directory completely intact.
+  std::string secondary_path_;
+  std::string column_family_name_;
+  DB* db_;
+  DBWithTTL* db_ttl_;
+  std::map<std::string, ColumnFamilyHandle*> cf_handles_;
+
+  /**
+   * true implies that this command can work if the db is opened in read-only
+   * mode.
+   */
+  bool is_read_only_;
+
+  /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+  bool is_key_hex_;
+
+  /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+  bool is_value_hex_;
+
+  /** If true, the value is treated as timestamp suffixed */
+  bool is_db_ttl_;
+
+  // If true, the kvs are output with their insert/modify timestamp in a ttl db
+  bool timestamp_;
+
+  // If true, try to construct options from DB's option files.
+  bool try_load_options_;
+
+  // The value passed to options.force_consistency_checks.
+  bool force_consistency_checks_;
+
+  bool enable_blob_files_;
+
+  bool enable_blob_garbage_collection_;
+
+  bool create_if_missing_;
+
+  /**
+   * Map of options passed on the command-line.
+   */
+  const std::map<std::string, std::string> option_map_;
+
+  /**
+   * Flags passed on the command-line.
+ */ + const std::vector<std::string> flags_; + + /** List of command-line options valid for this command */ + const std::vector<std::string> valid_cmd_line_options_; + + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr<Env> env_guard_; + + bool ParseKeyValue(const std::string& line, std::string* key, + std::string* value, bool is_key_hex, bool is_value_hex); + + LDBCommand(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags, bool is_read_only, + const std::vector<std::string>& valid_cmd_line_options); + + void OpenDB(); + + void CloseDB(); + + ColumnFamilyHandle* GetCfHandle(); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex); + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const std::vector<std::string>& flags, + const std::string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static std::string HelpRangeCmdArgs(); + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + static std::vector<std::string> BuildCmdLineOptions( + std::vector<std::string> options); + + bool ParseIntOption(const std::map<std::string, std::string>& options, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state); + + bool ParseDoubleOption(const std::map<std::string, std::string>& options, + const std::string& option, double& value, + LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const std::map<std::string, std::string>& options, + const std::string& option, std::string* value); + + bool ParseCompressionTypeOption( + const std::map<std::string, std::string>& options, + const std::string& option, CompressionType& value, + LDBCommandExecuteResult& exec_state); + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map<std::string, std::string>& options, + const std::string& option, bool default_val); + + Options options_; + std::vector<ColumnFamilyDescriptor> column_families_; + ConfigOptions config_options_; + LDBOptions ldb_options_; + + private: + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. + */ + bool IsValueHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + bool IsTryLoadOptions(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(std::string val); +}; + +class LDBCommandRunner { + public: + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name, + bool to_stderr = true); + + // Returns the status code to return. 0 is no error. 
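To make the LDBCommand machinery above concrete, here is a hedged sketch of a custom read-only command; `MyCountCommand` is invented for illustration, it assumes a translation unit that includes rocksdb/utilities/ldb_cmd.h, and it assumes Run() opens the DB before invoking DoCommand() when NoDBOpen() is left at false, as the header implies. LDBCommandRunner::RunCommand, declared next, would drive such commands from a tool's main().

class MyCountCommand : public rocksdb::LDBCommand {
 public:
  MyCountCommand(const std::map<std::string, std::string>& options,
                 const std::vector<std::string>& flags)
      : LDBCommand(options, flags, /*is_read_only=*/true,
                   BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX})) {}

  void DoCommand() override {
    // db_ was opened by Run() since NoDBOpen() returns false.
    std::unique_ptr<rocksdb::Iterator> it(
        db_->NewIterator(rocksdb::ReadOptions()));
    uint64_t count = 0;
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      ++count;
    }
    exec_state_ = rocksdb::LDBCommandExecuteResult::Succeed(
        std::to_string(count) + " keys");
  }
};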
+ static int RunCommand( + int argc, char const* const* argv, Options options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h new file mode 100644 index 000000000..57bac3346 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +#ifdef FAILED +#undef FAILED +#endif + +namespace ROCKSDB_NAMESPACE { + +class LDBCommandExecuteResult { + public: + enum State { + EXEC_NOT_STARTED = 0, + EXEC_SUCCEED = 1, + EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} + + LDBCommandExecuteResult(State state, std::string& msg) + : state_(state), message_(msg) {} + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { return state_ == EXEC_SUCCEED; } + + bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; } + + bool IsFailed() { return state_ == EXEC_FAILED; } + + static LDBCommandExecuteResult Succeed(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult Failed(std::string msg) { + return LDBCommandExecuteResult(EXEC_FAILED, msg); + } + + private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h new file mode 100644 index 000000000..7e4a6faa4 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stddef.h> + +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +struct Options; +class Snapshot; + +// Options to control the behavior of a database (passed to +// DB::Open). A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. 
+struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. 
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression. Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // If non-NULL, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: NULL
+  const FilterPolicy* filter_policy;
+
+  // Create a LevelDBOptions object with default values for all fields.
+  LevelDBOptions();
+};
+
+// Converts a LevelDBOptions object into a RocksDB Options object.
+Options ConvertOptions(const LevelDBOptions& leveldb_options);
+
+}  // namespace ROCKSDB_NAMESPACE
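A brief sketch of the migration path leveldb_options.h above is meant for: configure a LevelDBOptions as one would a LevelDB Options object, then convert. The field values here are arbitrary.

#include "rocksdb/db.h"
#include "rocksdb/utilities/leveldb_options.h"

rocksdb::Options FromLevelDBStyleConfig() {
  rocksdb::LevelDBOptions lopts;  // starts from the LevelDB defaults above
  lopts.create_if_missing = true;
  lopts.write_buffer_size = 8 * 1024 * 1024;  // 8MB instead of the 4MB default
  lopts.max_open_files = 500;
  return rocksdb::ConvertOptions(lopts);  // now usable with rocksdb::DB::Open
}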
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
new file mode 100644
index 000000000..f617da02b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifdef LUA
+
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+// A class used to define a custom C library that is callable
+// from Lua scripts
+class RocksLuaCustomLibrary {
+ public:
+  virtual ~RocksLuaCustomLibrary() {}
+  // The name of the C library. This name will also be used as the table
+  // (namespace) in Lua that contains the C library.
+  virtual const char* Name() const = 0;
+
+  // Returns a "static const struct luaL_Reg[]", which includes a list of
+  // C functions. Note that the last entry of this static array must be
+  // {nullptr, nullptr} as required by Lua.
+  //
+  // More details about how to implement Lua C libraries can be found
+  // in the official Lua document http://www.lua.org/pil/26.2.html
+  virtual const struct luaL_Reg* Lib() const = 0;
+
+  // A function that will be called right after the library has been created
+  // and pushed on the top of the lua_State. This custom setup function
+  // allows developers to put additional table or constant values inside
+  // the same table / namespace.
+  virtual void CustomSetup(lua_State* /*L*/) const {}
+};
+}  // namespace lua
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
new file mode 100644
index 000000000..3427b65ef
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2016, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+#ifdef LUA
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+class LuaStateWrapper {
+ public:
+  explicit LuaStateWrapper(const std::string& lua_script) {
+    lua_state_ = luaL_newstate();
+    Init(lua_script, {});
+  }
+  LuaStateWrapper(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+    lua_state_ = luaL_newstate();
+    Init(lua_script, libraries);
+  }
+  lua_State* GetLuaState() const { return lua_state_; }
+  ~LuaStateWrapper() { lua_close(lua_state_); }
+
+ private:
+  void Init(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+    if (lua_state_) {
+      luaL_openlibs(lua_state_);
+      for (const auto& library : libraries) {
+        luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
+        library->CustomSetup(lua_state_);
+      }
+      luaL_dostring(lua_state_, lua_script.c_str());
+    }
+  }
+
+  lua_State* lua_state_;
+};
+}  // namespace lua
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h
new file mode 100644
index 000000000..4f1606b51
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the current memory usage of the specified DB instances.
+class MemoryUtil {
+ public:
+  enum UsageType : int {
+    // Memory usage of all the mem-tables.
+    kMemTableTotal = 0,
+    // Memory usage of those un-flushed mem-tables.
+    kMemTableUnFlushed = 1,
+    // Memory usage of all the table readers.
+    kTableReadersTotal = 2,
+    // Memory usage by Cache.
+    kCacheTotal = 3,
+    kNumUsageTypes = 4
+  };
+
+  // Returns the approximate memory usage of different types in the input
+  // list of DBs and Cache set. For instance, in the output map
+  // usage_by_type, usage_by_type[kMemTableTotal] will store the memory
+  // usage of all the mem-tables from all the input rocksdb instances.
+  //
+  // Note that for memory usage inside Cache class, we will
+  // only report the usage of the input "cache_set" without
+  // including those Cache usage inside the input list "dbs"
+  // of DBs.
+  static Status GetApproximateMemoryUsageByType(
+      const std::vector<DB*>& dbs,
+      const std::unordered_set<const Cache*> cache_set,
+      std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
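A minimal sketch of querying per-type usage with memory_util.h above, for one DB and its block cache; error handling is trimmed, and the cache pointer is assumed to be the one installed in the DB's table factory.

#include <cinttypes>
#include <cstdio>
#include <map>
#include <unordered_set>

#include "rocksdb/utilities/memory_util.h"

void ReportMemoryUsage(rocksdb::DB* db, const rocksdb::Cache* block_cache) {
  std::map<rocksdb::MemoryUtil::UsageType, uint64_t> usage_by_type;
  rocksdb::Status s = rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(
      {db}, {block_cache}, &usage_by_type);
  if (s.ok()) {
    // kMemTableTotal covers both flushed (pinned) and un-flushed mem-tables.
    std::printf("memtables: %" PRIu64 " bytes\n",
                usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
  }
}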
diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h
new file mode 100644
index 000000000..3bafb837c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h
@@ -0,0 +1,585 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Customizable;
+class Logger;
+class ObjectLibrary;
+
+// Returns a new T when called with a string. Populates the std::unique_ptr
+// argument if granting ownership to caller.
+template <typename T>
+using FactoryFunc =
+    std::function<T*(const std::string&, std::unique_ptr<T>*, std::string*)>;
+
+// The signature of the function for loading factories
+// into an object library. This method is expected to register
+// factory functions in the supplied ObjectLibrary.
+// The ObjectLibrary is the library in which the factories will be loaded.
+// The std::string is the argument passed to the loader function.
+// The RegistrarFunc should return the number of objects loaded into this
+// library.
+using RegistrarFunc = std::function<int(ObjectLibrary&, const std::string&)>;
+
+template <typename T>
+using ConfigureFunc = std::function<Status(T*)>;
+
+class ObjectLibrary {
+ private:
+  // Base class for an Entry in the Registry.
+  class Entry {
+   public:
+    virtual ~Entry() {}
+    virtual bool Matches(const std::string& target) const = 0;
+    virtual const char* Name() const = 0;
+  };
+
+ public:
+  // Class for matching target strings to a pattern.
+  // Entries consist of a name that starts the pattern and attributes
+  // The following attributes can be added to the entry:
+  //   -Suffix: Comparable to name(suffix)
+  //   -Separator: Comparable to name(separator).+ or name(separator).*
+  //   -Number: Comparable to name(separator).[0-9]+
+  //   -AltName: Comparable to (name|alt)
+  //   -Optional: Comparable to name(separator)?
+  // Multiple separators can be combined and cause multiple matches.
+  // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#")
+  // is roughly equivalent to "(A|B)@.+#.+"
+  //
+  // Note that though this class does provide some regex-style matching,
+  // it is not a full regex parser and has some key differences:
+  //   - Separators are matched left-most. For example, an entry
+  //     Name("Hello").AddSeparator(" ").AddSuffix("!") would match
+  //     "Hello world!", but not "Hello world!!"
+  //   - No backtracking is necessary, enabling reliably efficient matching
+  class PatternEntry : public Entry {
+   private:
+    enum Quantifier {
+      kMatchZeroOrMore,  // [suffix].*
+      kMatchAtLeastOne,  // [suffix].+
+      kMatchExact,       // [suffix]
+      kMatchInteger,     // [suffix][0-9]+
+      kMatchDecimal,     // [suffix][0-9]+[.][0-9]+
+    };
+
+   public:
+    // Short-cut for creating an entry that matches to a
+    // Customizable::IndividualId
+    static PatternEntry AsIndividualId(const std::string& name) {
+      PatternEntry entry(name, true);
+      entry.AddSeparator("@");
+      entry.AddSeparator("#");
+      return entry;
+    }
+
+    // Creates a new PatternEntry for "name".
If optional is true, + // Matches will also return true if name==target + explicit PatternEntry(const std::string& name, bool optional = true) + : name_(name), optional_(optional), slength_(0) { + nlength_ = name_.size(); + } + + // Adds a suffix (exact match of separator with no trailing characters) to + // the separator + PatternEntry& AddSuffix(const std::string& suffix) { + separators_.emplace_back(suffix, kMatchExact); + slength_ += suffix.size(); + return *this; + } + + // Adds a separator (exact match of separator with trailing characters) to + // the entry + // If at_least_one is true, the separator must be followed by at least + // one character (e.g. separator.+). + // If at_least_one is false, the separator may be followed by zero or + // more characters (e.g. separator.*). + PatternEntry& AddSeparator(const std::string& separator, + bool at_least_one = true) { + slength_ += separator.size(); + if (at_least_one) { + separators_.emplace_back(separator, kMatchAtLeastOne); + ++slength_; + } else { + separators_.emplace_back(separator, kMatchZeroOrMore); + } + return *this; + } + + // Adds a separator (exact match of separator with trailing numbers) to the + // entry + PatternEntry& AddNumber(const std::string& separator, bool is_int = true) { + separators_.emplace_back(separator, + (is_int) ? kMatchInteger : kMatchDecimal); + slength_ += separator.size() + 1; + return *this; + } + + // Sets another name that this entry will match, similar to (name|alt) + PatternEntry& AnotherName(const std::string& alt) { + names_.emplace_back(alt); + return *this; + } + + // Sets whether the separators are required -- similar to name(separator)? + // If optional is true, then name(separator)? would match + // If optional is false, then the separators must also match + PatternEntry& SetOptional(bool optional) { + optional_ = optional; + return *this; + } + + // Checks to see if the target matches this entry + bool Matches(const std::string& target) const override; + const char* Name() const override { return name_.c_str(); } + + private: + size_t MatchSeparatorAt(size_t start, Quantifier mode, + const std::string& target, size_t tlen, + const std::string& pattern) const; + + bool MatchesTarget(const std::string& name, size_t nlen, + const std::string& target, size_t ylen) const; + std::string name_; // The base name for this entry + size_t nlength_; // The length of name_ + std::vector<std::string> names_; // Alternative names for this entry + bool optional_; // Whether matching of separators is required + size_t slength_; // The minimum required length to match the separators + std::vector<std::pair<std::string, Quantifier>> + separators_; // What to match + }; // End class Entry + + private: + // An Entry containing a FactoryFunc for creating new Objects + template <typename T> + class FactoryEntry : public Entry { + public: + FactoryEntry(Entry* e, FactoryFunc<T> f) + : entry_(e), factory_(std::move(f)) {} + bool Matches(const std::string& target) const override { + return entry_->Matches(target); + } + const char* Name() const override { return entry_->Name(); } + + // Creates a new T object. 
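As an illustration of the matching rules described above (the entry and the ids it matches are invented, not registered RocksDB names), the following builds an entry that accepts either base name, optionally followed by a numeric suffix:

// Roughly "(speedy|fast)(:<int>)?" under the left-most matching rules above.
static rocksdb::ObjectLibrary::PatternEntry MakeSpeedyEntry() {
  rocksdb::ObjectLibrary::PatternEntry entry("speedy", /*optional=*/true);
  entry.AnotherName("fast");  // also match the alternate base name
  entry.AddNumber(":");       // e.g. "speedy:16" or "fast:4"
  return entry;
}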
+ T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } + const FactoryFunc<T>& GetFactory() const { return factory_; } + + private: + std::unique_ptr<Entry> entry_; // What to match for this entry + FactoryFunc<T> factory_; + }; // End class FactoryEntry + public: + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } + + // Finds the factory function for the input target. + // @see PatternEntry for the matching rules to target + // @return If matched, the FactoryFunc for this target, else nullptr + template <typename T> + FactoryFunc<T> FindFactory(const std::string& target) const { + std::unique_lock<std::mutex> lock(mu_); + auto factories = factories_.find(T::Type()); + if (factories != factories_.end()) { + for (const auto& e : factories->second) { + if (e->Matches(target)) { + const auto* fe = + static_cast<const ObjectLibrary::FactoryEntry<T>*>(e.get()); + return fe->GetFactory(); + } + } + } + return nullptr; + } + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(size_t* num_types) const; + + // Returns the number of factories registered for this library + // for the input type. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(const std::string& type) const; + + // Returns the registered factory names for the input type + // names is updated to include the names for the type + void GetFactoryNames(const std::string& type, + std::vector<std::string>* names) const; + + void GetFactoryTypes(std::unordered_set<std::string>* types) const; + + void Dump(Logger* logger) const; + + // Registers the factory with the library for the name. + // If name==target, the factory may be used to create a new object. + template <typename T> + const FactoryFunc<T>& AddFactory(const std::string& name, + const FactoryFunc<T>& func) { + std::unique_ptr<Entry> entry( + new FactoryEntry<T>(new PatternEntry(name), func)); + AddFactoryEntry(T::Type(), std::move(entry)); + return func; + } + + // Registers the factory with the library for the entry. + // If the entry matches the target, the factory may be used to create a new + // object. + // @see PatternEntry for the matching rules. + // NOTE: This function replaces the old ObjectLibrary::Register() + template <typename T> + const FactoryFunc<T>& AddFactory(const PatternEntry& entry, + const FactoryFunc<T>& func) { + std::unique_ptr<Entry> factory( + new FactoryEntry<T>(new PatternEntry(entry), func)); + AddFactoryEntry(T::Type(), std::move(factory)); + return func; + } + + // Invokes the registrar function with the supplied arg for this library. 
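A hedged sketch of registering a factory with AddFactory(); the id "crashable" and the CrashableEnv class are invented for illustration, the FactoryFunc shape follows the typedef at the top of this header, and it assumes Env exposes the static Type() the registry keys on, as RocksDB Customizable types do. Register(), declared next, is what invokes such a registrar.

#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

// Invented Env subclass, used only to give the factory something to build.
class CrashableEnv : public rocksdb::EnvWrapper {
 public:
  CrashableEnv() : EnvWrapper(rocksdb::Env::Default()) {}
  static const char* kClassName() { return "crashable"; }
  const char* Name() const override { return kClassName(); }
};

static int RegisterCrashableEnv(rocksdb::ObjectLibrary& library,
                                const std::string& /*arg*/) {
  library.AddFactory<rocksdb::Env>(
      CrashableEnv::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<rocksdb::Env>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new CrashableEnv());  // guarded: ownership via guard
        return guard->get();
      });
  return 1;  // number of factories added, per the RegistrarFunc contract
}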
+  int Register(const RegistrarFunc& registrar, const std::string& arg) {
+    return registrar(*this, arg);
+  }
+
+  // Returns the default ObjectLibrary
+  static std::shared_ptr<ObjectLibrary>& Default();
+
+ private:
+  void AddFactoryEntry(const char* type, std::unique_ptr<Entry>&& entry) {
+    std::unique_lock<std::mutex> lock(mu_);
+    auto& factories = factories_[type];
+    factories.emplace_back(std::move(entry));
+  }
+
+  // Protects the entry map
+  mutable std::mutex mu_;
+  // ** FactoryFunctions for this loader, organized by type
+  std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>>
+      factories_;
+
+  // The name for this library
+  std::string id_;
+};
+
+// The ObjectRegistry is used to register objects that can be created by a
+// name/pattern at run-time where the specific implementation of the object may
+// not be known in advance.
+class ObjectRegistry {
+ public:
+  static std::shared_ptr<ObjectRegistry> NewInstance();
+  static std::shared_ptr<ObjectRegistry> NewInstance(
+      const std::shared_ptr<ObjectRegistry>& parent);
+  static std::shared_ptr<ObjectRegistry> Default();
+  explicit ObjectRegistry(const std::shared_ptr<ObjectRegistry>& parent)
+      : parent_(parent) {}
+  explicit ObjectRegistry(const std::shared_ptr<ObjectLibrary>& library);
+
+  std::shared_ptr<ObjectLibrary> AddLibrary(const std::string& id) {
+    auto library = std::make_shared<ObjectLibrary>(id);
+    AddLibrary(library);
+    return library;
+  }
+
+  void AddLibrary(const std::shared_ptr<ObjectLibrary>& library) {
+    std::unique_lock<std::mutex> lock(library_mutex_);
+    libraries_.push_back(library);
+  }
+
+  void AddLibrary(const std::string& id, const RegistrarFunc& registrar,
+                  const std::string& arg) {
+    auto library = AddLibrary(id);
+    library->Register(registrar, arg);
+  }
+
+  // Finds the factory for target and instantiates a new T.
+  // Returns NotSupported if no factory is found
+  // Returns InvalidArgument if a factory is found but the factory failed.
+  template <typename T>
+  Status NewObject(const std::string& target, T** object,
+                   std::unique_ptr<T>* guard) {
+    assert(guard != nullptr);
+    guard->reset();
+    auto factory = FindFactory<T>(target);
+    if (factory != nullptr) {
+      std::string errmsg;
+      *object = factory(target, guard, &errmsg);
+      if (*object != nullptr) {
+        return Status::OK();
+      } else if (errmsg.empty()) {
+        return Status::InvalidArgument(
+            std::string("Could not load ") + T::Type(), target);
+      } else {
+        return Status::InvalidArgument(errmsg, target);
+      }
+    } else {
+      return Status::NotSupported(std::string("Could not load ") + T::Type(),
+                                  target);
+    }
+  }
+  // Creates a new unique T using the input factory functions.
+  // Returns OK if a new unique T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory returns an unguarded object
+  // (meaning it cannot be managed by a unique ptr)
+  template <typename T>
+  Status NewUniqueObject(const std::string& target,
+                         std::unique_ptr<T>* result) {
+    T* ptr = nullptr;
+    std::unique_ptr<T> guard;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard) {
+      result->reset(guard.release());
+      return Status::OK();
+    } else {
+      return Status::InvalidArgument(std::string("Cannot make a unique ") +
+                                         T::Type() + " from unguarded one ",
+                                     target);
+    }
+  }
+
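Continuing the sketch from the previous example, the registrar can be attached to a registry and an object created by id; "crashable" and "my_envs" remain invented names.

rocksdb::Status MakeCrashableEnv(std::unique_ptr<rocksdb::Env>* env) {
  auto registry = rocksdb::ObjectRegistry::NewInstance();
  registry->AddLibrary("my_envs", RegisterCrashableEnv, /*arg=*/"");
  // Fails with NotSupported if no registered factory matches "crashable".
  return registry->NewUniqueObject<rocksdb::Env>("crashable", env);
}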
+  // Creates a new shared T using the input factory functions.
+  // Returns OK if a new shared T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory returns an unguarded object
+  // (meaning it cannot be managed by a shared ptr)
+  template <typename T>
+  Status NewSharedObject(const std::string& target,
+                         std::shared_ptr<T>* result) {
+    std::unique_ptr<T> guard;
+    T* ptr = nullptr;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard) {
+      result->reset(guard.release());
+      return Status::OK();
+    } else {
+      return Status::InvalidArgument(std::string("Cannot make a shared ") +
+                                         T::Type() + " from unguarded one ",
+                                     target);
+    }
+  }
+
+  // Creates a new static T using the input factory functions.
+  // Returns OK if a new static T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory returns a guarded object
+  // (meaning it is managed by a unique ptr)
+  template <typename T>
+  Status NewStaticObject(const std::string& target, T** result) {
+    std::unique_ptr<T> guard;
+    T* ptr = nullptr;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard.get()) {
+      return Status::InvalidArgument(std::string("Cannot make a static ") +
+                                         T::Type() + " from a guarded one ",
+                                     target);
+    } else {
+      *result = ptr;
+      return Status::OK();
+    }
+  }
+
+  // Sets the object for the given id/type to be the input object
+  // If the registry does not contain this id/type, the object is added and OK
+  // is returned. If the registry contains a different object, an error is
+  // returned. If the registry contains the input object, OK is returned.
+  template <typename T>
+  Status SetManagedObject(const std::shared_ptr<T>& object) {
+    assert(object != nullptr);
+    return SetManagedObject(object->GetId(), object);
+  }
+
+  template <typename T>
+  Status SetManagedObject(const std::string& id,
+                          const std::shared_ptr<T>& object) {
+    const auto c = std::static_pointer_cast<Customizable>(object);
+    return SetManagedObject(T::Type(), id, c);
+  }
+
+  // Returns the object for the given id, if one exists.
+  // If the object is not found in the registry, a nullptr is returned
+  template <typename T>
+  std::shared_ptr<T> GetManagedObject(const std::string& id) const {
+    auto c = GetManagedObject(T::Type(), id);
+    return std::static_pointer_cast<T>(c);
+  }
+
+  // Returns the set of managed objects found in the registry matching
+  // the input type and ID.
+  // If the input id is not empty, then only objects of that class
+  // (IsInstanceOf(id)) will be returned (for example, only return LRUCache
+  // objects). If the input id is empty, then all objects of that type (all
+  // Cache objects) will be returned.
+  template <typename T>
+  Status ListManagedObjects(const std::string& id,
+                            std::vector<std::shared_ptr<T>>* results) const {
+    std::vector<std::shared_ptr<Customizable>> customizables;
+    results->clear();
+    Status s = ListManagedObjects(T::Type(), id, &customizables);
+    if (s.ok()) {
+      for (const auto& c : customizables) {
+        results->push_back(std::static_pointer_cast<T>(c));
+      }
+    }
+    return s;
+  }
+
+  template <typename T>
+  Status ListManagedObjects(std::vector<std::shared_ptr<T>>* results) const {
+    return ListManagedObjects("", results);
+  }
+
+  // Creates a new ManagedObject in the registry for the id if one does not
+  // currently exist. If an object with that ID already exists, the current
+  // object is returned.
+  //
+  // The ID is the identifier of the object to be returned/created and returned
+  // in result
+  // If a new object is created (using the object factories), the cfunc
+  // parameter will be invoked to configure the new object.
+  template <typename T>
+  Status GetOrCreateManagedObject(const std::string& id,
+                                  std::shared_ptr<T>* result,
+                                  const ConfigureFunc<T>& cfunc = nullptr) {
+    if (parent_ != nullptr) {
+      auto object = parent_->GetManagedObject(T::Type(), id);
+      if (object != nullptr) {
+        *result = std::static_pointer_cast<T>(object);
+        return Status::OK();
+      }
+    }
+    {
+      std::unique_lock<std::mutex> lock(objects_mutex_);
+      auto key = ToManagedObjectKey(T::Type(), id);
+      auto iter = managed_objects_.find(key);
+      if (iter != managed_objects_.end()) {
+        auto object = iter->second.lock();
+        if (object != nullptr) {
+          *result = std::static_pointer_cast<T>(object);
+          return Status::OK();
+        }
+      }
+      std::shared_ptr<T> object;
+      Status s = NewSharedObject(id, &object);
+      if (s.ok() && cfunc != nullptr) {
+        s = cfunc(object.get());
+      }
+      if (s.ok()) {
+        auto c = std::static_pointer_cast<Customizable>(object);
+        if (id != c->Name()) {
+          // If the ID is not the base name of the class, add the new
+          // object under the input ID
+          managed_objects_[key] = c;
+        }
+        if (id != c->GetId() && c->GetId() != c->Name()) {
+          // If the input and current ID do not match, and the
+          // current ID is not the base name, add the new object under
+          // its new ID
+          key = ToManagedObjectKey(T::Type(), c->GetId());
+          managed_objects_[key] = c;
+        }
+        *result = object;
+      }
+      return s;
+    }
+  }
+
+  // Returns the number of factories registered for this library
+  // for the input type.
+  // @param num_types returns how many unique types are registered.
+  size_t GetFactoryCount(const std::string& type) const;
+
+  // Returns the names of registered factories for the input type.
+  // names is updated to include the names for the type
+  void GetFactoryNames(const std::string& type,
+                       std::vector<std::string>* names) const;
+
+  void GetFactoryTypes(std::unordered_set<std::string>* types) const;
+
+  // Dump the contents of the registry to the logger
+  void Dump(Logger* logger) const;
+
+  // Invokes the input function to retrieve the properties for this plugin.
+  int RegisterPlugin(const std::string& name, const RegistrarFunc& func);
+
+ private:
+  static std::string ToManagedObjectKey(const std::string& type,
+                                        const std::string& id) {
+    return type + "://" + id;
+  }
+
+  // Returns the Customizable managed object associated with the key (Type/ID).
+  // If not found, nullptr is returned.
+  std::shared_ptr<Customizable> GetManagedObject(const std::string& type,
+                                                 const std::string& id) const;
+  Status ListManagedObjects(
+      const std::string& type, const std::string& pattern,
+      std::vector<std::shared_ptr<Customizable>>* results) const;
+  // Sets the managed object associated with the key (Type/ID) to c.
+  // If the named managed object does not exist, the object is added and OK is
+  // returned. If the object exists and is the same as c, OK is returned.
+  // Otherwise, an error status is returned.
+  Status SetManagedObject(const std::string& type, const std::string& id,
+                          const std::shared_ptr<Customizable>& c);
+
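A small sketch of the managed-object path described above; it assumes a factory for the requested id has already been registered, and the ConfigureFunc body is illustrative.

template <typename T>
rocksdb::Status GetSharedInstance(rocksdb::ObjectRegistry* registry,
                                  const std::string& id,
                                  std::shared_ptr<T>* result) {
  // Reuses a live instance keyed by (T::Type(), id) when one exists;
  // otherwise creates one via NewSharedObject and configures it once.
  return registry->GetOrCreateManagedObject<T>(id, result, [](T* t) {
    (void)t;  // e.g. apply post-construction configuration here
    return rocksdb::Status::OK();
  });
}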
+  // Searches (from back to front) the libraries looking for the
+  // factory that matches this name.
+  // Returns the factory if it is found, and nullptr otherwise
+  template <typename T>
+  const FactoryFunc<T> FindFactory(const std::string& name) const {
+    {
+      std::unique_lock<std::mutex> lock(library_mutex_);
+      for (auto iter = libraries_.crbegin(); iter != libraries_.crend();
+           ++iter) {
+        const auto factory = iter->get()->FindFactory<T>(name);
+        if (factory != nullptr) {
+          return factory;
+        }
+      }
+    }
+    if (parent_ == nullptr) {
+      return nullptr;
+    } else {
+      return parent_->FindFactory<T>(name);
+    }
+  }
+
+  // The set of libraries to search for factories for this registry.
+  // The libraries are searched in reverse order (back to front) when
+  // searching for entries.
+  std::vector<std::shared_ptr<ObjectLibrary>> libraries_;
+  std::vector<std::string> plugins_;
+  static std::unordered_map<std::string, RegistrarFunc> builtins_;
+  std::map<std::string, std::weak_ptr<Customizable>> managed_objects_;
+  std::shared_ptr<ObjectRegistry> parent_;
+  mutable std::mutex objects_mutex_;  // Mutex for managed objects
+  mutable std::mutex library_mutex_;  // Mutex for managed libraries
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
new file mode 100644
index 000000000..c070e49a3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Transaction;
+
+// Database with Transaction support.
+//
+// See optimistic_transaction.h and examples/transaction_example.cc
+
+// Options to use when starting an Optimistic Transaction
+struct OptimisticTransactionOptions {
+  // Setting set_snapshot=true is the same as calling SetSnapshot().
+  bool set_snapshot = false;
+
+  // Should be set if the DB has a non-default comparator.
+  // See comment in WriteBatchWithIndex constructor.
+  const Comparator* cmp = BytewiseComparator();
+};
+
+enum class OccValidationPolicy {
+  // Validate serially at commit stage, AFTER entering the write-group.
+  // Isolation validation is processed single-threaded (since it is in the
+  // write-group).
+  // May suffer from high mutex contention, as per this link:
+  // https://github.com/facebook/rocksdb/issues/4402
+  kValidateSerial = 0,
+  // Validate in parallel before the commit stage, BEFORE entering the
+  // write-group, to reduce mutex contention. Each txn acquires locks for its
+  // write-set records in some well-defined order.
+  kValidateParallel = 1
+};
+
+struct OptimisticTransactionDBOptions {
+  OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel;
+
+  // Works only if validate_policy == OccValidationPolicy::kValidateParallel
+  uint32_t occ_lock_buckets = (1 << 20);
+};
+
+// Range deletions (including those in `WriteBatch`es passed to `Write()`) are
+// incompatible with `OptimisticTransactionDB` and will return a non-OK
+// `Status`
+class OptimisticTransactionDB : public StackableDB {
+ public:
+  // Open an OptimisticTransactionDB similar to DB::Open().
+  static Status Open(const Options& options, const std::string& dbname,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const OptimisticTransactionDBOptions& occ_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  virtual ~OptimisticTransactionDB() {}
+
+  // Starts a new Transaction.
+  //
+  // Caller is responsible for deleting the returned transaction when no
+  // longer needed.
+  //
+  // If old_txn is not null, BeginTransaction will reuse this Transaction
+  // handle instead of allocating a new one. This is an optimization to avoid
+  // extra allocations when repeatedly creating transactions.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const OptimisticTransactionOptions& txn_options =
+          OptimisticTransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  OptimisticTransactionDB(const OptimisticTransactionDB&) = delete;
+  void operator=(const OptimisticTransactionDB&) = delete;
+
+ protected:
+  // To create an OptimisticTransactionDB, call Open()
+  explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
new file mode 100644
index 000000000..a73324a9e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Try to migrate a DB created with old_opts to use new_opts.
+// Multiple column families are not supported.
+// It is best-effort. No guarantee to succeed.
+// A full compaction may be executed.
+// If the target options use FIFO compaction, the FIFO condition might be
+// sacrificed: for data migrated, data inserted later might be dropped
+// earlier. This is to guarantee FIFO compaction won't drop all the
+// migrated data to fit max_table_files_size.
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+                             const Options& new_opts);
+}  // namespace ROCKSDB_NAMESPACE
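A hedged end-to-end sketch of the optimistic-transaction API declared above. It assumes the Transaction interface from rocksdb/utilities/transaction.h; on a write conflict, Commit() is expected to return a non-OK status (e.g. Busy) rather than block.

#include <memory>
#include <string>

#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/transaction.h"

rocksdb::Status WriteOnce(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::OptimisticTransactionDB* txn_db = nullptr;
  rocksdb::Status s =
      rocksdb::OptimisticTransactionDB::Open(options, path, &txn_db);
  if (!s.ok()) return s;

  rocksdb::OptimisticTransactionOptions txn_options;
  txn_options.set_snapshot = true;  // read from a consistent snapshot
  std::unique_ptr<rocksdb::Transaction> txn(
      txn_db->BeginTransaction(rocksdb::WriteOptions(), txn_options));
  txn->Put("key", "value");
  s = txn->Commit();  // validation against concurrent writers happens here
  delete txn_db;
  return s;
}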
diff --git a/src/rocksdb/include/rocksdb/utilities/options_type.h b/src/rocksdb/include/rocksdb/utilities/options_type.h
new file mode 100644
index 000000000..cd340ed59
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_type.h
@@ -0,0 +1,1221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// The OptionTypeInfo and related classes provide a framework for
+// configuring and validating RocksDB classes via the Options framework.
+// This file is part of the public API to allow developers who wish to
+// write their own extensions and plugins to make use of the Options
+// framework in their custom implementations.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use custom extensions
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct DBOptions;
+
+// The underlying "class/type" of the option.
+// This enum is used to determine how the option should
+// be converted to/from strings and compared.
+enum class OptionType {
+  kBoolean,
+  kInt,
+  kInt32T,
+  kInt64T,
+  kUInt,
+  kUInt8T,
+  kUInt32T,
+  kUInt64T,
+  kSizeT,
+  kString,
+  kDouble,
+  kCompactionStyle,
+  kCompactionPri,
+  kCompressionType,
+  kCompactionStopStyle,
+  kChecksumType,
+  kEncodingType,
+  kEnv,
+  kEnum,
+  kStruct,
+  kVector,
+  kConfigurable,
+  kCustomizable,
+  kEncodedString,
+  kTemperature,
+  kArray,
+  kUnknown,
+};
+
+enum class OptionVerificationType {
+  kNormal,
+  kByName,               // The option is pointer typed so we can only verify
+                         // based on its name.
+  kByNameAllowNull,      // Same as kByName, but it also allows the case
+                         // where one of them is a nullptr.
+  kByNameAllowFromNull,  // Same as kByName, but it also allows the case
+                         // where the old option is nullptr.
+  kDeprecated,           // The option is no longer used in rocksdb. The RocksDB
+                         // OptionsParser will still accept this option if it
+                         // happens to exist in some Options file. However,
+                         // the parser will not include it in serialization
+                         // and verification processes.
+  kAlias,                // This option is a name/shortcut for
+                         // another option and should not be written or verified
+                         // independently
+};
+
+// A set of modifier flags used to alter how an option is evaluated or
+// processed. These flags can be combined together (e.g. kMutable | kShared).
+// The kCompare flags can be used to control if/when options are compared.
+// If kCompareNever is set, two related options would never be compared (always
+// equal). If kCompareExact is set, the options will only be compared if the
+// sanity mode is exact.
+// kMutable means the option can be changed after it is prepared
+// kShared means the option is contained in a std::shared_ptr
+// kUnique means the option is contained in a std::unique_ptr
+// kRawPointer means the option is a raw pointer value.
+// kAllowNull means that an option is allowed to be null for verification
+// purposes.
+// kDontSerialize means this option should not be serialized and included in
+// the string representation.
+// kDontPrepare means do not call PrepareOptions for this pointer value.
+enum class OptionTypeFlags : uint32_t {
+  kNone = 0x00,  // No flags
+  kCompareDefault = 0x0,
+  kCompareNever = ConfigOptions::kSanityLevelNone,
+  kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible,
+  kCompareExact = ConfigOptions::kSanityLevelExactMatch,
+
+  kMutable = 0x0100,         // Option is mutable
+  kRawPointer = 0x0200,      // The option is stored as a raw pointer
+  kShared = 0x0400,          // The option is stored as a shared_ptr
+  kUnique = 0x0800,          // The option is stored as a unique_ptr
+  kAllowNull = 0x1000,       // The option can be null
+  kDontSerialize = 0x2000,   // Don't serialize the option
+  kDontPrepare = 0x4000,     // Don't prepare or sanitize this option
+  kStringNameOnly = 0x8000,  // The option serializes to a name only
+};
+
+inline OptionTypeFlags operator|(const OptionTypeFlags& a,
+                                 const OptionTypeFlags& b) {
+  return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) |
+                                      static_cast<uint32_t>(b));
+}
+
+inline OptionTypeFlags operator&(const OptionTypeFlags& a,
+                                 const OptionTypeFlags& b) {
+  return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) &
+                                      static_cast<uint32_t>(b));
+}
+
+// Converts a string into its enumerated value.
+// @param type_map Mapping between strings and enum values
+// @param type The string representation of the enum
+// @param value Returns the enum value represented by the string
+// @return true if the string was found in the enum map, false otherwise.
+template <typename T>
+bool ParseEnum(const std::unordered_map<std::string, T>& type_map,
+               const std::string& type, T* value) {
+  auto iter = type_map.find(type);
+  if (iter != type_map.end()) {
+    *value = iter->second;
+    return true;
+  }
+  return false;
+}
+
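For example, with a small invented enum and map, ParseEnum above and SerializeEnum (declared just below) round-trip between the string and enum forms:

enum class Fruit { kApple, kPear };
static const std::unordered_map<std::string, Fruit> fruit_map = {
    {"apple", Fruit::kApple}, {"pear", Fruit::kPear}};

void EnumRoundTrip() {
  Fruit f;
  bool ok = rocksdb::ParseEnum<Fruit>(fruit_map, "pear", &f);  // f == kPear
  std::string name;
  ok = ok && rocksdb::SerializeEnum<Fruit>(fruit_map, f, &name);  // "pear"
  (void)ok;  // false would mean the value was missing from the map
}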
+// Converts an enum into its string representation.
+// @param type_map Mapping between strings and enum values
+// @param type The enum
+// @param value Returned as the string representation of the enum
+// @return true if the enum was found in the enum map, false otherwise.
+template <typename T>
+bool SerializeEnum(const std::unordered_map<std::string, T>& type_map,
+                   const T& type, std::string* value) {
+  for (const auto& pair : type_map) {
+    if (pair.second == type) {
+      *value = pair.first;
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T, size_t kSize>
+Status ParseArray(const ConfigOptions& config_options,
+                  const OptionTypeInfo& elem_info, char separator,
+                  const std::string& name, const std::string& value,
+                  std::array<T, kSize>* result);
+
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+                      const OptionTypeInfo& elem_info, char separator,
+                      const std::string& name, const std::array<T, kSize>& vec,
+                      std::string* value);
+
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+                    const OptionTypeInfo& elem_info, const std::string& name,
+                    const std::array<T, kSize>& array1,
+                    const std::array<T, kSize>& array2, std::string* mismatch);
+
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+                   const OptionTypeInfo& elem_info, char separator,
+                   const std::string& name, const std::string& value,
+                   std::vector<T>* result);
+
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+                       const OptionTypeInfo& elem_info, char separator,
+                       const std::string& name, const std::vector<T>& vec,
+                       std::string* value);
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+                     const OptionTypeInfo& elem_info, const std::string& name,
+                     const std::vector<T>& vec1, const std::vector<T>& vec2,
+                     std::string* mismatch);
+
+// Function for converting an option string value into its underlying
+// representation in "addr"
+// On success, Status::OK is returned and addr is set to the parsed form
+// On failure, a non-OK status is returned
+// @param opts The ConfigOptions controlling how the value is parsed
+// @param name The name of the options being parsed
+// @param value The string representation of the option
+// @param addr Pointer to the object
+using ParseFunc = std::function<Status(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const std::string& /*value*/, void* /*addr*/)>;
+
+// Function for converting an option "addr" into its string representation.
+// On success, Status::OK is returned and value is the serialized form.
+// On failure, a non-OK status is returned
+// @param opts The ConfigOptions controlling how the values are serialized
+// @param name The name of the options being serialized
+// @param addr Pointer to the value being serialized
+// @param value The result of the serialization.
+using SerializeFunc = std::function<Status(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const void* /*addr*/, std::string* /*value*/)>;
+
+// Function for comparing two option values
+// If they are not equal, updates "mismatch" with the name of the bad option
+// @param opts The ConfigOptions controlling how the values are compared
+// @param name The name of the options being compared
+// @param addr1 The first address to compare
+// @param addr2 The address to compare to
+// @param mismatch If the values are not equal, the name of the option that
+//                 first differs
+using EqualsFunc = std::function<bool(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const void* /*addr1*/, const void* /*addr2*/, std::string* mismatch)>;
+
+// Function for preparing/initializing an option.
+using PrepareFunc = + std::function<Status(const ConfigOptions& /*opts*/, + const std::string& /*name*/, void* /*addr*/)>; + +// Function for validating an option. +using ValidateFunc = std::function<Status( + const DBOptions& /*db_opts*/, const ColumnFamilyOptions& /*cf_opts*/, + const std::string& /*name*/, const void* /*addr*/)>; + +// A struct for storing constant option information such as option name, +// option type, and offset. +class OptionTypeInfo { + public: + // A simple "normal", non-mutable Type "type" at offset + OptionTypeInfo(int offset, OptionType type) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(OptionVerificationType::kNormal), + flags_(OptionTypeFlags::kNone) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(serialize_func), + equals_func_(equals_func), + type_(type), + verification_(verification), + flags_(flags) {} + + // Creates an OptionTypeInfo for an enum type. Enums use an additional + // map to convert the enums to/from their string representation. + // To create an OptionTypeInfo that is an Enum, one should: + // - Create a static map of string values to the corresponding enum value + // - Call this method passing the static map in as a parameter. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. + // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + template <typename T> + static OptionTypeInfo Enum( + int offset, const std::unordered_map<std::string, T>* const map, + OptionTypeFlags flags = OptionTypeFlags::kNone) { + OptionTypeInfo info(offset, OptionType::kEnum, + OptionVerificationType::kNormal, flags); + info.SetParseFunc( + // Uses the map argument to convert the input string into + // its corresponding enum value. If value is found in the map, + // addr is updated to the corresponding map entry. + // @return OK if the value is found in the map + // @return InvalidArgument if the value is not found in the map + [map](const ConfigOptions&, const std::string& name, + const std::string& value, void* addr) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (ParseEnum<T>(*map, value, static_cast<T*>(addr))) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }); + info.SetSerializeFunc( + // Uses the map argument to convert the input enum into + // its corresponding string value. If enum value is found in the map, + // value is updated to the corresponding string value in the map. 
+        // @return OK if the enum is found in the map
+        // @return InvalidArgument if the enum is not found in the map
+        [map](const ConfigOptions&, const std::string& name, const void* addr,
+              std::string* value) {
+          if (map == nullptr) {
+            return Status::NotSupported("No enum mapping ", name);
+          } else if (SerializeEnum<T>(*map, (*static_cast<const T*>(addr)),
+                                      value)) {
+            return Status::OK();
+          } else {
+            return Status::InvalidArgument("No mapping for enum ", name);
+          }
+        });
+    info.SetEqualsFunc(
+        // Casts addr1 and addr2 to the enum type and returns true if
+        // they are equal, false otherwise.
+        [](const ConfigOptions&, const std::string&, const void* addr1,
+           const void* addr2, std::string*) {
+          return (*static_cast<const T*>(addr1) ==
+                  *static_cast<const T*>(addr2));
+        });
+    return info;
+  }  // End OptionTypeInfo::Enum
+
+  // Creates an OptionTypeInfo for a Struct type. Structs have a
+  // map of string-OptionTypeInfo associated with them that describes how
+  // to process the object for parsing, serializing, and matching.
+  // Structs also have a struct_name, which is the name of the object
+  // as registered in the parent map.
+  // When processing a struct, the option name can be specified as:
+  //   - <struct_name>       Meaning to process the entire struct.
+  //   - <struct_name.field> Meaning to process the single field
+  //   - <field>             Meaning to process the single field
+  // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions
+  // are all examples of Struct options.
+  //
+  // To create an OptionTypeInfo that is a Struct, one should:
+  // - Create a static map of string-OptionTypeInfo corresponding to the
+  //   properties of the object that can be set via the options.
+  // - Call this method passing the name and map in as parameters.
+  // Note that it is not necessary to add a new OptionType or make any
+  // other changes -- the returned object handles parsing, serialization, and
+  // comparisons.
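+  //
+  // For example (an illustrative sketch; MyStruct / MyOptions and their
+  // fields are hypothetical):
+  //
+  //   struct MyStruct { int a; bool b; };
+  //   struct MyOptions { MyStruct sub; };
+  //   static std::unordered_map<std::string, OptionTypeInfo> my_struct_map = {
+  //       {"a", {offsetof(struct MyStruct, a), OptionType::kInt}},
+  //       {"b", {offsetof(struct MyStruct, b), OptionType::kBoolean}}};
+  //   auto info = OptionTypeInfo::Struct(
+  //       "sub", &my_struct_map, offsetof(struct MyOptions, sub),
+  //       OptionVerificationType::kNormal, OptionTypeFlags::kNone);
+  //   // "sub" can now be parsed as a whole ("a=1;b=true") or per field
+  //   // ("sub.a").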
+  //
+  // @param struct_name The name of the struct option as registered
+  // @param struct_map The map of field names to OptionTypeInfo for this struct
+  // @param offset The offset in the option object for this struct
+  static OptionTypeInfo Struct(
+      const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+      int offset, OptionVerificationType verification, OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kStruct, verification, flags);
+    info.SetParseFunc(
+        // Parses the struct and updates the fields at addr
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name,
+                                  const std::string& value, void* addr) {
+          return ParseStruct(opts, struct_name, struct_map, name, value, addr);
+        });
+    info.SetSerializeFunc(
+        // Serializes the struct options into value
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name, const void* addr,
+                                  std::string* value) {
+          return SerializeStruct(opts, struct_name, struct_map, name, addr,
+                                 value);
+        });
+    info.SetEqualsFunc(
+        // Compares the struct fields of addr1 and addr2 for equality
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name, const void* addr1,
+                                  const void* addr2, std::string* mismatch) {
+          return StructsAreEqual(opts, struct_name, struct_map, name, addr1,
+                                 addr2, mismatch);
+        });
+    return info;
+  }
+  static OptionTypeInfo Struct(
+      const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+      int offset, OptionVerificationType verification, OptionTypeFlags flags,
+      const ParseFunc& parse_func) {
+    OptionTypeInfo info(
+        Struct(struct_name, struct_map, offset, verification, flags));
+    return info.SetParseFunc(parse_func);
+  }
+
+  template <typename T, size_t kSize>
+  static OptionTypeInfo Array(int _offset, OptionVerificationType _verification,
+                              OptionTypeFlags _flags,
+                              const OptionTypeInfo& elem_info,
+                              char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::array<T, kSize>*>(addr);
+      return ParseArray<T, kSize>(opts, elem_info, separator, name, value,
+                                  result);
+    });
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& array = *(static_cast<const std::array<T, kSize>*>(addr));
+      return SerializeArray<T, kSize>(opts, elem_info, separator, name, array,
+                                      value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& array1 = *(static_cast<const std::array<T, kSize>*>(addr1));
+      const auto& array2 = *(static_cast<const std::array<T, kSize>*>(addr2));
+      return ArraysAreEqual<T, kSize>(opts, elem_info, name, array1, array2,
+                                      mismatch);
+    });
+    return info;
+  }
+
+  template <typename T>
+  static OptionTypeInfo Vector(int _offset,
+                               OptionVerificationType _verification,
+                               OptionTypeFlags _flags,
+                               const OptionTypeInfo& elem_info,
+                               char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kVector, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::vector<T>*>(addr);
+      return ParseVector<T>(opts, elem_info, separator, name, value, result);
+    });
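+    // As with Array() above, the serialize and equals callbacks below cast
+    // addr back to the std::vector<T> and delegate to the element-wise
+    // SerializeVector / VectorsAreEqual helpers.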
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& vec = *(static_cast<const std::vector<T>*>(addr));
+      return SerializeVector<T>(opts, elem_info, separator, name, vec, value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& vec1 = *(static_cast<const std::vector<T>*>(addr1));
+      const auto& vec2 = *(static_cast<const std::vector<T>*>(addr2));
+      return VectorsAreEqual<T>(opts, elem_info, name, vec1, vec2, mismatch);
+    });
+    return info;
+  }
+
+  // Create a new std::shared_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::shared_ptr<T> object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kShared);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* shared = static_cast<std::shared_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        shared->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, shared);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomSharedPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  // Create a new std::unique_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::unique_ptr<T> object.
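+  //
+  // For example (an illustrative sketch; `MyPlugin` is a hypothetical
+  // Customizable subclass):
+  //
+  //   struct MyOptions { std::unique_ptr<MyPlugin> plugin; };
+  //   static std::unordered_map<std::string, OptionTypeInfo> my_type_map = {
+  //       {"plugin", OptionTypeInfo::AsCustomUniquePtr<MyPlugin>(
+  //                      offsetof(struct MyOptions, plugin),
+  //                      OptionVerificationType::kNormal,
+  //                      OptionTypeFlags::kNone)}};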
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kUnique);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* unique = static_cast<std::unique_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        unique->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, unique);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomUniquePtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  // Create a new Customizable* OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // T object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags Extra flags specifying the behavior of this option
+  // @param serialize_func Optional function for serializing this option
+  // @param equals_func Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kRawPointer);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto** pointer = static_cast<T**>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        *pointer = nullptr;
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, pointer);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags,
+                                       const SerializeFunc& serialize_func,
+                                       const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomRawPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  OptionTypeInfo& SetParseFunc(const ParseFunc& f) {
+    parse_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetSerializeFunc(const SerializeFunc& f) {
+    serialize_func_ = f;
+    return *this;
+  }
+  OptionTypeInfo& SetEqualsFunc(const EqualsFunc& f) {
+    equals_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetPrepareFunc(const PrepareFunc& f) {
+    prepare_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetValidateFunc(const ValidateFunc& f) {
+    validate_func_ = f;
+    return *this;
+  }
+
+  bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; }
+
+  bool IsEditable(const ConfigOptions& opts) const {
+    if (opts.mutable_options_only) {
+      return IsMutable();
+    } else {
+      return true;
+    }
+  }
+  bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); }
+
+  bool IsDeprecated() const {
+    return IsEnabled(OptionVerificationType::kDeprecated);
+  }
+
+  // Returns true if the option is marked as an Alias.
+  // Aliases are valid options that are parsed but are not converted to strings
+  // or compared.
+  bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); }
+
+  bool IsEnabled(OptionVerificationType ovf) const {
+    return verification_ == ovf;
+  }
+
+  // Returns the sanity level for comparing the option.
+  // If the options should not be compared, returns None
+  // If the option has a compare flag, returns it.
+  // Otherwise, returns "exact"
+  ConfigOptions::SanityLevel GetSanityLevel() const {
+    if (IsDeprecated() || IsAlias()) {
+      return ConfigOptions::SanityLevel::kSanityLevelNone;
+    } else {
+      auto match = (flags_ & OptionTypeFlags::kCompareExact);
+      if (match == OptionTypeFlags::kCompareDefault) {
+        return ConfigOptions::SanityLevel::kSanityLevelExactMatch;
+      } else {
+        return (ConfigOptions::SanityLevel)match;
+      }
+    }
+  }
+
+  // Returns true if the option should be serialized.
+  // Options should be serialized if they are not deprecated, aliases,
+  // or marked as "Don't Serialize".
+  bool ShouldSerialize() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  bool ShouldPrepare() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontPrepare)) {
+      return false;
+    } else {
+      return (prepare_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  bool ShouldValidate() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else {
+      return (validate_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  // Returns true if the option is allowed to be null.
+  // Options can be null if the verification type allows null values
+  // or if the flags specify allow null.
+  bool CanBeNull() const {
+    return (IsEnabled(OptionTypeFlags::kAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowFromNull));
+  }
+
+  bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); }
+
+  bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); }
+
+  bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); }
+
+  bool IsByName() const {
+    return (verification_ == OptionVerificationType::kByName ||
+            verification_ == OptionVerificationType::kByNameAllowNull ||
+            verification_ == OptionVerificationType::kByNameAllowFromNull);
+  }
+
+  bool IsStruct() const { return (type_ == OptionType::kStruct); }
+
+  bool IsConfigurable() const {
+    return (type_ == OptionType::kConfigurable ||
+            type_ == OptionType::kCustomizable);
+  }
+
+  bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); }
+
+  inline const void* GetOffset(const void* base) const {
+    return static_cast<const char*>(base) + offset_;
+  }
+
+  inline void* GetOffset(void* base) const {
+    return static_cast<char*>(base) + offset_;
+  }
+
+  template <typename T>
+  const T* GetOffsetAs(const void* base) const {
+    const void* addr = GetOffset(base);
+    return static_cast<const T*>(addr);
+  }
+
+  template <typename T>
+  T* GetOffsetAs(void* base) const {
+    void* addr = GetOffset(base);
+    return static_cast<T*>(addr);
+  }
+
+  // Returns the underlying pointer for the type at base_addr
+  // The value returned is the underlying "raw" pointer, offset from base.
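+  // For example (an illustrative sketch; `MyOptions` and its `cache` member
+  // are hypothetical): for an OptionTypeInfo declared with
+  // OptionTypeFlags::kShared over a std::shared_ptr<Cache> member, this
+  // unwraps the smart pointer:
+  //
+  //   MyOptions opts;
+  //   const Cache* cache = info.AsRawPointer<Cache>(&opts);
+  //   // equivalent to opts.cache.get()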
+ template <typename T> + const T* AsRawPointer(const void* const base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + if (IsUniquePtr()) { + const auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + const auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr); + return ptr->get(); + } else if (IsRawPtr()) { + const T* const* ptr = GetOffsetAs<T* const>(base_addr); + return *ptr; + } else { + return GetOffsetAs<T>(base_addr); + } + } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. + template <typename T> + T* AsRawPointer(void* base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + if (IsUniquePtr()) { + auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr); + return ptr->get(); + } else if (IsRawPtr()) { + auto ptr = GetOffsetAs<T*>(base_addr); + return *ptr; + } else { + return GetOffsetAs<T>(base_addr); + } + } + + // Parses the option in "opt_value" according to the rules of this class + // and updates the value at "opt_ptr". + // On success, Status::OK() is returned. On failure: + // NotFound means the opt_name is not valid for this option + // NotSupported means we do not know how to parse the value for this option + // InvalidArgument means the opt_value is not valid for this option. + Status Parse(const ConfigOptions& config_options, const std::string& opt_name, + const std::string& opt_value, void* const opt_ptr) const; + + // Serializes the option in "opt_addr" according to the rules of this class + // into the value at "opt_value". + Status Serialize(const ConfigOptions& config_options, + const std::string& opt_name, const void* const opt_ptr, + std::string* opt_value) const; + + // Compares the "addr1" and "addr2" values according to the rules of this + // class and returns true if they match. On a failed match, mismatch is the + // name of the option that failed to match. + bool AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, const void* const addr1, + const void* const addr2, std::string* mismatch) const; + + // Used to override the match rules for "ByName" options. 
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const void* const that_ptr) const;
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const std::string& that_value) const;
+
+  Status Prepare(const ConfigOptions& config_options, const std::string& name,
+                 void* opt_ptr) const;
+  Status Validate(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts,
+                  const std::string& name, const void* opt_ptr) const;
+
+  // Parses the input opts_map according to the type_map for the opt_addr
+  // For each name-value pair in opts_map, find the corresponding name in
+  // type_map. If the name is found:
+  //    - set the corresponding value in opt_addr, returning the status on
+  //      failure;
+  // If the name is not found:
+  //    - If unused is specified, add the name-value to unused and continue
+  //    - If ignore_unknown_options is false, return NotFound
+  // Returns OK if all options were either:
+  //    - Successfully set
+  //    - options were not found and ignore_unknown_options=true
+  //    - options were not found and unused was specified
+  // Note that this method is much less sophisticated than the comparable
+  // Configurable::Configure methods. For example, on error, there is no
+  // attempt to return opt_addr to the initial state. Additionally, there
+  // is no effort to initialize (Configurable::PrepareOptions) the object
+  // on success. This method should typically only be used for simpler,
+  // standalone structures and not those that contain shared and embedded
+  // objects.
+  static Status ParseType(
+      const ConfigOptions& config_options, const std::string& opts_str,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
+  static Status ParseType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
+
+  // Parses the input value according to the map for the struct at opt_addr
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated. This may
+  // be the whole struct or a sub-element of it, based on struct_name and
+  // opt_name.
+  static Status ParseStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const std::string& value, void* opt_addr);
+
+  // Serializes the values from opt_addr using the rules in type_map.
+  // Returns the serialized form in result.
+  // Returns OK on success or non-OK if some option could not be serialized.
+  static Status SerializeType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      const void* opt_addr, std::string* value);
+
+  // Serializes the input addr according to the map for the struct to value.
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated.
+  // This may be the whole struct or a sub-element of it
+  static Status SerializeStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* opt_addr, std::string* value);
+
+  // Compares the values in this_addr and that_addr using the rules in type_map.
+  // If the values are equal, returns true
+  // If the values are not equal, returns false and sets mismatch to the name
+  // of the first value that did not match.
+  static bool TypesAreEqual(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& map,
+      const void* this_addr, const void* that_addr, std::string* mismatch);
+
+  // Compares the input offsets according to the map for the struct and returns
+  // true if they are equivalent, false otherwise.
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated. This may
+  // be the whole struct or a sub-element of it
+  static bool StructsAreEqual(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* this_offset,
+      const void* that_offset, std::string* mismatch);
+
+  // Finds the entry for the opt_name in the opt_map, returning
+  // nullptr if not found.
+  // If found, elem_name will be the name of the option to find.
+  // This may be opt_name, or a substring of opt_name.
+  // For "simple" options, opt_name will be equal to elem_name. Given the
+  // opt_name "opt", elem_name will equal "opt".
+  // For "embedded" options (like structs), elem_name may be opt_name
+  // or a field within the opt_name. For example, given the struct "struct",
+  // and opt_name of "struct.field", elem_name will be "field"
+  static const OptionTypeInfo* Find(
+      const std::string& opt_name,
+      const std::unordered_map<std::string, OptionTypeInfo>& opt_map,
+      std::string* elem_name);
+
+  // Returns the next token marked by the delimiter from "opts" after start in
+  // token and updates end to point to where that token stops. Delimiters inside
+  // of braces are ignored. Returns OK if a token is found and an error if the
+  // input opts string is mis-formatted.
+  // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points
+  // to "b"
+  // Given "{a=A;b=B}", the token would be "a=A;b=B"
+  //
+  // @param opts The string in which to find the next token
+  // @param delimiter The delimiter between tokens
+  // @param start The position in opts to start looking for the token
+  // @param end Returns the end position in opts of the token
+  // @param token Returns the token
+  // @return OK if a token was found
+  // @return InvalidArgument if the braces mismatch
+  //          (e.g. "{a={b=c;}" ) -- missing closing brace
+  // @return InvalidArgument if an expected delimiter is not found
+  //        e.g.
"{a=b}c=d;" -- missing delimiter before "c" + static Status NextToken(const std::string& opts, char delimiter, size_t start, + size_t* end, std::string* token); + + constexpr static const char* kIdPropName() { return "id"; } + constexpr static const char* kIdPropSuffix() { return ".id"; } + + private: + int offset_; + + // The optional function to convert a string to its representation + ParseFunc parse_func_; + + // The optional function to convert a value to its string representation + SerializeFunc serialize_func_; + + // The optional function to match two option values + EqualsFunc equals_func_; + + PrepareFunc prepare_func_; + ValidateFunc validate_func_; + OptionType type_; + OptionVerificationType verification_; + OptionTypeFlags flags_; +}; + +// Parses the input value into elements of the result array, which has fixed +// array size. For example, if the value=1:2:3 and elem_info parses integers, +// the result array will be {1,2,3}. Array size is defined in the OptionTypeInfo +// the input value has to match with that. +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this array option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or element number +// doesn't match array size defined in OptionTypeInfo +// or if the token could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template <typename T, size_t kSize> +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array<T, kSize>* result) { + Status status; + + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + size_t i = 0, start = 0, end = 0; + for (; status.ok() && i < kSize && start < value.size() && + end != std::string::npos; + i++, start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + status = elem_info.Parse(copy, name, token, &((*result)[i])); + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + if (!status.ok()) { + return status; + } + // make sure the element number matches the array size + if (i < kSize) { + return Status::InvalidArgument( + "Serialized value has less elements than array size", name); + } + if (start < value.size() && end != std::string::npos) { + return Status::InvalidArgument( + "Serialized value has more elements than array size", name); + } + return status; +} + +// Serializes the fixed size input array into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in array into their serialized form, using elem_info to perform the +// serialization. +// For example, if the array contains the integers 1,2,3 and elem_info +// serializes the output would be 1:2:3 for separator ":". +// @param config_options Controls how the option value is serialized. 
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name The name associated with this array option
+// @param array The input array to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if an element could not be serialized
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+                      const OptionTypeInfo& elem_info, char separator,
+                      const std::string& name,
+                      const std::array<T, kSize>& array, std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : array) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input arrays array1 and array2 for equality
+// Elements of the array are compared one by one using elem_info to perform the
+// comparison.
+//
+// @param config_options Controls how the arrays are compared.
+// @param elem_info Controls how individual elements in the arrays are compared
+// @param name The name associated with this array option
+// @param array1,array2 The arrays to compare.
+// @param mismatch If the arrays are not equivalent, mismatch will point to
+// the first element of the comparison that did not match.
+// @return true If array1 and array2 are "equal", false otherwise
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+                    const OptionTypeInfo& elem_info, const std::string& name,
+                    const std::array<T, kSize>& array1,
+                    const std::array<T, kSize>& array2, std::string* mismatch) {
+  assert(array1.size() == kSize);
+  assert(array2.size() == kSize);
+  for (size_t i = 0; i < kSize; ++i) {
+    if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i],
+                            mismatch)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parses the input value into elements of the result vector. This method
+// will break the input value into the individual tokens (based on the
+// separator), where each of those tokens will be parsed based on the rules of
+// elem_info. The result vector will be populated with elements based on the
+// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses
+// integers, the result vector will contain the integers 1,2,3,4,5
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+// example)
+// @param name The name associated with this vector option
+// @param value The input string to parse into tokens
+// @param result Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parsed
+// @return InvalidArgument if the value is improperly formed or if the token
+//                          could not be parsed
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+                   const OptionTypeInfo& elem_info, char separator,
+                   const std::string& name, const std::string& value,
+                   std::vector<T>* result) {
+  result->clear();
+  Status status;
+
+  // Turn off ignore_unsupported_options so we can tell if the returned
+  // object is valid or not.
+  ConfigOptions copy = config_options;
+  copy.ignore_unsupported_options = false;
+  for (size_t start = 0, end = 0;
+       status.ok() && start < value.size() && end != std::string::npos;
+       start = end + 1) {
+    std::string token;
+    status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+    if (status.ok()) {
+      T elem;
+      status = elem_info.Parse(copy, name, token, &elem);
+      if (status.ok()) {
+        result->emplace_back(elem);
+      } else if (config_options.ignore_unsupported_options &&
+                 status.IsNotSupported()) {
+        // If we were ignoring unsupported options and this one should be
+        // ignored, ignore it by setting the status to OK
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+// Serializes the input vector into its output value. Elements are
+// separated by the separator character. This function will convert all of the
+// elements in vec into their serialized form, using elem_info to perform the
+// serialization.
+// For example, if the vec contains the integers 1,2,3,4,5 and elem_info
+// serializes integers, the output would be 1:2:3:4:5 for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name The name associated with this vector option
+// @param vec The input vector to serialize
+// @param value The output string of serialized options
+// @return OK if the value was successfully serialized
+// @return InvalidArgument if an element could not be serialized
+// @return NotFound If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+                       const OptionTypeInfo& elem_info, char separator,
+                       const std::string& name, const std::vector<T>& vec,
+                       std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : vec) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input vectors vec1 and vec2 for equality
+// If the vectors are the same size, elements of the vectors are compared one by
+// one using elem_info to perform the comparison.
+//
+// @param config_options Controls how the vectors are compared.
+// @param elem_info Controls how individual elements in the vectors are compared
+// @param name The name associated with this vector option
+// @param vec1,vec2 The vectors to compare.
+// @param mismatch If the vectors are not equivalent, mismatch will point to
+// the first element of the comparison that did not match.
+// @return true If vec1 and vec2 are "equal", false otherwise
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+                     const OptionTypeInfo& elem_info, const std::string& name,
+                     const std::vector<T>& vec1, const std::vector<T>& vec2,
+                     std::string* mismatch) {
+  if (vec1.size() != vec2.size()) {
+    *mismatch = name;
+    return false;
+  } else {
+    for (size_t i = 0; i < vec1.size(); ++i) {
+      if (!elem_info.AreEqual(
+              config_options, name, reinterpret_cast<const char*>(&vec1[i]),
+              reinterpret_cast<const char*>(&vec2[i]), mismatch)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h
new file mode 100644
index 000000000..064c087f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_util.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file contains utility functions for RocksDB Options.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+// Constructs the DBOptions and ColumnFamilyDescriptors by loading the
+// latest RocksDB options file stored in the specified rocksdb database.
+//
+// Note that all the pointer options (except table_factory, which will
+// be described in more detail below) will be initialized with the default
+// values. Developers can further initialize them after this function call.
+// Below is an example list of pointer options which will be initialized
+//
+// * env
+// * memtable_factory
+// * compaction_filter_factory
+// * prefix_extractor
+// * comparator
+// * merge_operator
+// * compaction_filter
+//
+// Users can also choose to load customized comparator, env, and/or
+// merge_operator through object registry:
+// * comparator needs to be registered through Registrar<const Comparator>
+// * env needs to be registered through Registrar<Env>
+// * merge operator needs to be registered through
+//   Registrar<std::shared_ptr<MergeOperator>>.
+//
+// For table_factory, this function further supports deserializing
+// BlockBasedTableFactory and its BlockBasedTableOptions except the
+// pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+// block_cache, and block_cache_compressed), which will be initialized with
+// default values. Developers can further specify these three options by
+// casting the return value of TableFactory::GetOptions() to
+// BlockBasedTableOptions and making necessary changes.
+//
+// ignore_unknown_options can be set to true if you want to ignore options
+// that are from a newer version of the db, essentially for forward
+// compatibility.
+//
+// config_options contains a set of options that controls the processing
+// of the options. The LoadLatestOptions(ConfigOptions...) overload should be
+// preferred; the alternative signature may be deprecated in a future release.
+// The equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// examples/options_file_example.cc demonstrates how to use this function
+// to open a RocksDB instance.
+//
+// @return OK on success. If the specified "dbpath" does not contain any
+//     option file, then a Status::NotFound will be returned. A return
+//     value other than Status::OK or Status::NotFound indicates there is
+//     some error related to the options file itself.
+//
+// @see LoadOptionsFromFile
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+                         DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         bool ignore_unknown_options = false,
+                         std::shared_ptr<Cache>* cache = {});
+Status LoadLatestOptions(const ConfigOptions& config_options,
+                         const std::string& dbpath, DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         std::shared_ptr<Cache>* cache = {});
+
+// Similar to LoadLatestOptions, this function constructs the DBOptions
+// and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+//
+// The LoadOptionsFromFile(ConfigOptions...) overload should be preferred;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
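+//
+// For example (an illustrative sketch; the options file path is a
+// placeholder):
+//
+//   ConfigOptions config_options;
+//   DBOptions db_opts;
+//   std::vector<ColumnFamilyDescriptor> cf_descs;
+//   Status s = LoadOptionsFromFile(config_options, "/tmp/db/OPTIONS-000005",
+//                                  &db_opts, &cf_descs);
+//   // On success, db_opts and cf_descs can be adjusted and passed to
+//   // DB::Open.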
+//
+// @see LoadLatestOptions
+Status LoadOptionsFromFile(const std::string& options_file_name, Env* env,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           bool ignore_unknown_options = false,
+                           std::shared_ptr<Cache>* cache = {});
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+                           const std::string& options_file_name,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           std::shared_ptr<Cache>* cache = {});
+
+// Returns the latest options file name under the specified db path.
Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+                                std::string* options_file_name);
+
+// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors
+// are compatible with the latest options stored in the specified DB path.
+//
+// If the return status is non-ok, it means the specified RocksDB instance
+// might not be correctly opened with the input set of options. Currently,
+// changing one of the following options will fail the compatibility check:
+//
+// * comparator
+// * prefix_extractor
+// * table_factory
+// * merge_operator
+Status CheckOptionsCompatibility(
+    const std::string& dbpath, Env* env, const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs,
+    bool ignore_unknown_options = false);
+Status CheckOptionsCompatibility(
+    const ConfigOptions& config_options, const std::string& dbpath,
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/replayer.h b/src/rocksdb/include/rocksdb/utilities/replayer.h
new file mode 100644
index 000000000..4fdd8d73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/replayer.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceRecord;
+class TraceRecordResult;
+
+struct ReplayOptions {
+  // Number of threads used for replaying. If 0 or 1, replay using a
+  // single thread.
+  uint32_t num_threads;
+
+  // Enables fast forwarding a replay by increasing/reducing the delay between
+  // the ingested traces.
+  // If > 0.0 and < 1.0, slow down the replay by this amount.
+  // If 1.0, replay the operations at the same rate as in the trace stream.
+  // If > 1, speed up the replay by this amount.
+  double fast_forward;
+
+  ReplayOptions() : num_threads(1), fast_forward(1.0) {}
+
+  ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio)
+      : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {}
+};
+
+// Replayer helps to replay the captured RocksDB query-level operations.
+// The Replayer can either be created from the DB::NewReplayer method, or be
+// instantiated via db_bench when using the "replay" benchmark.
+class Replayer {
+ public:
+  virtual ~Replayer() = default;
+
+  // Make some preparation before replaying the trace. This will also reset the
+  // replayer in order to restart replaying.
+  virtual Status Prepare() = 0;
+
+  // Return the timestamp when the trace recording was started.
+  virtual uint64_t GetHeaderTimestamp() const = 0;
+
+  // Atomically read one trace into a TraceRecord (excluding the header and
+  // footer traces).
+  // Return Status::OK() on success;
+  //        Status::Incomplete() if Prepare() was not called or no more traces
+  //        are available;
+  //        Status::NotSupported() if the read trace type is not supported.
+  virtual Status Next(std::unique_ptr<TraceRecord>* record) = 0;
+
+  // Execute one TraceRecord.
+  // Return Status::OK() if the execution was successful. Get/MultiGet traces
+  //        will still return Status::OK() even if they got Status::NotFound()
+  //        from DB::Get() or DB::MultiGet();
+  //        Status::Incomplete() if Prepare() was not called or no more traces
+  //        are available;
+  //        Status::NotSupported() if the operation is not supported;
+  //        Otherwise, return the corresponding error status.
+  //
+  // The actual operation execution status and result(s) will be saved in
+  // result. For example, a GetQueryTraceRecord will have its DB::Get() status
+  // and the returned value saved in a SingleValueTraceExecutionResult.
+  virtual Status Execute(const std::unique_ptr<TraceRecord>& record,
+                         std::unique_ptr<TraceRecordResult>* result) = 0;
+
+  // Replay all the traces from the provided trace stream, taking the delay
+  // between the traces into consideration.
+  //
+  // result_callback reports the status of executing a trace record, and the
+  // actual operation execution result (See the description for Execute()).
+  virtual Status Replay(
+      const ReplayOptions& options,
+      const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+          result_callback) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
new file mode 100644
index 000000000..a682c7748
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCache;
+
+// For instrumentation purposes, use NewSimCache instead of the NewLRUCache
+// API. NewSimCache is a wrapper function returning a SimCache instance that,
+// besides the Cache interface, provides the additional interface in the
+// SimCache class to predict the block cache hit rate without actually
+// allocating the memory. It can help users tune their current block cache
+// size, and determine how efficiently they are using the memory.
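+//
+// For example (an illustrative sketch; the capacities are placeholders):
+//
+//   std::shared_ptr<Cache> real_cache = NewLRUCache(1 << 30);  // 1GB
+//   // Simulate how a 4GB block cache would behave, while only the real
+//   // 1GB cache holds actual data blocks.
+//   std::shared_ptr<SimCache> sim_cache =
+//       NewSimCache(real_cache, 4ULL << 30, /*num_shard_bits=*/6);
+//   // ... use sim_cache as the block cache, then inspect:
+//   //   sim_cache->get_hit_counter(), sim_cache->get_miss_counter()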
+//
+// Since GetSimCapacity() returns the capacity for simulation, it differs from
+// actual memory usage, which can be estimated as:
+// sim_capacity * entry_size / (entry_size + block_size),
+// where 76 <= entry_size <= 104, and
+// BlockBasedTableOptions.block_size = 4096 by default but is configurable.
+// Therefore, generally the actual memory overhead of SimCache is less than
+// sim_capacity * 2%
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+                                             size_t sim_capacity,
+                                             int num_shard_bits);
+
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+                                             std::shared_ptr<Cache> cache,
+                                             int num_shard_bits);
+
+class SimCache : public Cache {
+ public:
+  SimCache() {}
+
+  ~SimCache() override {}
+
+  const char* Name() const override { return "SimCache"; }
+
+  // returns the maximum configured capacity of the simcache for simulation
+  virtual size_t GetSimCapacity() const = 0;
+
+  // simcache doesn't provide an internal handle reference to the user, so
+  // PinnedUsage is always 0 and the behavior will not be exactly consistent
+  // with the real cache.
+  // returns the memory size for the entries residing in the simcache.
+  virtual size_t GetSimUsage() const = 0;
+
+  // sets the maximum configured capacity of the simcache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than new capacity, the implementation will purge old entries
+  // to fit new capacity.
+  virtual void SetSimCapacity(size_t capacity) = 0;
+
+  // returns the miss count of the simcache
+  virtual uint64_t get_miss_counter() const = 0;
+  // returns the hit count of the simcache
+  virtual uint64_t get_hit_counter() const = 0;
+  // reset the lookup and hit counters
+  virtual void reset_counter() = 0;
+  // String representation of the statistics of the simcache
+  virtual std::string ToString() const = 0;
+
+  // Start storing logs of the cache activity (Add/Lookup) into
+  // a file located at activity_log_file; the max_logging_size option can be
+  // used to stop logging to the file automatically after reaching a specific
+  // size in bytes; a value of 0 disables this feature
+  virtual Status StartActivityLogging(const std::string& activity_log_file,
+                                      Env* env,
+                                      uint64_t max_logging_size = 0) = 0;
+
+  // Stop cache activity logging if any
+  virtual void StopActivityLogging() = 0;
+
+  // Status of cache logging happening in background
+  virtual Status GetActivityLoggingStatus() = 0;
+
+ private:
+  SimCache(const SimCache&);
+  SimCache& operator=(const SimCache&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 000000000..9b13c3bdf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,566 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/db.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class contains APIs to stack rocksdb wrappers, e.g., stack TTL over the
+// base db.
+class StackableDB : public DB {
+ public:
+  // StackableDB takes sole ownership of the underlying db.
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  // StackableDB takes shared ownership of the underlying db.
+  explicit StackableDB(std::shared_ptr<DB> db)
+      : db_(db.get()), shared_db_ptr_(db) {}
+
+  ~StackableDB() {
+    if (shared_db_ptr_ == nullptr) {
+      delete db_;
+    } else {
+      assert(shared_db_ptr_.get() == db_);
+    }
+    db_ = nullptr;
+  }
+
+  virtual Status Close() override { return db_->Close(); }
+
+  virtual DB* GetBaseDB() { return db_; }
+
+  virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(options, column_family_names, handles);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(column_families, handles);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override {
+    return db_->DropColumnFamilies(column_families);
+  }
+
+  virtual Status DestroyColumnFamilyHandle(
+      ColumnFamilyHandle* column_family) override {
+    return db_->DestroyColumnFamilyHandle(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& ts, const Slice& val) override {
+    return db_->Put(options, column_family, key, ts, val);
+  }
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override {
+    return db_->PutEntity(options, column_family, key, columns);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::GetEntity;
+  Status GetEntity(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableWideColumns* columns) override {
+    return db_->GetEntity(options, column_family, key, columns);
+  }
+
+  using DB::GetMergeOperands;
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* slice,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) override {
+    return db_->GetMergeOperands(options, column_family, key, slice,
+                                 get_merge_operands_options,
+                                 number_of_operands);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        
ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override { + return db_->MultiGet(options, column_family, num_keys, keys, values, + statuses, sorted_input); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector<std::string>& external_files, + const IngestExternalFileOptions& options) override { + return db_->IngestExternalFile(column_family, external_files, options); + } + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& args) override { + return db_->IngestExternalFiles(args); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, column_family, key, value, value_found); + } + + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); + } + Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) override { + return db_->Delete(wopts, column_family, key, ts); + } + + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->SingleDelete(wopts, column_family, key); + } + Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override { + return db_->SingleDelete(wopts, column_family, key, ts); + } + + using DB::DeleteRange; + Status DeleteRange(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, const Slice& start_key, + const Slice& end_key) override { + return db_->DeleteRange(wopts, column_family, start_key, end_key); + } + + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return db_->Merge(options, column_family, key, value); + } + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override { + return db_->Merge(options, column_family, key, ts, value); + } + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + return db_->Write(opts, updates); + } + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, 
column_family); + } + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators) override { + return db_->NewIterators(options, column_families, iterators); + } + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + using DB::GetMapProperty; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); + } + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map<std::string, std::string>* value) override { + return db_->GetMapProperty(column_family, property, value); + } + + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return db_->GetIntProperty(column_family, property, value); + } + + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return db_->GetAggregatedIntProperty(property, value); + } + + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); + } + + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override { + return db_->GetApproximateMemTableStats(column_family, range, count, size); + } + + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override { + return db_->CompactRange(options, column_family, begin, end); + } + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + return db_->CompactFiles(compact_options, column_family, input_file_names, + output_level, output_path_id, output_file_names, + compaction_job_info); + } + + virtual Status PauseBackgroundWork() override { + return db_->PauseBackgroundWork(); + } + virtual Status ContinueBackgroundWork() override { + return db_->ContinueBackgroundWork(); + } + + virtual Status EnableAutoCompaction( + const std::vector<ColumnFamilyHandle*>& column_family_handles) override { + return db_->EnableAutoCompaction(column_family_handles); + } + + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return db_->MaxMemCompactionLevel(column_family); + } + + 
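+  // An illustrative sketch (not part of this header) of the intended usage:
+  // subclass StackableDB, intercept the calls you care about, and let the
+  // rest forward to the wrapped DB. `CountingDB` and `put_count_` are
+  // hypothetical names used only for this example:
+  //
+  //   class CountingDB : public StackableDB {
+  //    public:
+  //     explicit CountingDB(DB* db) : StackableDB(db) {}
+  //     using StackableDB::Put;
+  //     Status Put(const WriteOptions& options,
+  //                ColumnFamilyHandle* column_family, const Slice& key,
+  //                const Slice& val) override {
+  //       put_count_.fetch_add(1, std::memory_order_relaxed);
+  //       return StackableDB::Put(options, column_family, key, val);
+  //     }
+  //     uint64_t put_count() const { return put_count_.load(); }
+  //    private:
+  //     std::atomic<uint64_t> put_count_{0};  // requires <atomic>
+  //   };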
using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return db_->Level0StopWriteTrigger(column_family); + } + + virtual const std::string& GetName() const override { return db_->GetName(); } + + virtual Env* GetEnv() const override { return db_->GetEnv(); } + + virtual FileSystem* GetFileSystem() const override { + return db_->GetFileSystem(); + } + + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + return db_->GetOptions(column_family); + } + + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override { + return db_->GetDBOptions(); + } + + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); + } + virtual Status Flush( + const FlushOptions& fopts, + const std::vector<ColumnFamilyHandle*>& column_families) override { + return db_->Flush(fopts, column_families); + } + + virtual Status SyncWAL() override { return db_->SyncWAL(); } + + virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + + virtual Status LockWAL() override { return db_->LockWAL(); } + + virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + +#ifndef ROCKSDB_LITE + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual void GetLiveFilesMetaData( + std::vector<LiveFileMetaData>* metadata) override { + db_->GetLiveFilesMetaData(metadata); + } + + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override { + return db_->GetLiveFilesChecksumInfo(checksum_list); + } + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector<LiveFileStorageInfo>* files) override { + return db_->GetLiveFilesStorageInfo(opts, files); + } + + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr<TraceWriter>&& trace_writer) override { + return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer)); + } + + Status StartBlockCacheTrace( + const BlockCacheTraceOptions& options, + std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr<TraceWriter>&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr<TraceWriter>&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + + using DB::NewDefaultReplayer; + Status NewDefaultReplayer(const std::vector<ColumnFamilyHandle*>& handles, + std::unique_ptr<TraceReader>&& reader, + std::unique_ptr<Replayer>* replayer) override { + return 
db_->NewDefaultReplayer(handles, std::move(reader), replayer); + } + +#endif // ROCKSDB_LITE + + virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override { + return db_->IncreaseFullHistoryTsLow(column_family, ts_low); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override { + return db_->GetFullHistoryTsLow(column_family, ts_low); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status GetCurrentWalFile( + std::unique_ptr<LogFile>* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) const override { + return db_->GetDbIdentity(identity); + } + + virtual Status GetDbSessionId(std::string& session_id) const override { + return db_->GetDbSessionId(session_id); + } + + using DB::SetOptions; + virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, + const std::unordered_map<std::string, std::string>& + new_options) override { + return db_->SetOptions(column_family_handle, new_options); + } + + virtual Status SetDBOptions( + const std::unordered_map<std::string, std::string>& new_options) + override { + return db_->SetDBOptions(new_options); + } + + using DB::ResetStats; + virtual Status ResetStats() override { return db_->ResetStats(); } + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfAllTables(column_family, props); + } + + using DB::GetPropertiesOfTablesInRange; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); + } + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, + const TransactionLogIterator::ReadOptions& read_options) override { + return db_->GetUpdatesSince(seq_number, iter, read_options); + } + + virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, + const Slice* end) override { + return db_->SuggestCompactRange(column_family, begin, end); + } + + virtual Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override { + return 
db_->PromoteL0(column_family, target_level); + } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return db_->DefaultColumnFamily(); + } + +#ifndef ROCKSDB_LITE + Status TryCatchUpWithPrimary() override { + return db_->TryCatchUpWithPrimary(); + } +#endif // ROCKSDB_LITE + + protected: + DB* db_; + std::shared_ptr<DB> shared_db_ptr_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h new file mode 100644 index 000000000..f3a4ba005 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include <atomic> +#include <memory> + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// A factory of a table property collector that marks an SST +// file as need-compaction when it observes at least "D" deletion +// entries in any "N" consecutive entries or the ratio of tombstone +// entries in the whole file >= the specified deletion ratio. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + // A factory of a table property collector that marks an SST + // file as need-compaction when it observes at least "D" deletion + // entries in any "N" consecutive entries, or the ratio of tombstone + // entries >= deletion_ratio. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction + // based on deletion ratio. + CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio); + + ~CompactOnDeletionCollectorFactory() {} + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete-triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + size_t GetWindowSize() const { return sliding_window_size_.load(); } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + size_t GetDeletionTrigger() const { return deletion_trigger_.load(); } + // Change deletion ratio. + // @param deletion_ratio, if <= 0 or > 1, disables triggering compaction + // based on deletion ratio. + void SetDeletionRatio(double deletion_ratio) { + deletion_ratio_.store(deletion_ratio); + } + + double GetDeletionRatio() const { return deletion_ratio_.load(); } + static const char* kClassName() { return "CompactOnDeletionCollector"; } + const char* Name() const override { return kClassName(); } + + std::string ToString() const override; + + private: + std::atomic<size_t> sliding_window_size_; + std::atomic<size_t> deletion_trigger_; + std::atomic<double> deletion_ratio_; +}; + +// Creates a factory of a table property collector that marks an SST +// file as need-compaction when it observes at least "D" deletion +// entries in any "N" consecutive entries, or the ratio of tombstone +// entries >= deletion_ratio.
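+//
+// An illustrative usage sketch (`cf_opts` is just a local variable name;
+// `table_properties_collector_factories` is the standard ColumnFamilyOptions
+// member that collector factories are plugged into):
+//
+//   ColumnFamilyOptions cf_opts;
+//   // Mark a file as need-compaction when 50 of any 128 consecutive entries
+//   // are deletes, or when half of the whole file consists of tombstones.
+//   cf_opts.table_properties_collector_factories.emplace_back(
+//       NewCompactOnDeletionCollectorFactory(128 /* sliding_window_size */,
+//                                            50 /* deletion_trigger */,
+//                                            0.5 /* deletion_ratio */));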
+// +// @param sliding_window_size "N". Note that this number will be +// rounded up to the smallest multiple of 128 that is no less +// than the specified size. +// @param deletion_trigger "D". Note that even when "N" is changed, +// the specified number for "D" will not be changed. +// @param deletion_ratio, if <= 0 or > 1, disables triggering compaction +// based on deletion ratio. Disabled by default. +extern std::shared_ptr<CompactOnDeletionCollectorFactory> +NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio = 0); +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h new file mode 100644 index 000000000..1d2822988 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction.h @@ -0,0 +1,686 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <limits> +#include <string> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator; +class TransactionDB; +class WriteBatchWithIndex; + +using TransactionName = std::string; + +using TransactionID = uint64_t; + +using TxnTimestamp = uint64_t; + +constexpr TxnTimestamp kMaxTxnTimestamp = + std::numeric_limits<TxnTimestamp>::max(); + +/* + class Endpoint makes it possible to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start with basic ranges. The Key Comparator defines the ordering of + rowkeys. Then, one can specify finite closed ranges by just providing rowkeys + of their endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare as less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint makes it possible to define these kinds of ranges. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix.
A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + +// Provides notification to the caller of SetSnapshotOnNextOperation when +// the actual snapshot gets created +class TransactionNotifier { + public: + virtual ~TransactionNotifier() {} + + // Implement this method to receive notification when a snapshot is + // requested via SetSnapshotOnNextOperation. + // Do not take exclusive ownership of `newSnapshot` because it is shared with + // the underlying transaction. + virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0; +}; + +// Provides BEGIN/COMMIT/ROLLBACK transactions. +// +// To use transactions, you must first create either an OptimisticTransactionDB +// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for +// more information. +// +// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction(). +// +// It is up to the caller to synchronize access to this object. +// +// See examples/transaction_example.cc for some simple examples. +// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written (or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. + // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Similar to SetSnapshot(), but will not change the current snapshot + // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called. + // By calling this function, the transaction will essentially call + // SetSnapshot() for you right before performing the next write/GetForUpdate. + // + // Calling SetSnapshotOnNextOperation() will not affect what snapshot is + // returned by GetSnapshot() until the next write/GetForUpdate is executed. + // + // When the snapshot is created, the notifier's SnapshotCreated method will + // be called so that the caller can get access to the snapshot.
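+ //
+ // A hedged illustration of wiring up a notifier (the subclass name is
+ // hypothetical; only the TransactionNotifier interface is from this header):
+ //
+ //   class SnapshotLogger : public TransactionNotifier {
+ //    public:
+ //     void SnapshotCreated(const Snapshot* new_snapshot) override {
+ //       // Observe new_snapshot here; do not take ownership of it.
+ //     }
+ //   };
+ //   txn->SetSnapshotOnNextOperation(std::make_shared<SnapshotLogger>());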
+ // + // This is an optimization to reduce the likelihood of conflicts that + // could occur in between the time SetSnapshot() is called and the first + // write/GetForUpdate operation. E.g., this prevents the following + // race condition: + // + // txn1->SetSnapshot(); + // txn2->Put("A", ...); + // txn2->Commit(); + // txn1->GetForUpdate(opts, "A", ...); // FAIL! + // + // WriteCommittedTxn only: a new snapshot will be taken upon the next + // operation, and the next operation can be a Commit. + // TODO(yanqin) remove the "write-committed only" limitation. + virtual void SetSnapshotOnNextOperation( + std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot() + // is called, or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // The returned snapshot can outlive the transaction. + virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const = 0; + + // Clears the current snapshot (i.e. no snapshot will be 'set') + // + // This removes any snapshot that currently exists or is set to be created + // on the next update operation (SetSnapshotOnNextOperation). + // + // Calling ClearSnapshot() has no effect on keys written before this function + // has been called. + // + // If a reference to a snapshot was retrieved via GetSnapshot(), it will no + // longer be valid and should be discarded after a call to ClearSnapshot(). + virtual void ClearSnapshot() = 0; + + // Prepare the current transaction for 2PC + virtual Status Prepare() = 0; + + // Write all batched keys to the db atomically. + // + // Returns OK on success. + // + // May return any error status that could be returned by DB::Write(). + // + // If this transaction was created by an OptimisticTransactionDB(), + // Status::Busy() may be returned if the transaction could not guarantee + // that there are no write conflicts. Status::TryAgain() may be returned + // if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain). + // + // If this transaction was created by a TransactionDB(), Status::Expired() + // may be returned if this transaction has lived for longer than + // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if + // TransactionOptions.skip_prepare is false and Prepare is not called on this + // transaction before Commit. + virtual Status Commit() = 0; + + // In addition to Commit(), also creates a snapshot of the db after all + // writes by this txn are visible to other readers. + // Caller is responsible for ensuring that + // snapshot1.seq < snapshot2.seq iff snapshot1.ts < snapshot2.ts + // in which snapshot1 and snapshot2 are created by this API. + // + // Currently only supported by WriteCommittedTxn. Calling this method on + // other types of transactions will return non-ok Status resulting from + // Commit() or a `NotSupported` error. + // This method returns OK if and only if the transaction successfully + // commits. It is possible that the transaction commits successfully but + // fails to create a timestamped snapshot. Therefore, the caller should check + // that the snapshot is created. + // notifier will be notified upon next snapshot creation. Nullable.
+ // snapshot: non-null output argument storing a shared_ptr to the newly + // created snapshot. + Status CommitAndTryCreateSnapshot( + std::shared_ptr<TransactionNotifier> notifier = + std::shared_ptr<TransactionNotifier>(), + TxnTimestamp ts = kMaxTxnTimestamp, + std::shared_ptr<const Snapshot>* snapshot = nullptr); + + // Discard all batched writes in this transaction. + virtual Status Rollback() = 0; + + // Records the state of the transaction for future calls to + // RollbackToSavePoint(). May be called multiple times to set multiple save + // points. + virtual void SetSavePoint() = 0; + + // Undo all operations in this transaction (Put, Merge, Delete, PutLogData) + // since the most recent call to SetSavePoint() and removes the most recent + // SetSavePoint(). + // If there is no previous call to SetSavePoint(), returns Status::NotFound() + virtual Status RollbackToSavePoint() = 0; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + + // This function is similar to DB::Get() except it will also read pending + // changes in this transaction. Currently, this function will return + // Status::MergeInProgress if the most recent write to the queried key in + // this batch is a Merge. + // + // If read_options.snapshot is not set, the current version of the key will + // be read. Calling SetSnapshot() does not affect the version of the data + // returned. + // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + // An overload of the above method that receives a PinnableSlice. + // For backward compatibility, a default implementation is provided. + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + + virtual std::vector<Status> MultiGet(const ReadOptions& options, + const std::vector<Slice>& keys, + std::vector<std::string>* values) = 0; + + // Batched version of MultiGet - see DBImpl::MultiGet().
Sub-classes are + // expected to override this with an implementation that calls + // DBImpl::MultiGet(). + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Get(options, column_family, keys[i], &values[i]); + } + } + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction and do_validate is true). If + // do_validate is false, ReadOptions::snapshot is expected to be nullptr so + // that GetForUpdate returns the latest committed value. The transaction + // behavior is the same regardless of whether the key exists or not. + // + // Note: Currently, this function will return Status::MergeInProgress + // if the most recent write to the queried key in this batch is a Merge. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to outside of this + // transaction. + // + // If this transaction was created by an OptimisticTransaction, GetForUpdate() + // could cause Commit() to fail. Otherwise, it could return any error + // that could be returned by DB::Get(). + // + // If this transaction was created by a TransactionDB, it can return + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // Status::MergeInProgress() if merge operations cannot be resolved. + // or other errors if this key could not be read. + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive = true, + const bool do_validate = true) = 0; + + // An overload of the above method that receives a PinnableSlice. + // For backward compatibility, a default implementation is provided. + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, + bool exclusive = true, + const bool do_validate = true) { + if (pinnable_val == nullptr) { + std::string* null_str = nullptr; + return GetForUpdate(options, column_family, key, null_str, exclusive, + do_validate); + } else { + auto s = GetForUpdate(options, column_family, key, + pinnable_val->GetSelf(), exclusive, do_validate); + pinnable_val->PinSelf(); + return s; + } + } + + // Get a range lock on [start_endpoint; end_endpoint].
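+ //
+ // For example (an illustrative sketch; `cf` and the key bounds are
+ // placeholders): lock everything in ["a"; "m"], including all keys that
+ // have "m" as a prefix:
+ //
+ //   Status s = txn->GetRangeLock(cf, Endpoint("a"), Endpoint("m", true));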
+ virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive = true, + const bool do_validate = true) = 0; + + virtual std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + + virtual std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, const std::vector<Slice>& keys, + std::vector<std::string>* values) = 0; + + // Returns an iterator that will iterate on all keys in the default + // column family including both keys in the DB and uncommitted keys in this + // transaction. + // + // Setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + // + // Caller is responsible for deleting the returned Iterator. + // + // The returned iterator is only valid until Commit(), Rollback(), or + // RollbackToSavePoint() is called. + virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; + + virtual Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) = 0; + + // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding + // functions in WriteBatch, but will also do conflict checking on the + // keys being written. + // + // assume_tracked=true expects the key to be already tracked. More + // specifically, it means that the key was previously tracked in the same + // savepoint, with the same exclusive flag, and at a lower sequence number. + // If valid, it skips ValidateSnapshot. Returns an error otherwise. + // + // If this Transaction was created on an OptimisticTransactionDB, these + // functions should always return Status::OK(). + // + // If this Transaction was created on a TransactionDB, the status returned + // can be: + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // or other errors on unexpected failures.
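+ //
+ // A hedged sketch of the typical calling pattern (the retry policy shown is
+ // the application's choice, not part of this API):
+ //
+ //   Status s = txn->Put("key", "value");
+ //   if (s.IsBusy() || s.IsTimedOut()) {
+ //     // Write conflict or lock timeout: roll back and let the application
+ //     // retry the whole transaction.
+ //     txn->Rollback();
+ //   } else if (s.ok()) {
+ //     s = txn->Commit();
+ //   }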
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a PessimisticTransactionDB, this + // function will still acquire locks necessary to make sure this write doesn't + // cause conflicts in other transactions and may return Status::Busy(). + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status SingleDeleteUntracked(const Slice& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // By default, all Put/Merge/Delete operations will be indexed in the + // transaction so that Get/GetForUpdate/GetIterator can search for these + // keys. + // + // If the caller does not want to fetch the keys about to be written, + // they may want to avoid indexing as a performance optimization. + // Calling DisableIndexing() will turn off indexing for all future + // Put/Merge/Delete operations until EnableIndexing() is called. 
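+ //
+ // For example (an illustrative bulk-write pattern; `big_batch` is a
+ // hypothetical container of key/value std::string pairs):
+ //
+ //   txn->DisableIndexing();
+ //   for (const auto& kv : big_batch) {  // keys this txn will not re-read
+ //     txn->Put(kv.first, kv.second);
+ //   }
+ //   txn->EnableIndexing();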
+ // + // If a key is Put/Merge/Deleted after DisableIndexing is called and then + // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is + // undefined. + virtual void DisableIndexing() = 0; + virtual void EnableIndexing() = 0; + + // Returns the number of distinct Keys being tracked by this transaction. + // If this transaction was created by a TransactionDB, this is the number of + // keys that are currently locked by this transaction. + // If this transaction was created by an OptimisticTransactionDB, this is the + // number of keys that need to be checked for conflicts at commit time. + virtual uint64_t GetNumKeys() const = 0; + + // Returns the number of Puts/Deletes/Merges that have been applied to this + // transaction so far. + virtual uint64_t GetNumPuts() const = 0; + virtual uint64_t GetNumDeletes() const = 0; + virtual uint64_t GetNumMerges() const = 0; + + // Returns the elapsed time in milliseconds since this Transaction began. + virtual uint64_t GetElapsedTime() const = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + // Change the value of TransactionOptions.lock_timeout (in milliseconds) for + // this transaction. + // Has no effect on OptimisticTransactions. + virtual void SetLockTimeout(int64_t timeout) = 0; + + // Return the WriteOptions that will be used during Commit() + virtual WriteOptions* GetWriteOptions() = 0; + + // Reset the WriteOptions that will be used during Commit(). + virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. + virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0; + + // Note: data in the commit-time-write-batch bypasses concurrency control, + // thus should be used with great caution. + // For write-prepared/write-unprepared transactions, + // GetCommitTimeWriteBatch() can be used only if the transaction is started + // with + // `TransactionOptions::use_only_the_last_commit_time_batch_for_recovery` set + // to true. Otherwise, it is possible that two uncommitted versions of the + // same key exist in the database due to the current implementation (see the + // explanation in WritePreparedTxn::CommitInternal). 
+ // During bottommost compaction, RocksDB may + // set the sequence numbers of both to zero once they become committed, + // causing the output SST file to have two identical internal keys. + virtual WriteBatch* GetCommitTimeWriteBatch() = 0; + + virtual void SetLogNumber(uint64_t log) { log_number_ = log; } + + virtual uint64_t GetLogNumber() const { return log_number_; } + + virtual Status SetName(const TransactionName& name) = 0; + + virtual TransactionName GetName() const { return name_; } + + virtual TransactionID GetID() const { return 0; } + + virtual bool IsDeadlockDetect() const { return false; } + + virtual std::vector<TransactionID> GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { + assert(false); + return std::vector<TransactionID>(); + } + + enum TransactionState { + STARTED = 0, + AWAITING_PREPARE = 1, + PREPARED = 2, + AWAITING_COMMIT = 3, + COMMITTED = 4, + COMMITED = COMMITTED, // old misspelled name + AWAITING_ROLLBACK = 5, + ROLLEDBACK = 6, + LOCKS_STOLEN = 7, + }; + + TransactionState GetState() const { return txn_state_; } + void SetState(TransactionState state) { txn_state_ = state; } + + // NOTE: Experimental feature + // The globally unique id with which the transaction is identified. This id + // might or might not be set depending on the implementation. Similarly, the + // implementation decides the point in the lifetime of a transaction at which + // it assigns the id. Although this is currently the case, the id is not + // guaranteed to remain the same across restarts. + uint64_t GetId() { return id_; } + + virtual Status SetReadTimestampForValidation(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual Status SetCommitTimestamp(TxnTimestamp /*ts*/) { + return Status::NotSupported("timestamp not supported"); + } + + virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; } + + protected: + explicit Transaction(const TransactionDB* /*db*/) {} + Transaction() : log_number_(0), txn_state_(STARTED) {} + + // the log in which the prepared section for this txn resides + // (for two phase commit) + uint64_t log_number_; + TransactionName name_; + + // Execution status of the transaction. + std::atomic<TransactionState> txn_state_; + + uint64_t id_ = 0; + virtual void SetId(uint64_t id) { + assert(id_ == 0); + id_ = id; + } + + virtual uint64_t GetLastLogNumber() const { return log_number_; } + + private: + friend class PessimisticTransactionDB; + friend class WriteUnpreparedTxnDB; + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 000000000..741c59574 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,508 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include <utility> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support.
+// +// See transaction.h and examples/transaction_example.cc + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexFactory; + +enum TxnDBWritePolicy { + WRITE_COMMITTED = 0, // write only the committed data + WRITE_PREPARED, // write data after the prepare phase of 2pc + WRITE_UNPREPARED // write data before the prepare phase of 2pc +}; + +constexpr uint32_t kInitialMaxDeadlocks = 5; + +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. + virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but uses std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector<RangeDeadlockInfo> path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. + // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap<ColumnFamilyId, RangeLockInfo>; + + // Lock Escalation barrier check function. + // It is called for a pair of endpoints A and B, such that A < B. + // If escalation_barrier_check_func(A, B)==true, then there's a lock + // escalation barrier between A and B, and lock escalation is not allowed + // to bridge the gap between A and B. + // + // The function may be called from any thread that acquires or releases + // locks. It should not throw exceptions. There is currently no way to return + // an error. + using EscalationBarrierFunc = + std::function<bool(const Endpoint& a, const Endpoint& b)>; + + // Set the user-provided barrier check function + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // Number of times lock acquisition had to wait for a conflicting lock + // to be released. This counts both successful waits (where the desired + // lock was acquired) and waits that timed out or failed with another + // error.
+ uint64_t lock_wait_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr<TransactionDBMutexFactory> mutex_factory); + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Stores the number of latest deadlocks to track + uint32_t max_num_deadlocks = kInitialMaxDeadlocks; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with its own + // separate mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (i.e. by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and the write will block indefinitely + // when acquiring a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock. While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second + + // If set, the TransactionDB will use this implementation of a mutex and + // condition variable for all transaction locking instead of the default + // mutex/condvar implementation. + std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; + + // The policy for when to write the data into the DB. The default policy is to + // write only the committed data (WRITE_COMMITTED). The data could be written + // before the commit phase. The DB then needs to provide the mechanisms to + // tell apart committed from uncommitted data. + TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks.
MyRocks uses merge operands for the autoinc column id without, + // however, obtaining locks. This breaks the assumption behind the rollback + // logic in myrocks. This hack of simply not rolling back merge operands works + // for the special way that myrocks uses these operands. + bool rollback_merge_operands = false; + + // nullptr means use default lock manager. + // Any other value means the user provides a custom lock manager. + std::shared_ptr<LockManagerHandle> lock_mgr_handle; + + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjunction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + int64_t default_write_batch_flush_threshold = 0; + + // This option is valid only for write-prepared/write-unprepared. Transaction + // will rely on this callback to determine if a key should be rolled back + // with Delete or SingleDelete when necessary. If the callback returns true, + // then SingleDelete should be used. If the callback is not callable or the + // callback returns false, then a Delete is used. + // The application should ensure thread-safety of this callback. + // The callback should not throw because RocksDB is not exception-safe. + // The callback may be removed if we allow mixing Delete and SingleDelete in + // the future. + std::function<bool(TransactionDB* /*db*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/)> + rollback_deletion_type_callback; + + private: + // 128 entries + // Should the default value change, please also update wp_snapshot_cache_bits + // in db_stress_gflags.cc + size_t wp_snapshot_cache_bits = static_cast<size_t>(7); + // 8m entries, 64MB size + // Should the default value change, please also update wp_commit_cache_bits + // in db_stress_gflags.cc + size_t wp_commit_cache_bits = static_cast<size_t>(23); + + // For testing, whether transaction name should be auto-generated or not. This + // is useful for write unprepared which requires named transactions. + bool autogenerate_name = false; + + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxn; + friend class WritePreparedTransactionTestBase; + friend class TransactionTestBase; + friend class MySQLStyleTransactionTest; + friend class StressTest; +}; + +struct TransactionOptions { + // Setting set_snapshot=true is the same as calling + // Transaction::SetSnapshot(). + bool set_snapshot = false; + + // Setting to true means that before acquiring locks, this transaction will + // check if doing so will cause a deadlock. If so, it will return with + // Status::Busy. The user should retry their transaction. + bool deadlock_detect = false; + + // If set, it states that the CommitTimeWriteBatch represents the latest state + // of the application, has only one sub-batch, i.e., no duplicate keys, and + // is meant to be used later during recovery. It enables an optimization to + // postpone updating the memtable with CommitTimeWriteBatch to only + // SwitchMemtable or recovery. + // This option does not affect write-committed. Only + // write-prepared/write-unprepared transactions will be affected.
+ bool use_only_the_last_commit_time_batch_for_recovery = false; + + // TODO(agiardullo): TransactionDB does not yet support comparators that allow + // two non-equal keys to be equivalent. I.e., cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being written by other writers. + int64_t expiration = -1; + + // The number of traversals to make during deadlock detection. + int64_t deadlock_detect_depth = 50; + + // The maximum number of bytes used for the write batch. 0 means no limit. + size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This can be used as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. It could also be used during recovery if (i) the + // application guarantees no conflicts between prepared transactions in the + // WAL and (ii) the application guarantees that recovered transactions will + // be rolled back/committed before new transactions start. + // Default: false + bool skip_concurrency_control = false; + + // In a pessimistic transaction, if this is true, you can skip Prepare + // before Commit; otherwise, you must Prepare before Commit. + bool skip_prepare = true; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; +}; + +// The per-write optimizations that do not involve transactions. TransactionDB +// implementation might or might not make use of the specified optimizations. +struct TransactionDBWriteOptimizations { + // If true, the application guarantees that the key-set in the write batch + // does not conflict with any concurrent transaction, and hence the + // concurrency control mechanism could be skipped for this + // write. + bool skip_concurrency_control = false; + // If true, the application guarantees that there is no duplicate <column + // family, key> in the write batch and any employed mechanism to handle + // duplicate keys could be skipped.
+ bool skip_duplicate_key_check = false; +}; + +struct KeyLockInfo { + std::string key; + std::vector<TransactionID> ids; + bool exclusive; +}; + +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector<TransactionID> ids; + bool exclusive; +}; + +struct DeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + std::string m_waiting_key; +}; + +struct DeadlockPath { + std::vector<DeadlockInfo> path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +class TransactionDB : public StackableDB { + public: + // Optimized version of ::Write that receives more optimization requests, + // such as skip_concurrency_control. + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations&, + WriteBatch* updates) { + // The default implementation ignores TransactionDBWriteOptimizations and + // falls back to the un-optimized version of ::Write + return Write(opts, updates); + } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED, `skip_duplicate_key_check` must + // additionally be set. + using StackableDB::DeleteRange; + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + // Open a TransactionDB similar to DB::Open(). + // Internally calls PrepareWrap() and WrapDB(). + // If the return status is not ok, then dbptr is set to nullptr. + static Status Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, + TransactionDB** dbptr); + // Note: PrepareWrap() may change parameters, make copies before the + // invocation if needed. + static void PrepareWrap(DBOptions* db_options, + std::vector<ColumnFamilyDescriptor>* column_families, + std::vector<size_t>* compaction_enabled_cf_indices); + // If the return status is not ok, then dbptr will be set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted, it will be set to nullptr. If the + // return status is ok, the ownership of db is transferred to dbptr. + static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles, + TransactionDB** dbptr); + // If the return status is not ok, then dbptr will be set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure.
+  // If it is properly deleted it will be set to nullptr. If the return
+  // status is ok, the ownership of db is transferred to dbptr.
+  static Status WrapStackableDB(
+      StackableDB* db, const TransactionDBOptions& txn_db_options,
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
+  // Since the destructor in StackableDB is virtual, this destructor is virtual
+  // too. The root db will be deleted by the base's destructor.
+  ~TransactionDB() override {}
+
+  // Starts a new Transaction.
+  //
+  // Caller is responsible for deleting the returned transaction when no
+  // longer needed.
+  //
+  // If old_txn is not null, BeginTransaction will reuse this Transaction
+  // handle instead of allocating a new one. This is an optimization to avoid
+  // extra allocations when repeatedly creating transactions.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const TransactionOptions& txn_options = TransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
+  virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
+
+  // Returns the set of all locks held.
+  //
+  // The mapping is column family id -> KeyLockInfo
+  virtual std::unordered_multimap<uint32_t, KeyLockInfo>
+  GetLockStatusData() = 0;
+
+  virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+  virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+  // Create a snapshot and assign ts to it. Return the snapshot to the caller.
+  // The snapshot-timestamp mapping is also tracked by the database.
+  // Caller must ensure there are no active writes when this API is called.
+  virtual std::pair<Status, std::shared_ptr<const Snapshot>>
+  CreateTimestampedSnapshot(TxnTimestamp ts) = 0;
+
+  // Return the latest timestamped snapshot if present.
+  std::shared_ptr<const Snapshot> GetLatestTimestampedSnapshot() const {
+    return GetTimestampedSnapshot(kMaxTxnTimestamp);
+  }
+  // Return the snapshot corresponding to the given timestamp. If ts is
+  // kMaxTxnTimestamp, then we return the latest timestamped snapshot if
+  // present. Otherwise, we return the snapshot whose timestamp is equal to
+  // `ts`. If no such snapshot exists, then we return null.
+  virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+      TxnTimestamp ts) const = 0;
+  // Release timestamped snapshots whose timestamps are less than or equal to
+  // ts.
+  virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0;
+
+  // Get all timestamped snapshots, which will be stored in
+  // timestamped_snapshots.
+  Status GetAllTimestampedSnapshots(
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const {
+    return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp,
+                                   timestamped_snapshots);
+  }
+
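+  // A hedged sketch of the basic transaction flow (illustrative only;
+  // assumes `txn_db` is a TransactionDB* obtained from Open()):
+  //
+  //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  //   txn->Put("key", "value");
+  //   Status s = txn->Commit();  // or txn->Rollback()
+  //   delete txn;
+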
+  // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub).
+  // timestamped_snapshots will be cleared and will contain the returned
+  // snapshots.
+  virtual Status GetTimestampedSnapshots(
+      TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const = 0;
+
+ protected:
+  // To create a TransactionDB, call Open().
+  // The ownership of db is transferred to the base StackableDB.
+  explicit TransactionDB(DB* db) : StackableDB(db) {}
+  // No copying allowed
+  TransactionDB(const TransactionDB&) = delete;
+  void operator=(const TransactionDB&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 000000000..e352f325a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The TransactionDBMutex and TransactionDBCondVar APIs allow applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutex {
+ public:
+  virtual ~TransactionDBMutex() {}
+
+  // Attempt to acquire lock. Return OK on success, or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status Lock() = 0;
+
+  // Attempt to acquire lock. If timeout is non-negative, the operation may
+  // fail after this many microseconds.
+  // Returns OK on success,
+  // TimedOut if timed out,
+  // or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+  // Unlock a mutex that was successfully locked by Lock() or TryLockFor().
+  virtual void UnLock() = 0;
+};
+
+class TransactionDBCondVar {
+ public:
+  virtual ~TransactionDBCondVar() {}
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(). Wait() will be called with mutex locked.
+  // Returns OK if notified.
+  // Returns non-OK if TransactionDB should stop waiting and fail the
+  // operation.
+  // May return OK spuriously even if not notified.
+  virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(), or if the timeout is reached.
+  // Wait() will be called with mutex locked.
+  //
+  // If timeout is non-negative, the operation should fail after this many
+  // microseconds.
+  // If implementing a custom version of this class, the implementation may
+  // choose to ignore the timeout.
+  //
+  // Returns OK if notified.
+  // Returns TimedOut if timeout is reached.
+  // Returns other status if TransactionDB should otherwise stop waiting and
+  // fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                         int64_t timeout_time) = 0;
+
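+  // A hedged sketch (illustrative only) of a trivial TransactionDBMutex
+  // built on the standard library; a matching TransactionDBCondVar can be
+  // built on std::condition_variable in the same way:
+  //
+  //   class SimpleTxnMutex : public TransactionDBMutex {
+  //    public:
+  //     Status Lock() override {
+  //       mu_.lock();
+  //       return Status::OK();
+  //     }
+  //     Status TryLockFor(int64_t /*timeout_time*/) override {
+  //       // Implementations may honor the timeout; ignoring it (as here) is
+  //       // permitted but can delay lock-timeout handling.
+  //       return Lock();
+  //     }
+  //     void UnLock() override { mu_.unlock(); }
+  //
+  //    private:
+  //     std::mutex mu_;
+  //   };
+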
+  // If any threads are waiting on *this, unblock at least one of the
+  // waiting threads.
+  virtual void Notify() = 0;
+
+  // Unblocks all threads waiting on *this.
+  virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+class TransactionDBMutexFactory {
+ public:
+  // Create a TransactionDBMutex object.
+  virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+  // Create a TransactionDBCondVar object.
+  virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+  virtual ~TransactionDBMutexFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 000000000..84dc11a31
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,309 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+class ReadCallback;
+struct ReadOptions;
+struct DBOptions;
+
+enum WriteType {
+  kPutRecord,
+  kMergeRecord,
+  kDeleteRecord,
+  kSingleDeleteRecord,
+  kDeleteRangeRecord,
+  kLogDataRecord,
+  kXIDRecord,
+  kUnknownRecord,
+};
+
+// An entry for a Put, Merge, Delete, or SingleDelete operation in a write
+// batch. Used in WBWIIterator.
+struct WriteEntry {
+  WriteType type = kUnknownRecord;
+  Slice key;
+  Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+  virtual ~WBWIIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  virtual void SeekToFirst() = 0;
+
+  virtual void SeekToLast() = 0;
+
+  virtual void Seek(const Slice& key) = 0;
+
+  virtual void SeekForPrev(const Slice& key) = 0;
+
+  virtual void Next() = 0;
+
+  virtual void Prev() = 0;
+
+  // The returned WriteEntry is only valid until the next mutation of the
+  // WriteBatchWithIndex.
+  virtual WriteEntry Entry() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge(), Delete(), or SingleDelete(), the same function of the
+// wrapped WriteBatch will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
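+//
+// A hedged usage sketch (illustrative only; assumes `db` is an open DB*):
+//
+//   WriteBatchWithIndex wbwi;
+//   wbwi.Put("key", "value");
+//   std::unique_ptr<WBWIIterator> it(wbwi.NewIterator());
+//   it->Seek("key");  // finds the entry that was just inserted
+//   Status s = db->Write(WriteOptions(), wbwi.GetWriteBatch());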
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+  // backup_index_comparator: the backup comparator used to compare keys
+  //                within the same column family. If no column family is
+  //                given in the interface, or we can't find a column family
+  //                for the column family handle passed in,
+  //                backup_index_comparator will be used for the column family.
+  // reserved_bytes: reserved bytes in underlying WriteBatch
+  // max_bytes: maximum size of underlying WriteBatch in bytes
+  // overwrite_key: if true, overwrite the key in the index when inserting
+  //                the same key as previously, so the iterator will never
+  //                show two entries with the same key.
+  explicit WriteBatchWithIndex(
+      const Comparator* backup_index_comparator = BytewiseComparator(),
+      size_t reserved_bytes = 0, bool overwrite_key = false,
+      size_t max_bytes = 0, size_t protection_bytes_per_key = 0);
+
+  ~WriteBatchWithIndex() override;
+  WriteBatchWithIndex(WriteBatchWithIndex&&);
+  WriteBatchWithIndex& operator=(WriteBatchWithIndex&&);
+
+  using WriteBatchBase::Put;
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+
+  Status Put(const Slice& key, const Slice& value) override;
+
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& ts, const Slice& value) override;
+
+  Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    if (!column_family) {
+      return Status::InvalidArgument(
+          "Cannot call this method without a column family handle");
+    }
+
+    return Status::NotSupported(
+        "PutEntity not supported by WriteBatchWithIndex");
+  }
+
+  using WriteBatchBase::Merge;
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value) override;
+
+  Status Merge(const Slice& key, const Slice& value) override;
+  Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*ts*/, const Slice& /*value*/) override {
+    return Status::NotSupported(
+        "Merge does not support user-defined timestamp");
+  }
+
+  using WriteBatchBase::Delete;
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const Slice& key) override;
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                const Slice& ts) override;
+
+  using WriteBatchBase::SingleDelete;
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const Slice& key) override;
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override;
+
+  using WriteBatchBase::DeleteRange;
+  Status DeleteRange(ColumnFamilyHandle* /* column_family */,
+                     const Slice& /* begin_key */,
+                     const Slice& /* end_key */) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+  Status DeleteRange(const Slice& /* begin_key */,
+                     const Slice& /* end_key */) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+  Status DeleteRange(ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*begin_key*/, const Slice& /*end_key*/,
+                     const Slice& /*ts*/) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+
+  using WriteBatchBase::PutLogData;
+  Status PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  void Clear() override;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override;
+
+  // Create an iterator of a column family.
+  // Users can call iterator.Seek() to search for the next entry at or after
+  // a key. Keys will be iterated in the order given by index_comparator. For
+  // multiple updates on the same key, each update will be returned as a
+  // separate entry, in the order of update time.
+  //
+  // The returned iterator should be deleted by the caller.
+  WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+  // Create an iterator of the default column family.
+  WBWIIterator* NewIterator();
+
+  // Creates a new Iterator that uses a WBWIIterator as a delta and
+  // base_iterator as the base.
+  //
+  // This function is only supported if the WriteBatchWithIndex was
+  // constructed with overwrite_key=true.
+  //
+  // The returned iterator should be deleted by the caller.
+  // The base_iterator is now 'owned' by the returned iterator. Deleting the
+  // returned iterator will also delete the base_iterator.
+  //
+  // Updating the write batch with the current key of the iterator is not
+  // safe. We strongly recommend against it, as it will invalidate the current
+  // key() and value() of the iterator. This invalidation happens even before
+  // the write batch update finishes. The state may recover after Next() is
+  // called.
+  Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
+                                Iterator* base_iterator,
+                                const ReadOptions* opts = nullptr);
+  // default column family
+  Iterator* NewIteratorWithBase(Iterator* base_iterator);
+
+  // Similar to DB::Get() but will only read the key from this batch.
+  // If the batch does not have enough data to resolve Merge operations,
+  // MergeInProgress status may be returned.
+  Status GetFromBatch(ColumnFamilyHandle* column_family,
+                      const DBOptions& options, const Slice& key,
+                      std::string* value);
+
+  // Similar to the previous function but does not require a column_family.
+  // Note: An InvalidArgument status will be returned if there are any Merge
+  // operators for this key. Use the previous method instead.
+  Status GetFromBatch(const DBOptions& options, const Slice& key,
+                      std::string* value) {
+    return GetFromBatch(nullptr, options, key, value);
+  }
+
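+  // A hedged usage sketch (illustrative only):
+  //
+  //   WriteBatchWithIndex wbwi;
+  //   wbwi.Put("key", "new_value");
+  //   std::string value;
+  //   Status s = wbwi.GetFromBatch(DBOptions(), "key", &value);
+  //   // On success, value == "new_value"; a key not present in the batch
+  //   // yields Status::NotFound().
+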
+  // Similar to DB::Get() but will also read writes from this batch.
+  //
+  // This function will query both this batch and the DB and then merge
+  // the results using the DB's merge operator (if the batch contains any
+  // merge requests).
+  //
+  // Setting read_options.snapshot will affect what is read from the DB
+  // but will NOT change which keys are read from the batch (the keys in
+  // this batch do not yet belong to any snapshot and will be fetched
+  // regardless).
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           const Slice& key, std::string* value);
+
+  // An overload of the above method that receives a PinnableSlice
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           const Slice& key, PinnableSlice* value);
+
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value);
+
+  // An overload of the above method that receives a PinnableSlice
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value);
+
+  void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family,
+                              const size_t num_keys, const Slice* keys,
+                              PinnableSlice* values, Status* statuses,
+                              bool sorted_input);
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Removes all entries in this batch (Put, Merge, Delete, SingleDelete,
+  // PutLogData) since the most recent call to SetSavePoint() and removes the
+  // most recent save point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  //
+  // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+  //
+  // Returns Status::OK() on success,
+  // Status::NotFound() if no previous call to SetSavePoint(),
+  // or other Status on corruption.
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  void SetMaxBytes(size_t max_bytes) override;
+  size_t GetDataSize() const;
+
+ private:
+  friend class PessimisticTransactionDB;
+  friend class WritePreparedTxn;
+  friend class WriteUnpreparedTxn;
+  friend class WriteBatchWithIndex_SubBatchCnt_Test;
+  friend class WriteBatchWithIndexInternal;
+  // Returns the number of sub-batches inside the write batch. A sub-batch
+  // starts right before inserting a key that is a duplicate of a key in the
+  // last sub-batch.
+  size_t SubBatchCnt();
+
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value, ReadCallback* callback);
+  void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family,
+                              const size_t num_keys, const Slice* keys,
+                              PinnableSlice* values, Status* statuses,
+                              bool sorted_input, ReadCallback* callback);
+  struct Rep;
+  std::unique_ptr<Rep> rep;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
new file mode 100644
index 000000000..c54f3a2c3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// NOTE: in 'main' development branch, this should be the *next*
+// minor or major version number planned for release.
+#define ROCKSDB_MAJOR 7
+#define ROCKSDB_MINOR 9
+#define ROCKSDB_PATCH 2
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate
+// these at some point.
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a set of properties indicating how/when/where this version of
+// RocksDB was created.
+const std::unordered_map<std::string, std::string>& GetRocksBuildProperties();
+
+// Returns the current version of RocksDB as a string (e.g. "6.16.0").
+// If with_patch is true, the patch is included (6.16.x).
+// Otherwise, only the major and minor version are included (6.16).
+std::string GetRocksVersionAsString(bool with_patch = true);
+
+// Gets the set of build properties (@see GetRocksBuildProperties) into a
+// string. Properties are returned one-per-line, with the first line being:
+// "<program> from RocksDB <version>".
+// If verbose is true, the full set of properties is
+// printed. If verbose is false, only the version information (@see
+// GetRocksVersionAsString) is printed.
+std::string GetRocksBuildInfoAsString(const std::string& program,
+                                      bool verbose = false);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h
new file mode 100644
index 000000000..3e66c39e4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wal_filter.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBatch;
+struct ConfigOptions;
+
+// WalFilter allows an application to inspect write-ahead-log (WAL)
+// records or modify their processing on recovery.
+// Please see the details below.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class WalFilter : public Customizable {
+ public:
+  static const char* Type() { return "WalFilter"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value, WalFilter** result);
+  enum class WalProcessingOption {
+    // Continue processing as usual
+    kContinueProcessing = 0,
+    // Ignore the current record but continue processing of log(s)
+    kIgnoreCurrentRecord = 1,
+    // Stop replay of logs and discard logs
+    // Logs won't be replayed on subsequent recovery
+    kStopReplay = 2,
+    // Corrupted record detected by filter
+    kCorruptedRecord = 3,
+    // Marker for enum count
+    kWalProcessingOptionMax = 4
+  };
+
+  virtual ~WalFilter() {}
+
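+  // A hedged sketch (illustrative only) of a filter that skips every record
+  // during recovery; a real filter would inspect `batch` and decide per
+  // record. The filter would be installed before opening the DB, e.g. via
+  // DBOptions::wal_filter (field name assumed here).
+  //
+  //   class SkipAllFilter : public WalFilter {
+  //    public:
+  //     WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+  //                                   WriteBatch* /*new_batch*/,
+  //                                   bool* /*batch_changed*/) const override {
+  //       return WalProcessingOption::kIgnoreCurrentRecord;
+  //     }
+  //     const char* Name() const override { return "SkipAllFilter"; }
+  //   };
+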
+  // Provide a ColumnFamily->LogNumber map to the filter
+  // so that the filter can determine whether a log number applies to a given
+  // column family (i.e. that log hasn't been flushed to SST already for the
+  // column family).
+  // We also pass in a name->id map, as only the name is known during
+  // recovery (handles are opened post-recovery), while write batch callbacks
+  // happen in terms of column family id.
+  //
+  // @params cf_lognumber_map column_family_id to lognumber map
+  // @params cf_name_id_map   column_family_name to column_family_id map
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/,
+      const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {}
+
+  // LogRecordFound is invoked for each log record encountered in all the
+  // logs during replay on recovery. This method can be used to:
+  //   * inspect the record (using the batch parameter)
+  //   * ignore the current record
+  //     (by returning WalProcessingOption::kIgnoreCurrentRecord)
+  //   * report a corrupted record
+  //     (by returning WalProcessingOption::kCorruptedRecord)
+  //   * stop log replay
+  //     (by returning WalProcessingOption::kStopReplay) - please note that
+  //     this implies discarding the logs from the current record onwards.
+  //
+  // @params log_number     log_number of the current log.
+  //                        Filter might use this to determine if the log
+  //                        record is applicable to a certain column family.
+  // @params log_file_name  log file name - only for informational purposes
+  // @params batch          batch encountered in the log during recovery
+  // @params new_batch      new_batch to populate if filter wants to change
+  //                        the batch (for example to filter some records out,
+  //                        or alter some records).
+  //                        Please note that the new batch MUST NOT contain
+  //                        more records than the original, else recovery
+  //                        will fail.
+  // @params batch_changed  Whether batch was changed by the filter.
+  //                        It must be set to true if new_batch was populated,
+  //                        else new_batch has no effect.
+  // @returns Processing option for the current record.
+  //          Please see the WalProcessingOption enum above for details.
+  virtual WalProcessingOption LogRecordFound(
+      unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+    // Default implementation falls back to the older function for
+    // compatibility
+    return LogRecord(batch, new_batch, batch_changed);
+  }
+
+  // Please see the comments for LogRecordFound above. This function is for
+  // compatibility only and contains a subset of parameters.
+  // New code should use the function above.
+  virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                        WriteBatch* /*new_batch*/,
+                                        bool* /*batch_changed*/) const {
+    return WalProcessingOption::kContinueProcessing;
+  }
+
+  // Returns a name that identifies this WAL filter.
+  // The name will be printed to the LOG file on start up for diagnosis.
+  virtual const char* Name() const override = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wide_columns.h b/src/rocksdb/include/rocksdb/wide_columns.h
new file mode 100644
index 000000000..7ddc61f03
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wide_columns.h
@@ -0,0 +1,171 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <ostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Class representing a wide column, which is defined as a pair of column name
+// and column value.
+class WideColumn { + public: + WideColumn() = default; + + // Initializes a WideColumn object by forwarding the name and value + // arguments to the corresponding member Slices. This makes it possible to + // construct a WideColumn using combinations of const char*, const + // std::string&, const Slice& etc., for example: + // + // constexpr char foo[] = "foo"; + // const std::string bar("bar"); + // WideColumn column(foo, bar); + template <typename N, typename V> + WideColumn(N&& name, V&& value) + : name_(std::forward<N>(name)), value_(std::forward<V>(value)) {} + + // Initializes a WideColumn object by forwarding the elements of + // name_tuple and value_tuple to the constructors of the corresponding member + // Slices. This makes it possible to initialize the Slices using the Slice + // constructors that take more than one argument, for example: + // + // constexpr char foo_name[] = "foo_name"; + // constexpr char bar_value[] = "bar_value"; + // WideColumn column(std::piecewise_construct, + // std::forward_as_tuple(foo_name, 3), + // std::forward_as_tuple(bar_value, 3)); + template <typename NTuple, typename VTuple> + WideColumn(std::piecewise_construct_t, NTuple&& name_tuple, + VTuple&& value_tuple) + : name_(std::make_from_tuple<Slice>(std::forward<NTuple>(name_tuple))), + value_(std::make_from_tuple<Slice>(std::forward<VTuple>(value_tuple))) { + } + + const Slice& name() const { return name_; } + const Slice& value() const { return value_; } + + Slice& name() { return name_; } + Slice& value() { return value_; } + + private: + Slice name_; + Slice value_; +}; + +// Note: column names and values are compared bytewise. +inline bool operator==(const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name() == rhs.name() && lhs.value() == rhs.value(); +} + +inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) { + return !(lhs == rhs); +} + +inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) { + const bool hex = + (os.flags() & std::ios_base::basefield) == std::ios_base::hex; + os << column.name().ToString(hex) << ':' << column.value().ToString(hex); + + return os; +} + +// A collection of wide columns. +using WideColumns = std::vector<WideColumn>; + +// The anonymous default wide column (an empty Slice). +extern const Slice kDefaultWideColumnName; + +// An empty set of wide columns. +extern const WideColumns kNoWideColumns; + +// A self-contained collection of wide columns. Used for the results of +// wide-column queries. 
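+//
+// A hedged usage sketch (illustrative only): instances are typically
+// populated by RocksDB's wide-column read APIs and then inspected through
+// columns():
+//
+//   PinnableWideColumns result;
+//   // ... filled in by a wide-column query ...
+//   for (const WideColumn& column : result.columns()) {
+//     // use column.name() and column.value()
+//   }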
+class PinnableWideColumns { + public: + const WideColumns& columns() const { return columns_; } + size_t serialized_size() const { return value_.size(); } + + void SetPlainValue(const Slice& value); + void SetPlainValue(const Slice& value, Cleanable* cleanable); + + Status SetWideColumnValue(const Slice& value); + Status SetWideColumnValue(const Slice& value, Cleanable* cleanable); + + void Reset(); + + private: + void CopyValue(const Slice& value); + void PinOrCopyValue(const Slice& value, Cleanable* cleanable); + void CreateIndexForPlainValue(); + Status CreateIndexForWideColumns(); + + PinnableSlice value_; + WideColumns columns_; +}; + +inline void PinnableWideColumns::CopyValue(const Slice& value) { + value_.PinSelf(value); +} + +inline void PinnableWideColumns::PinOrCopyValue(const Slice& value, + Cleanable* cleanable) { + if (!cleanable) { + CopyValue(value); + return; + } + + value_.PinSlice(value, cleanable); +} + +inline void PinnableWideColumns::CreateIndexForPlainValue() { + columns_ = WideColumns{{kDefaultWideColumnName, value_}}; +} + +inline void PinnableWideColumns::SetPlainValue(const Slice& value) { + CopyValue(value); + CreateIndexForPlainValue(); +} + +inline void PinnableWideColumns::SetPlainValue(const Slice& value, + Cleanable* cleanable) { + PinOrCopyValue(value, cleanable); + CreateIndexForPlainValue(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) { + CopyValue(value); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value, + Cleanable* cleanable) { + PinOrCopyValue(value, cleanable); + return CreateIndexForWideColumns(); +} + +inline void PinnableWideColumns::Reset() { + value_.Reset(); + columns_.clear(); +} + +inline bool operator==(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return lhs.columns() == rhs.columns(); +} + +inline bool operator!=(const PinnableWideColumns& lhs, + const PinnableWideColumns& rhs) { + return !(lhs == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h new file mode 100644 index 000000000..61ba5a739 --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_batch.h @@ -0,0 +1,494 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. 
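+//
+// A hedged end-to-end sketch (illustrative only; assumes `db` is an open
+// DB*):
+//
+//   WriteBatch batch;
+//   batch.Put("key", "v1");
+//   batch.Delete("other_key");
+//   Status s = db->Write(WriteOptions(), &batch);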
+ +#pragma once + +#include <stdint.h> + +#include <atomic> +#include <functional> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/status.h" +#include "rocksdb/write_batch_base.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class ColumnFamilyHandle; +struct SavePoints; +struct SliceParts; + +struct SavePoint { + size_t size; // size of rep_ + int count; // count of elements in rep_ + uint32_t content_flags; + + SavePoint() : size(0), count(0), content_flags(0) {} + + SavePoint(size_t _size, int _count, uint32_t _flags) + : size(_size), count(_count), content_flags(_flags) {} + + void clear() { + size = 0; + count = 0; + content_flags = 0; + } + + bool is_cleared() const { return (size | count | content_flags) == 0; } +}; + +class WriteBatch : public WriteBatchBase { + public: + explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0) + : WriteBatch(reserved_bytes, max_bytes, 0, 0) {} + + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key, size_t default_cf_ts_sz); + ~WriteBatch() override; + + using WriteBatchBase::Put; + // Store the mapping "key->value" in the database. + // The following Put(..., const Slice& key, ...) API can also be used when + // user-defined timestamp is enabled as long as `key` points to a contiguous + // buffer with timestamp appended after user key. The caller is responsible + // for setting up the memory buffer pointed to by `key`. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) override; + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + // The following Put(..., const SliceParts& key, ...) API can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the + // column family specified by "column_family". + using WriteBatchBase::PutEntity; + Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) override; + + using WriteBatchBase::Delete; + // If the database contains a mapping for "key", erase it. Else do nothing. + // The following Delete(..., const Slice& key) can be used when user-defined + // timestamp is enabled as long as `key` points to a contiguous buffer with + // timestamp appended after user key. The caller is responsible for setting + // up the memory buffer pointed to by `key`. 
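+  //
+  // A hedged sketch of the contiguous-buffer convention described above
+  // (illustrative only; `user_key` and `ts` are hypothetical Slices, with
+  // `ts` sized to the column family's timestamp format):
+  //
+  //   std::string buf;
+  //   buf.append(user_key.data(), user_key.size());
+  //   buf.append(ts.data(), ts.size());  // timestamp follows the user key
+  //   batch.Delete(cf_handle, Slice(buf));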
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + // These two variants of Delete(..., const SliceParts& key) can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + using WriteBatchBase::SingleDelete; + // WriteBatch implementation of DB::SingleDelete(). See db.h. + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) override; + + // variant that takes SliceParts + Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + using WriteBatchBase::DeleteRange; + // WriteBatch implementation of DB::DeleteRange(). See db.h. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key) override; + Status DeleteRange(const Slice& begin_key, const Slice& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + // begin_key and end_key should be user keys without timestamp. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key, const Slice& ts) override; + + // variant that takes SliceParts + Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) override; + Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + + using WriteBatchBase::Merge; + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*ts*/, const Slice& /*value*/) override; + + // variant that takes SliceParts + Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Merge(const SliceParts& key, const SliceParts& value) override { + return Merge(nullptr, key, value); + } + + using WriteBatchBase::PutLogData; + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. 
The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the
+  // batch.
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  Status PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  // Clear all updates buffered in this batch.
+  void Clear() override;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  // Support for iterating over the contents of a batch.
+  // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
+  class Handler {
+   public:
+    virtual ~Handler();
+    // All handler functions in this class provide default implementations so
+    // we won't break existing clients of Handler on a source code level when
+    // adding a new member function.
+
+    // The default implementation will just call Put() without a column
+    // family, for backwards compatibility. If the column family is not the
+    // default one, the function returns an InvalidArgument status.
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        // Put() historically doesn't return status. We didn't want to be
+        // backwards incompatible so we didn't change the return status
+        // (this is a public API). We do an ordinary Put() and return
+        // Status::OK()
+        Put(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and PutCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutEntityCF(uint32_t /* column_family_id */,
+                               const Slice& /* key */,
+                               const Slice& /* entity */) {
+      return Status::NotSupported("PutEntityCF not implemented");
+    }
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Delete(const Slice& /*key*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        SingleDelete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and SingleDeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void SingleDelete(const Slice& /*key*/) {}
+
+    // If user-defined timestamp is enabled, then `begin_key` and `end_key`
+    // both include timestamp.
+    virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
+                                 const Slice& /*begin_key*/,
+                                 const Slice& /*end_key*/) {
+      return Status::InvalidArgument("DeleteRangeCF not implemented");
+    }
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) {
+      if (column_family_id == 0) {
+        Merge(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and MergeCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
+                                  const Slice& /*key*/,
+                                  const Slice& /*value*/) {
+      return Status::InvalidArgument("PutBlobIndexCF not implemented");
+    }
+
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+
+    virtual Status MarkBeginPrepare(bool = false) {
+      return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
+    }
+
+    virtual Status MarkEndPrepare(const Slice& /*xid*/) {
+      return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
+    }
+
+    virtual Status MarkNoop(bool /*empty_batch*/) {
+      return Status::InvalidArgument("MarkNoop() handler not defined.");
+    }
+
+    virtual Status MarkRollback(const Slice& /*xid*/) {
+      return Status::InvalidArgument("MarkRollback() handler not defined.");
+    }
+
+    virtual Status MarkCommit(const Slice& /*xid*/) {
+      return Status::InvalidArgument("MarkCommit() handler not defined.");
+    }
+
+    virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/,
+                                           const Slice& /*commit_ts*/) {
+      return Status::InvalidArgument(
+          "MarkCommitWithTimestamp() handler not defined.");
+    }
+
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+
+   protected:
+    friend class WriteBatchInternal;
+    enum class OptionState {
+      kUnknown,
+      kDisabled,
+      kEnabled,
+    };
+    virtual OptionState WriteAfterCommit() const {
+      return OptionState::kUnknown;
+    }
+    virtual OptionState WriteBeforePrepare() const {
+      return OptionState::kUnknown;
+    }
+  };
+  Status Iterate(Handler* handler) const;
+
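+  // A hedged sketch (illustrative only) of a Handler that counts Put records
+  // in the default column family when passed to Iterate():
+  //
+  //   class PutCounter : public WriteBatch::Handler {
+  //    public:
+  //     void Put(const Slice& /*key*/, const Slice& /*value*/) override {
+  //       ++puts;
+  //     }
+  //     int puts = 0;
+  //   };
+  //
+  //   PutCounter counter;
+  //   Status s = batch.Iterate(&counter);
+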
+  // Retrieve the serialized version of this batch.
+  const std::string& Data() const { return rep_; }
+
+  // Retrieve the data size of the batch.
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch.
+  uint32_t Count() const;
+
+  // Returns true if PutCF will be called during Iterate.
+  bool HasPut() const;
+
+  // Returns true if PutEntityCF will be called during Iterate.
+  bool HasPutEntity() const;
+
+  // Returns true if DeleteCF will be called during Iterate.
+  bool HasDelete() const;
+
+  // Returns true if SingleDeleteCF will be called during Iterate.
+  bool HasSingleDelete() const;
+
+  // Returns true if DeleteRangeCF will be called during Iterate.
+  bool HasDeleteRange() const;
+
+  // Returns true if MergeCF will be called during Iterate.
+  bool HasMerge() const;
+
+  // Returns true if MarkBeginPrepare will be called during Iterate.
+  bool HasBeginPrepare() const;
+
+  // Returns true if MarkEndPrepare will be called during Iterate.
+  bool HasEndPrepare() const;
+
+  // Returns true if MarkCommit will be called during Iterate.
+  bool HasCommit() const;
+
+  // Returns true if MarkRollback will be called during Iterate.
+  bool HasRollback() const;
+
+  // Experimental.
+  //
+  // Update timestamps of existing entries in the write batch if
+  // applicable. If a key is intended for a column family that disables
+  // timestamps, then this API won't set the timestamp for this key.
+  // This requires that all timestamp-enabled keys in the write batch
+  // (possibly from multiple column families) have timestamps of the same
+  // format.
+  //
+  // ts_sz_func: callable object to obtain the timestamp sizes of column
+  // families. If ts_sz_func() accesses data structures, then the caller of
+  // this API must guarantee thread-safety. Like other parts of RocksDB, this
+  // API is not exception-safe. Therefore, ts_sz_func() must not throw.
+  //
+  // in:  cf, the column family id.
+  // ret: timestamp size of the given column family. Returning
+  //      std::numeric_limits<size_t>::max() indicates "don't know" or
+  //      "column family info not found" and will cause UpdateTimestamps() to
+  //      fail.
+  // size_t ts_sz_func(uint32_t cf);
+  Status UpdateTimestamps(const Slice& ts,
+                          std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
+
+  // Verify the per-key-value checksums of this write batch.
+  // A Corruption status will be returned if the verification fails.
+  // If this write batch does not have per-key-value checksums,
+  // an OK status will be returned.
+  Status VerifyChecksum() const;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override { return this; }
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(const std::string& rep);
+  explicit WriteBatch(std::string&& rep);
+
+  WriteBatch(const WriteBatch& src);
+  WriteBatch(WriteBatch&& src) noexcept;
+  WriteBatch& operator=(const WriteBatch& src);
+  WriteBatch& operator=(WriteBatch&& src);
+
+  // Marks this point in the WriteBatch as the last record to
+  // be inserted into the WAL, provided the WAL is enabled.
+  void MarkWalTerminationPoint();
+  const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
+
+  void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
+
+  struct ProtectionInfo;
+  size_t GetProtectionBytesPerKey() const;
+
+ private:
+  friend class WriteBatchInternal;
+  friend class LocalSavePoint;
+  // TODO(myabandeh): this is needed for a hack to collapse the write batch
+  // and remove duplicate keys. Remove it when the hack is replaced with a
+  // proper solution.
+  friend class WriteBatchWithIndex;
+  std::unique_ptr<SavePoints> save_points_;
+
+  // When sending a WriteBatch through WriteImpl we might want to
+  // specify that only the first x records of the batch be written to
+  // the WAL.
+  SavePoint wal_term_point_;
+
+  // Is the content of the batch the application's latest state that is meant
+  // only to be used for recovery? Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
+  // more details.
+  bool is_latest_persistent_state_ = false;
+
+  // False if all keys are from column families that disable user-defined
+  // timestamps OR UpdateTimestamps() has been called at least once.
+  // This flag will be set to true if any of the above Put(), Delete(),
+  // SingleDelete(), etc. APIs are called at least once.
+  // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag
+  // to true because the assumption is that these APIs have already set the
+  // timestamps to desired values.
+  bool needs_in_place_update_ts_ = false;
+
+  // True if the write batch contains at least one key from a column family
+  // that enables user-defined timestamps.
+  bool has_key_with_ts_ = false;
+
+  // For HasXYZ. Mutable to allow lazy computation of results.
+  mutable std::atomic<uint32_t> content_flags_;
+
+  // Performs deferred computation of content_flags if necessary.
+  uint32_t ComputeContentFlags() const;
+
+  // Maximum size of rep_.
+  size_t max_bytes_;
+
+  std::unique_ptr<ProtectionInfo> prot_info_;
+
+  size_t default_cf_ts_sz_ = 0;
+
+ protected:
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
new file mode 100644
index 000000000..f6f39ef0b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WriteBatchWithIndex for an
+// indexed implementation.
+class WriteBatchBase {
+ public:
+  virtual ~WriteBatchBase() {}
+
+  // Store the mapping "key->value" in the database.
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const Slice& key, const Slice& value) = 0;
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+
+  // Variant of Put() that gathers output like writev(2). The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Put(const SliceParts& key, const SliceParts& value); + + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the + // column family specified by "column_family". + virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, + const WideColumns& columns) = 0; + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts, const Slice& value) = 0; + + // variant that takes SliceParts + virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Merge(const SliceParts& key, const SliceParts& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status Delete(const SliceParts& key); + + // If the database contains a mapping for "key", erase it. Expects that the + // key was not overwritten. Else do nothing. + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status SingleDelete(const SliceParts& key); + + // If the database contains mappings in the range ["begin_key", "end_key"), + // erase them. Else do nothing. + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) = 0; + + // variant that takes SliceParts + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key); + virtual Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + virtual Status PutLogData(const Slice& blob) = 0; + + // Clear all updates buffered in this batch. 
+  virtual void Clear() = 0;
+
+  // Convert this batch into a WriteBatch. This is an abstracted way of
+  // converting any WriteBatchBase (e.g., WriteBatchWithIndex) into a basic
+  // WriteBatch.
+  virtual WriteBatch* GetWriteBatch() = 0;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  virtual void SetSavePoint() = 0;
+
+  // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  virtual Status RollbackToSavePoint() = 0;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  virtual Status PopSavePoint() = 0;
+
+  // Sets the maximum size of the write batch in bytes. 0 means no limit.
+  virtual void SetMaxBytes(size_t max_bytes) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h
new file mode 100644
index 000000000..7fb18196d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBufferManager is for managing memory allocation for one or more
+// MemTables.
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <list>
+#include <mutex>
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CacheReservationManager;
+
+// Interface to block and signal DB instances, intended for RocksDB
+// internal use only. Each DB instance contains a ptr to StallInterface.
+class StallInterface {
+ public:
+  virtual ~StallInterface() {}
+
+  virtual void Block() = 0;
+
+  virtual void Signal() = 0;
+};
+
+class WriteBufferManager final {
+ public:
+  // Parameters:
+  // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be
+  // capped, memory_usage() won't be valid, and ShouldFlush() will always
+  // return false.
+  //
+  // cache_: if `cache` is provided, we'll put dummy entries in the cache and
+  // cost the memory allocated to the cache. It can be used even if
+  // _buffer_size = 0.
+  //
+  // allow_stall: if set to true, it will enable stalling of writes when
+  // memory_usage() exceeds buffer_size. It will wait for flushes to complete
+  // and memory usage to drop down.
+  explicit WriteBufferManager(size_t _buffer_size,
+                              std::shared_ptr<Cache> cache = {},
+                              bool allow_stall = false);
+  // No copying allowed
+  WriteBufferManager(const WriteBufferManager&) = delete;
+  WriteBufferManager& operator=(const WriteBufferManager&) = delete;
+
+  ~WriteBufferManager();
+
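+  // A hedged usage sketch (illustrative only; the DBOptions field
+  // `write_buffer_manager` is assumed here): one manager shared by several
+  // DB instances caps their combined memtable memory.
+  //
+  //   auto wbm = std::make_shared<WriteBufferManager>(1ull << 30);  // 1GB
+  //   Options options;
+  //   options.write_buffer_manager = wbm;
+  //   // ... open one or more DBs with `options` ...
+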
+
+  // Returns true if a cache pointer was passed in, i.e., memtable memory is
+  // charged against the cache.
+  bool cost_to_cache() const { return cache_res_mgr_ != nullptr; }
+
+  // Returns the total memory used by memtables.
+  // Only valid if enabled().
+  size_t memory_usage() const {
+    return memory_used_.load(std::memory_order_relaxed);
+  }
+
+  // Returns the total memory used by active memtables.
+  size_t mutable_memtable_memory_usage() const {
+    return memory_active_.load(std::memory_order_relaxed);
+  }
+
+  size_t dummy_entries_in_cache_usage() const;
+
+  // Returns the buffer_size.
+  size_t buffer_size() const {
+    return buffer_size_.load(std::memory_order_relaxed);
+  }
+
+  void SetBufferSize(size_t new_size) {
+    buffer_size_.store(new_size, std::memory_order_relaxed);
+    mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed);
+    // Check if a stall is active and can be ended.
+    MaybeEndWriteStall();
+  }
+
+  // The functions below should be called by RocksDB internally.
+
+  // Should only be called from the write thread.
+  bool ShouldFlush() const {
+    if (enabled()) {
+      if (mutable_memtable_memory_usage() >
+          mutable_limit_.load(std::memory_order_relaxed)) {
+        return true;
+      }
+      size_t local_size = buffer_size();
+      if (memory_usage() >= local_size &&
+          mutable_memtable_memory_usage() >= local_size / 2) {
+        // If total memory usage exceeds the buffer size, trigger a more
+        // aggressive flush. But if more than half of the memory is already
+        // being flushed, triggering more flushes may not help, so hold off
+        // instead.
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Returns true if total memory usage exceeded buffer_size.
+  // We stall the writes until memory_usage drops below buffer_size. When the
+  // function returns true, all writer threads (including the one checking this
+  // condition) across all DBs will be stalled. Stalling is allowed only if the
+  // user passed allow_stall = true when creating the WriteBufferManager
+  // instance.
+  //
+  // Should only be called by RocksDB internally.
+  bool ShouldStall() const {
+    if (!allow_stall_ || !enabled()) {
+      return false;
+    }
+
+    return IsStallActive() || IsStallThresholdExceeded();
+  }
+
+  // Returns true if a stall is active.
+  bool IsStallActive() const {
+    return stall_active_.load(std::memory_order_relaxed);
+  }
+
+  // Returns true if the stalling condition is met.
+  bool IsStallThresholdExceeded() const {
+    return memory_usage() >= buffer_size_;
+  }
+
+  void ReserveMem(size_t mem);
+
+  // We are in the process of freeing `mem` bytes, so they are not considered
+  // when checking the soft limit.
+  void ScheduleFreeMem(size_t mem);
+
+  void FreeMem(size_t mem);
+
+  // Add the DB instance to the queue and block the DB.
+  // Should only be called by RocksDB internally.
+  void BeginWriteStall(StallInterface* wbm_stall);
+
+  // If stall conditions have resolved, remove DB instances from the queue and
+  // signal them to continue.
+  void MaybeEndWriteStall();
+
+  // Remove the DB instance from the stall queue.
+  // Should only be called by RocksDB internally.
+  void RemoveDBFromQueue(StallInterface* wbm_stall);
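+
+  // Illustrative flow of the stall protocol (a hypothetical sketch of the
+  // internal call sequence; applications do not drive these calls):
+  //
+  //   if (wbm->ShouldStall()) {
+  //     wbm->BeginWriteStall(stall_interface);  // queue the DB, Block() it
+  //   }
+  //   // ... a flush completes and memtable memory is freed ...
+  //   wbm->FreeMem(bytes);
+  //   wbm->MaybeEndWriteStall();  // Signal() queued DBs once usage drops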
+
+ private:
+  std::atomic<size_t> buffer_size_;
+  std::atomic<size_t> mutable_limit_;
+  std::atomic<size_t> memory_used_;
+  // Memory that hasn't been scheduled to be freed.
+  std::atomic<size_t> memory_active_;
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+  // Protects cache_res_mgr_.
+  std::mutex cache_res_mgr_mu_;
+
+  std::list<StallInterface*> queue_;
+  // Protects queue_ and stall_active_.
+  std::mutex mu_;
+  bool allow_stall_;
+  // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall()
+  // while holding mu_, but it can be read without a lock.
+  std::atomic<bool> stall_active_;
+
+  void ReserveMemWithCache(size_t mem);
+  void FreeMemWithCache(size_t mem);
+};
+}  // namespace ROCKSDB_NAMESPACE
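+
+// Illustrative sharing sketch (hypothetical sizes): a single
+// WriteBufferManager can cap the joint memtable memory of multiple DB
+// instances, and the cap can be adjusted at runtime via SetBufferSize().
+//
+//   auto wbm = std::make_shared<WriteBufferManager>(128 << 20);
+//   Options opts1, opts2;
+//   opts1.write_buffer_manager = wbm;
+//   opts2.write_buffer_manager = wbm;
+//   // ... open two DBs with opts1 and opts2; later, relax the cap:
+//   wbm->SetBufferSize(256 << 20);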